In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
titanic_data = pd.read_csv('datasets/titanic_train_processes.csv')
titanic_data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,C,Q,S
0,0,2,1,52.0,0,0,13.5,0,0,1
1,0,1,1,24.0,0,0,79.2,1,0,0
2,1,1,0,36.0,1,2,120.0,0,0,1
3,0,1,1,28.0,0,0,47.1,0,0,1
4,0,3,1,36.0,0,0,7.8958,0,0,1


In [3]:
features = list(titanic_data.columns[1:])
features

['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'C', 'Q', 'S']

In [4]:
result_dict = {}

In [5]:
def summary(y_test, y_pred):
    #This value will be between 0.0 and 1.0
    acc = accuracy_score(y_test, y_pred, normalize=True) 
    #The number of correctly predicted samples (raw count)
    acc_num = accuracy_score(y_test, y_pred, normalize=False) 

    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    return{ 'accuracy' : acc, 
            'precision' : precision,
            'recall': recall,
            'accuracy_count': acc_num
    }

In [6]:
def build_model(classifier_name,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac = 0.2):
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)

    model = classifier_name(x_train, y_train)

    y_pred = model.predict(x_test)

    y_pred_train = model.predict(x_train)

    train_summary = summary(y_train, y_pred_train)
    test_summary = summary(y_test, y_pred)

    pred_results  = pd.DataFrame({'y_test':y_test,  
                                  'y_pred':y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {
        'training': train_summary,
        'test':test_summary,
        'confusion_matrix': model_crosstab
    }

In [7]:
def compare_results():
    for key in result_dict:
        print('Classification :', key)

        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])

        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])

        print()

In [8]:
def logistic_fn(x_train, y_train):
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)

    return model

In [9]:
# LDA: find axes to best separate the classes such that all instances of a class are in same quadrant.
# SVD: singular value decomposition estimator: finds axes without calculating the covariance matrix of features, 
# useful when we have many features or many rows in dataset.
def linear_discriminant(x_train, y_train, solver='svd'):
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)

    return model

In [10]:
# QDA: fins axes to best separate the classes such that all instances of a class are in the same quadrant
# but the decision boundary is quadratic.
# may not be straight line or hyper plane as LDA.
# useful when the X variables corresponding to different labels have diferent covarience.
# i.e., the covariance are different for X for all values of Y.
def quadratic_discriminant(x_train, y_train):
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)
    
    return model

In [11]:
# SGD : numerical optimization techniques.
# one training instance, one record from training dataset at a time to find the best model parameters.

def sgd(x_train, y_train, max_iter=10000, tol=1e-3):
    #tol : tolerance = 0.001
    #the training process will stop when the change in the loss function becomes smaller than 0.001
    model = SGDClassifier(max_iter=max_iter, tol=tol)
    model.fit(x_train, y_train)

    return model

In [12]:
#SVM : Linear SVC

def svm(x_train, y_train, C=1.0, max_iter= 1000, tol = 1e-3):
    # C: regularization strength, 
    # It controls the trade-off between maximizing the margin and minimizing the classification error.
    '''
    A smaller C value encourages a larger margin but may result in a larger number of misclassified points, 
    while a larger C value allows for fewer misclassifications but may lead to a smaller margin. 
    '''
    # dual = True: when #features > #samples.
    # dual = False :primal problem formulation: when #features < #samples.
    model = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    model.fit(x_train, y_train)

    return model

In [13]:
# KNN

def radius_neighbor(x_train, y_train, radius = 40.0):
    model = RadiusNeighborsClassifier(radius=radius)
    model.fit(x_train, y_train)

    return model

In [14]:
# DT

def decision_tree(x_train, y_train, max_depth = None, max_features = None):
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    model.fit(x_train, y_train)

    return model

In [15]:
# NB

def naive_bayes(x_train, y_train, priors= None):
    model = GaussianNB(priors=priors)
    model.fit(x_train, y_train)

    return model

In [16]:
result_dict['survived - logistic'] = build_model(logistic_fn, 'Survived', features, titanic_data)
result_dict['survived - LDA'] = build_model(linear_discriminant, 'Survived', features, titanic_data)
#features[0:-1] - to fix the colinearity warning, due to dummy trap.
result_dict['survived - QDA'] = build_model(quadratic_discriminant, 'Survived', features[0:-1], titanic_data)
result_dict['survived - SGD'] = build_model(sgd, 'Survived', features, titanic_data)
result_dict['survived - SVM'] = build_model(svm, 'Survived', features, titanic_data)
result_dict['survived - R_neighbor'] = build_model(radius_neighbor, 'Survived', features, titanic_data)
result_dict['survived - Decision Tree'] = build_model(decision_tree, 'Survived', features, titanic_data)
result_dict['survived - NB'] = build_model(naive_bayes, 'Survived', features, titanic_data)

compare_results()

Classification : survived - logistic

Training data
accuracy 0.809106830122592
precision 0.8
recall 0.6986899563318777
accuracy_count 462

Test data
accuracy 0.7552447552447552
precision 0.7241379310344828
recall 0.6885245901639344
accuracy_count 108

Classification : survived - LDA

Training data
accuracy 0.8021015761821366
precision 0.7761904761904762
recall 0.7117903930131004
accuracy_count 458

Test data
accuracy 0.7692307692307693
precision 0.7916666666666666
recall 0.6229508196721312
accuracy_count 110

Classification : survived - QDA

Training data
accuracy 0.7915936952714536
precision 0.7703349282296651
recall 0.6939655172413793
accuracy_count 452

Test data
accuracy 0.8251748251748252
precision 0.851063829787234
recall 0.6896551724137931
accuracy_count 118

Classification : survived - SGD

Training data
accuracy 0.7390542907180385
precision 0.7801418439716312
recall 0.4824561403508772
accuracy_count 422

Test data
accuracy 0.7202797202797203
precision 0.7894736842105263
recall