In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [5]:
# Load the pre-processed Titanic dataset and preview its first five rows.
titanic_df = pd.read_csv(filepath_or_buffer='datasets/titanic_processed.csv')
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,1,0.83,0,2,29.0,0,0,1
1,1,3,1,29.0,0,0,9.5,0,0,1
2,1,3,1,6.0,0,1,12.475,0,0,1
3,0,1,1,36.0,1,0,78.85,0,0,1
4,0,3,1,4.0,4,2,31.275,0,0,1


In [7]:
# Every column except the target is a feature. Excluding 'Survived' by name
# (rather than by position) keeps the label out of the feature list even if
# the column order of the CSV changes.
FEATURES = [col for col in titanic_df.columns if col != 'Survived']
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [8]:
# Accumulates the summary returned by build_model, keyed by a model label.
result_dict = dict()

In [10]:
def summarize_classification(y_test, y_pred):
    """Collect headline classification metrics for one set of predictions.

    Args:
        y_test: True labels.
        y_pred: Predicted labels, in the same order as ``y_test``.

    Returns:
        dict with the keys ``'accuracy'`` (fraction of correct predictions),
        ``'precision'``, ``'recall'`` and ``'accuracy_count'`` (number of
        correct predictions). Precision and recall are computed with the
        scikit-learn default arguments.
    """
    return {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        # normalize=False yields the raw count of correct predictions.
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [12]:
def build_model(classifier_fn,
                names_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Split the data, fit a classifier and score it on both partitions.

    Args:
        classifier_fn: Callable ``(x_train, y_train) -> fitted model``; the
            returned model must expose ``predict``.
        names_of_y_col: Name of the target column in ``dataset``.
        names_of_x_cols: List of feature column names in ``dataset``.
        dataset: DataFrame holding both the features and the target.
        test_frac: Fraction of rows held out for testing.
        random_state: Seed forwarded to ``train_test_split``. The default
            ``None`` keeps the original behaviour (a different random split
            on every call); pass an int to make the split reproducible and
            to evaluate several classifiers on the same partition.

    Returns:
        dict with ``'training'`` and ``'test'`` metric summaries (see
        ``summarize_classification``) and ``'confusion_matrix'``, a crosstab
        whose rows are predicted labels and whose columns are actual labels.
    """
    X = dataset[names_of_x_cols]
    Y = dataset[names_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)
    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # Scoring the training partition as well makes over-fitting visible when
    # the two summaries are printed side by side.
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    # Rows: predicted labels; columns: actual labels.
    model_crosstab = pd.crosstab(y_pred, y_test)

    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [13]:
def compare_results(result_dict):
    """Print the training and test metrics of every stored model.

    Args:
        result_dict: Mapping of model label to a dict that has ``'training'``
            and ``'test'`` entries, each a mapping of metric name to score.
    """
    sections = (('Training data', 'training'), ('Test data', 'test'))

    for label, summaries in result_dict.items():
        print('Classification: ', label)

        for heading, split in sections:
            print()
            print(heading)
            for metric, value in summaries[split].items():
                print(metric, value)

        print()
        
        

In [14]:
def logistic_fn(x_train, y_train):
    """Fit a logistic-regression classifier with the 'liblinear' solver.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` estimator.
    """
    clf = LogisticRegression(solver='liblinear')
    # fit() returns the estimator itself, so it can be returned directly.
    return clf.fit(x_train, y_train)

In [15]:
# Train and score the logistic-regression model, then print all results so far.
result_dict['survived_logistic'] = build_model(
    classifier_fn=logistic_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115



In [18]:
def linear_descriminant_fn(x_train, y_train, solver='svd'):
    """Fit a linear discriminant analysis classifier.

    Args:
        x_train: Training features.
        y_train: Training labels.
        solver: Solver name passed to ``LinearDiscriminantAnalysis``.

    Returns:
        The fitted ``LinearDiscriminantAnalysis`` estimator.
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    # fit() returns the estimator itself, so it can be returned directly.
    return lda.fit(x_train, y_train)

In [19]:
# Train and score the LDA model, then print all results so far.
result_dict['survived_linear_discriminant'] = build_model(
    classifier_fn=linear_descriminant_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115

Classification:  survived_linear_discriminant

Training data
accuracy 0.8084358523725835
precision 0.7870370370370371
recall 0.7296137339055794
accuracy_count 460

Test data
accuracy 0.7692307692307693
precision 0.7291666666666666
recall 0.6363636363636364
accuracy_count 110



In [21]:
def quadratic_discriminant_fn(x_train, y_train):
    """Fit a quadratic discriminant analysis classifier with default settings.

    Args:
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The fitted ``QuadraticDiscriminantAnalysis`` estimator.
    """
    qda = QuadraticDiscriminantAnalysis()
    # fit() returns the estimator itself, so it can be returned directly.
    return qda.fit(x_train, y_train)

In [23]:
# Train and score the QDA model on every feature except the last one in
# FEATURES, then print all results so far.
# NOTE(review): the last column is presumably dropped to avoid perfectly
# collinear one-hot 'Embarked_*' columns -- confirm.
result_dict['survived_quadratic_discriminant'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115

Classification:  survived_linear_discriminant

Training data
accuracy 0.8084358523725835
precision 0.7870370370370371
recall 0.7296137339055794
accuracy_count 460

Test data
accuracy 0.7692307692307693
precision 0.7291666666666666
recall 0.6363636363636364
accuracy_count 110

Classification:  survived_quadratic_discriminant

Training data
accuracy 0.7996485061511424
precision 0.7788461538461539
recall 0.7043478260869566
accuracy_count 455

Test data
accuracy 0.8251748251748252
precision 0.8113207547169812
recall 0.7413793103448276
accuracy_count 118



In [30]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Fit a linear classifier trained with stochastic gradient descent.

    Args:
        x_train: Training features.
        y_train: Training labels.
        max_iter: Value passed as ``max_iter`` to ``SGDClassifier``.
        tol: Value passed as ``tol`` to ``SGDClassifier``.

    Returns:
        The fitted ``SGDClassifier`` estimator.
    """
    sgd = SGDClassifier(max_iter=max_iter, tol=tol)
    # fit() returns the estimator itself, so it can be returned directly.
    return sgd.fit(x_train, y_train)

In [31]:
# Train and score the SGD model, then print all results so far.
result_dict['survived_sgd'] = build_model(
    classifier_fn=sgd_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115

Classification:  survived_linear_discriminant

Training data
accuracy 0.8084358523725835
precision 0.7870370370370371
recall 0.7296137339055794
accuracy_count 460

Test data
accuracy 0.7692307692307693
precision 0.7291666666666666
recall 0.6363636363636364
accuracy_count 110

Classification:  survived_quadratic_discriminant

Training data
accuracy 0.7996485061511424
precision 0.7788461538461539
recall 0.7043478260869566
accuracy_count 455

Test data
accuracy 0.8251748251748252
precision 0.8113207547169812
recall 0.7413793103448276
accuracy_count 118

Classification:  survived_sgd

Training data
accuracy 0.6854130052724078
precision 0.8295454545454546
recall 0.3080168776371308
accuracy_count 390

Test data
accuracy 0.71328671

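C is the inverse of regularization strength: smaller values mean stronger regularization. It controls how heavily points on the wrong side of the margin are penalized, so a large C fits the training data more tightly (a more complex model), while a small C favours a simpler model.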
LinearSVC is similar to SVC(kernel="linear"), but it is implemented with liblinear rather than libsvm.
prefer dual=False when n_samples > n_features

In [32]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Fit a linear support vector classifier, solving the primal problem.

    Args:
        x_train: Training features.
        y_train: Training labels.
        C: Value passed as ``C`` to ``LinearSVC``.
        max_iter: Value passed as ``max_iter`` to ``LinearSVC``.
        tol: Value passed as ``tol`` to ``LinearSVC``.

    Returns:
        The fitted ``LinearSVC`` estimator.
    """
    # dual=False is fixed here rather than exposed as a parameter.
    svc = LinearSVC(C=C, max_iter=max_iter, tol=tol, dual=False)
    # fit() returns the estimator itself, so it can be returned directly.
    return svc.fit(x_train, y_train)

In [33]:
# Train and score the linear SVC model, then print all results so far.
result_dict['survived_linear_SVC'] = build_model(
    classifier_fn=linear_svc_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115

Classification:  survived_linear_discriminant

Training data
accuracy 0.8084358523725835
precision 0.7870370370370371
recall 0.7296137339055794
accuracy_count 460

Test data
accuracy 0.7692307692307693
precision 0.7291666666666666
recall 0.6363636363636364
accuracy_count 110

Classification:  survived_quadratic_discriminant

Training data
accuracy 0.7996485061511424
precision 0.7788461538461539
recall 0.7043478260869566
accuracy_count 455

Test data
accuracy 0.8251748251748252
precision 0.8113207547169812
recall 0.7413793103448276
accuracy_count 118

Classification:  survived_sgd

Training data
accuracy 0.6854130052724078
precision 0.8295454545454546
recall 0.3080168776371308
accuracy_count 390

Test data
accuracy 0.71328671

In [34]:
def radius_neighbor_fn(x_train, y_train, radius=40.0):
    """Fit a radius-based nearest-neighbours classifier.

    Args:
        x_train: Training features.
        y_train: Training labels.
        radius: Value passed as ``radius`` to ``RadiusNeighborsClassifier``.

    Returns:
        The fitted ``RadiusNeighborsClassifier`` estimator.
    """
    neighbors_clf = RadiusNeighborsClassifier(radius=radius)
    # fit() returns the estimator itself, so it can be returned directly.
    return neighbors_clf.fit(x_train, y_train)

In [35]:
# Train and score the radius-neighbours model, then print all results so far.
result_dict['survived_radius_neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115

Classification:  survived_linear_discriminant

Training data
accuracy 0.8084358523725835
precision 0.7870370370370371
recall 0.7296137339055794
accuracy_count 460

Test data
accuracy 0.7692307692307693
precision 0.7291666666666666
recall 0.6363636363636364
accuracy_count 110

Classification:  survived_quadratic_discriminant

Training data
accuracy 0.7996485061511424
precision 0.7788461538461539
recall 0.7043478260869566
accuracy_count 455

Test data
accuracy 0.8251748251748252
precision 0.8113207547169812
recall 0.7413793103448276
accuracy_count 118

Classification:  survived_sgd

Training data
accuracy 0.6854130052724078
precision 0.8295454545454546
recall 0.3080168776371308
accuracy_count 390

Test data
accuracy 0.71328671

In [36]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit a decision-tree classifier.

    Args:
        x_train: Training features.
        y_train: Training labels.
        max_depth: Value passed as ``max_depth`` to ``DecisionTreeClassifier``.
        max_features: Value passed as ``max_features`` to
            ``DecisionTreeClassifier``.

    Returns:
        The fitted ``DecisionTreeClassifier`` estimator.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth,
                                  max_features=max_features)
    # fit() returns the estimator itself, so it can be returned directly.
    return tree.fit(x_train, y_train)

In [37]:
# Train and score the decision-tree model, then print all results so far.
result_dict['survived_decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115

Classification:  survived_linear_discriminant

Training data
accuracy 0.8084358523725835
precision 0.7870370370370371
recall 0.7296137339055794
accuracy_count 460

Test data
accuracy 0.7692307692307693
precision 0.7291666666666666
recall 0.6363636363636364
accuracy_count 110

Classification:  survived_quadratic_discriminant

Training data
accuracy 0.7996485061511424
precision 0.7788461538461539
recall 0.7043478260869566
accuracy_count 455

Test data
accuracy 0.8251748251748252
precision 0.8113207547169812
recall 0.7413793103448276
accuracy_count 118

Classification:  survived_sgd

Training data
accuracy 0.6854130052724078
precision 0.8295454545454546
recall 0.3080168776371308
accuracy_count 390

Test data
accuracy 0.71328671

In [38]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit a Gaussian naive Bayes classifier.

    Args:
        x_train: Training features.
        y_train: Training labels.
        priors: Value passed as ``priors`` to ``GaussianNB``.

    Returns:
        The fitted ``GaussianNB`` estimator.
    """
    nb = GaussianNB(priors=priors)
    # fit() returns the estimator itself, so it can be returned directly.
    return nb.fit(x_train, y_train)

In [39]:
# Train and score the naive Bayes model, then print all results so far.
result_dict['survived_naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    names_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results(result_dict)

Classification:  survived_logistic

Training data
accuracy 0.7996485061511424
precision 0.7788944723618091
recall 0.6888888888888889
accuracy_count 455

Test data
accuracy 0.8041958041958042
precision 0.8181818181818182
recall 0.7142857142857143
accuracy_count 115

Classification:  survived_linear_discriminant

Training data
accuracy 0.8084358523725835
precision 0.7870370370370371
recall 0.7296137339055794
accuracy_count 460

Test data
accuracy 0.7692307692307693
precision 0.7291666666666666
recall 0.6363636363636364
accuracy_count 110

Classification:  survived_quadratic_discriminant

Training data
accuracy 0.7996485061511424
precision 0.7788461538461539
recall 0.7043478260869566
accuracy_count 455

Test data
accuracy 0.8251748251748252
precision 0.8113207547169812
recall 0.7413793103448276
accuracy_count 118

Classification:  survived_sgd

Training data
accuracy 0.6854130052724078
precision 0.8295454545454546
recall 0.3080168776371308
accuracy_count 390

Test data
accuracy 0.71328671