In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the pre-processed Titanic data and preview its first five rows.
titanic_df = pd.read_csv(filepath_or_buffer='datasets/titanic_processed.csv')
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,3,1,26.0,2,0,8.6625,0,0,1
1,0,1,1,31.0,1,0,52.0,0,0,1
2,1,2,1,62.0,0,0,10.5,0,0,1
3,0,2,1,29.0,1,0,27.7208,1,0,0
4,0,2,1,24.0,0,0,13.0,0,0,1


In [3]:
# Use every column except the target as a predictor. Selecting by name rather
# than by position keeps the list correct if the CSV's column order changes,
# and Index.drop raises KeyError if the 'Survived' column is missing.
FEATURES = titanic_df.columns.drop('Survived').tolist()

In [4]:
# Show the selected predictor column names.
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [5]:
# Registry of results, keyed by a descriptive model name; each build_model
# run below stores its returned summary here and compare_results() reads it.
result_dict = dict()

In [6]:
def summarize_classification(y_test, y_pred):
    """Score predictions against the true labels.

    Returns a dict with, in this order:
      'accuracy'       - accuracy_score with normalize=True
      'precision'      - precision_score
      'recall'         - recall_score
      'accuracy_count' - accuracy_score with normalize=False
    """
    summary = {}

    # Key insertion order is kept because compare_results() prints in dict order.
    summary['accuracy'] = accuracy_score(y_test, y_pred, normalize=True)
    summary['precision'] = precision_score(y_test, y_pred)
    summary['recall'] = recall_score(y_test, y_pred)
    summary['accuracy_count'] = accuracy_score(y_test, y_pred, normalize=False)

    return summary

In [7]:
def build_model(classifier_fn,
                name_of_y_col,
                names_of_x_cols,
                dataset,
                test_frac = 0.2,
                random_state = None):
    """Split `dataset`, fit a classifier on the training part and score both parts.

    Parameters
    ----------
    classifier_fn : callable
        Invoked as ``classifier_fn(x_train, y_train)``; the object it returns
        must provide a ``predict`` method.
    name_of_y_col : label
        Column of `dataset` holding the target values.
    names_of_x_cols : list of labels
        Columns of `dataset` used as predictors.
    dataset : pandas.DataFrame
        Source of both predictors and target.
    test_frac : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. ``None`` keeps the previous
        behaviour (a different random split on every call); pass an integer
        to make the split, and therefore the reported scores, repeatable.

    Returns
    -------
    dict
        ``'training'`` and ``'test'`` hold the ``summarize_classification``
        results for the respective split; ``'confusion_matrix'`` is a
        crosstab of predicted labels (rows) against actual labels (columns)
        on the test split.
    """
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    # Scoring the training split too makes over-/under-fitting visible when
    # the two summaries are compared side by side.
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'training': train_summary,
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [8]:
def compare_results():
    """Print the training and test metrics of every entry in `result_dict`."""
    sections = (('Training data', 'training'),
                ('Test data', 'test'))

    for model_name, summaries in result_dict.items():
        print('Classification: ', model_name)

        for heading, section_key in sections:
            print()
            print(heading)
            for metric, value in summaries[section_key].items():
                print(metric, value)

        print()

In [9]:
def logistic_fn(x_train, y_train):
    """Fit a LogisticRegression (solver='liblinear') on the training data and return it."""
    # fit() returns the estimator itself, so construction and training chain.
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [10]:
# Train the logistic-regression model and store its scores in the registry.
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113



In [11]:
def liner_discriminant_fn(x_train, y_train, solver='svd'):
    """Fit a LinearDiscriminantAnalysis model with the given `solver` and return it."""
    lda = LinearDiscriminantAnalysis(solver=solver)
    lda.fit(x_train, y_train)
    return lda

In [12]:
# Train linear discriminant analysis on all features and record its scores.
result_dict['survived - liner_discriminant_analysis'] = build_model(
    classifier_fn=liner_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113

Classification:  survived - liner_discriminant_analysis

Training data
accuracy 0.7978910369068541
precision 0.7783018867924528
recall 0.7081545064377682
accuracy_count 454

Test data
accuracy 0.7972027972027972
precision 0.7407407407407407
recall 0.7272727272727273
accuracy_count 114





In [13]:
# Refit LDA without the last entry of FEATURES. The same key is reused, so
# this result replaces the earlier LDA entry in result_dict.
# NOTE(review): presumably the last dummy column is dropped to avoid
# collinearity among the Embarked_* indicators - confirm.
result_dict['survived - liner_discriminant_analysis'] = build_model(
    classifier_fn=liner_discriminant_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113

Classification:  survived - liner_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7594339622641509
recall 0.7030567685589519
accuracy_count 450

Test data
accuracy 0.7972027972027972
precision 0.8
recall 0.6779661016949152
accuracy_count 114



In [14]:
def sgd_fn(x_train, y_train, max_iter=100000, tol=1e-3):
    """Fit an SGDClassifier with the given iteration cap and tolerance; return it."""
    sgd = SGDClassifier(tol=tol, max_iter=max_iter)
    sgd.fit(x_train, y_train)
    return sgd

In [15]:
# Train the stochastic-gradient-descent classifier and record its scores.
result_dict['survived - sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113

Classification:  survived - liner_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7594339622641509
recall 0.7030567685589519
accuracy_count 450

Test data
accuracy 0.7972027972027972
precision 0.8
recall 0.6779661016949152
accuracy_count 114

Classification:  survived - sgd

Training data
accuracy 0.5940246045694201
precision 0.6363636363636364
recall 0.05907172995780591
accuracy_count 338

Test data
accuracy 0.6573426573426573
precision 0.6666666666666666
recall 0.0784313725490196
accuracy_count 94



In [16]:
def linear_svc_fn(x_train, y_train, C=1.0, max_iter=1000, tol=1e-3):
    """Fit a LinearSVC (dual=False) with the given C, max_iter and tol; return it."""
    settings = dict(C=C, max_iter=max_iter, tol=tol, dual=False)
    svc = LinearSVC(**settings)
    svc.fit(x_train, y_train)
    return svc

In [17]:
# Train the linear support-vector classifier and record its scores.
result_dict['survived - linear_svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113

Classification:  survived - liner_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7594339622641509
recall 0.7030567685589519
accuracy_count 450

Test data
accuracy 0.7972027972027972
precision 0.8
recall 0.6779661016949152
accuracy_count 114

Classification:  survived - sgd

Training data
accuracy 0.5940246045694201
precision 0.6363636363636364
recall 0.05907172995780591
accuracy_count 338

Test data
accuracy 0.6573426573426573
precision 0.6666666666666666
recall 0.0784313725490196
accuracy_count 94

Classification:  survived - linear_svc

Training data
accuracy 0.7803163444639719
precision 0.7549019607843137
recall 0.6724890829694323
accuracy_count 444

Test data
accuracy 0.8811188811188811
pr

In [18]:
def radius_neighbor_fn(x_train, y_train, radius=40.0):
    """Fit a RadiusNeighborsClassifier using the given `radius` and return it."""
    # fit() returns the estimator itself, so construction and training chain.
    return RadiusNeighborsClassifier(radius=radius).fit(x_train, y_train)

In [19]:
# Train the radius-neighbours classifier and record its scores.
result_dict['survived - radius_neighbors'] = build_model(
    classifier_fn=radius_neighbor_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113

Classification:  survived - liner_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7594339622641509
recall 0.7030567685589519
accuracy_count 450

Test data
accuracy 0.7972027972027972
precision 0.8
recall 0.6779661016949152
accuracy_count 114

Classification:  survived - sgd

Training data
accuracy 0.5940246045694201
precision 0.6363636363636364
recall 0.05907172995780591
accuracy_count 338

Test data
accuracy 0.6573426573426573
precision 0.6666666666666666
recall 0.0784313725490196
accuracy_count 94

Classification:  survived - linear_svc

Training data
accuracy 0.7803163444639719
precision 0.7549019607843137
recall 0.6724890829694323
accuracy_count 444

Test data
accuracy 0.8811188811188811
pr

In [20]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None):
    """Fit a DecisionTreeClassifier with the given depth/feature limits; return it."""
    tree = DecisionTreeClassifier(max_features=max_features, max_depth=max_depth)
    tree.fit(x_train, y_train)
    return tree

In [21]:
# Train the decision-tree classifier and record its scores.
result_dict['survived - decision_tree'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113

Classification:  survived - liner_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7594339622641509
recall 0.7030567685589519
accuracy_count 450

Test data
accuracy 0.7972027972027972
precision 0.8
recall 0.6779661016949152
accuracy_count 114

Classification:  survived - sgd

Training data
accuracy 0.5940246045694201
precision 0.6363636363636364
recall 0.05907172995780591
accuracy_count 338

Test data
accuracy 0.6573426573426573
precision 0.6666666666666666
recall 0.0784313725490196
accuracy_count 94

Classification:  survived - linear_svc

Training data
accuracy 0.7803163444639719
precision 0.7549019607843137
recall 0.6724890829694323
accuracy_count 444

Test data
accuracy 0.8811188811188811
pr

In [22]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Fit a GaussianNB classifier with the given class `priors` and return it."""
    # fit() returns the estimator itself, so construction and training chain.
    return GaussianNB(priors=priors).fit(x_train, y_train)

In [23]:
# Train the Gaussian naive-Bayes classifier and record its scores.
result_dict['survived - naive_bayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    names_of_x_cols=FEATURES,
    dataset=titanic_df,
)

compare_results()

Classification:  survived - logistic

Training data
accuracy 0.7978910369068541
precision 0.7760416666666666
recall 0.6742081447963801
accuracy_count 454

Test data
accuracy 0.7902097902097902
precision 0.8032786885245902
recall 0.7313432835820896
accuracy_count 113

Classification:  survived - liner_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7594339622641509
recall 0.7030567685589519
accuracy_count 450

Test data
accuracy 0.7972027972027972
precision 0.8
recall 0.6779661016949152
accuracy_count 114

Classification:  survived - sgd

Training data
accuracy 0.5940246045694201
precision 0.6363636363636364
recall 0.05907172995780591
accuracy_count 338

Test data
accuracy 0.6573426573426573
precision 0.6666666666666666
recall 0.0784313725490196
accuracy_count 94

Classification:  survived - linear_svc

Training data
accuracy 0.7803163444639719
precision 0.7549019607843137
recall 0.6724890829694323
accuracy_count 444

Test data
accuracy 0.8811188811188811
pr