In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import  RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [5]:
titanic_df = pd.read_csv('datasets/titnaic_processed.csv')
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,0,28.0,1,0,26.0,0,0,1
1,0,1,1,45.0,1,0,83.475,0,0,1
2,1,3,1,27.0,0,0,7.7958,0,0,1
3,1,3,0,36.0,1,0,17.4,0,0,1
4,1,2,1,3.0,1,1,26.0,0,0,1


In [6]:
FEATURES = list(titanic_df.columns[1:])
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [7]:
result_dict = {}

In [8]:
def classification_summary(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred, normalize = True)
    num_acc = accuracy_score(y_test, y_pred, normalize = False)
    prec = precision_score(y_test,y_pred)
    recall = recall_score(y_test,y_pred)
      
    return {
       "Acc: ": acc,
       "prec: ": prec,
       "recall: ": recall,
       "Acc Count: ": num_acc
    }
    

In [9]:
def build_model(classifier_fn, name_of_y_col, name_of_x_cols, dataset, test_frac = 0.2):
    X = dataset[name_of_x_cols]
    Y = dataset[name_of_y_col]
    
    x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size = 0.2)
    model = classifier_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)
    
    y_pred_train = model.predict(x_train)
    
    train_results = classification_summary(y_train, y_pred_train)
    test_results = classification_summary(y_test, y_pred )
    
   
    pred_results = pd.DataFrame({'y_test': y_test, 'y_pred': y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
     
    return {
        'training': train_results,
        'test': test_results,
        'Confusion Matrix': model_crosstab
    }

In [10]:
def compare_results():
    for key in result_dict:
        print('Classification: ', key)
        
        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])
            
        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
        
        print()

In [11]:
def logistic_fn(x_train, y_train):
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model 

In [12]:
result_dict['survived - logistic'] = build_model(logistic_fn,'Survived', FEATURES, titanic_df)
compare_results() 


Classification:  survived - logistic

Training data
Acc:  0.81195079086116
prec:  0.788659793814433
recall:  0.6986301369863014
Acc Count:  462

Test data
Acc:  0.7902097902097902
prec:  0.9148936170212766
recall:  0.6231884057971014
Acc Count:  113



In [13]:
def linear_discriminant_fn(x_train, y_train, solver ='svd'):
    model = LinearDiscriminantAnalysis(solver=solver)
    model.fit(x_train, y_train)
    
    return model

In [15]:
result_dict['survived - linear_discriminant_fn'] = build_model(linear_discriminant_fn,'Survived', FEATURES[0:-1], titanic_df)
compare_results() 

Classification:  survived - logistic

Training data
Acc:  0.81195079086116
prec:  0.788659793814433
recall:  0.6986301369863014
Acc Count:  462

Test data
Acc:  0.7902097902097902
prec:  0.9148936170212766
recall:  0.6231884057971014
Acc Count:  113

Classification:  survived - linear_discriminant_fn

Training data
Acc:  0.789103690685413
prec:  0.7647058823529411
recall:  0.6842105263157895
Acc Count:  449

Test data
Acc:  0.8251748251748252
prec:  0.8070175438596491
recall:  0.7666666666666667
Acc Count:  118



In [18]:
def quadratic_discriminant_fn(x_train, y_train):
    model = QuadraticDiscriminantAnalysis()
    model.fit(x_train, y_train)
    
    return model

In [19]:
result_dict['survived - quadratic_discriminant_fn'] = build_model(quadratic_discriminant_fn,'Survived', FEATURES[0:-1], titanic_df)
compare_results() 

Classification:  survived - logistic

Training data
Acc:  0.81195079086116
prec:  0.788659793814433
recall:  0.6986301369863014
Acc Count:  462

Test data
Acc:  0.7902097902097902
prec:  0.9148936170212766
recall:  0.6231884057971014
Acc Count:  113

Classification:  survived - linear_discriminant_fn

Training data
Acc:  0.789103690685413
prec:  0.7647058823529411
recall:  0.6842105263157895
Acc Count:  449

Test data
Acc:  0.8251748251748252
prec:  0.8070175438596491
recall:  0.7666666666666667
Acc Count:  118

Classification:  survived - quadratic_discriminant_fn

Training data
Acc:  0.8101933216168717
prec:  0.7857142857142857
recall:  0.7236842105263158
Acc Count:  461

Test data
Acc:  0.7622377622377622
prec:  0.7407407407407407
recall:  0.6666666666666666
Acc Count:  109



In [21]:
def linear_svc_fn(x_train, y_train, C = 1.0 max_iter = 1000, tol =le-3):
    model = LinearSVC(C=C, max_iter=max, tol=tol)
    model.fit(x_train, y_train)
    
    return model

In [23]:
result_dict['survived - sgd_fn'] = build_model(sgd_fn,'Survived', FEATURES, titanic_df)
compare_results() 

TypeError: '<=' not supported between instances of 'builtin_function_or_method' and 'int'