In [109]:
# Import libraries

# Data analysis and wrangling
import pandas as pd
import numpy as np

# For plots
import matplotlib.pyplot as plt
import seaborn as sns

# To have plot inline with jupyter notebook
% matplotlib inline

# Import Default dictionary
from collections import defaultdict

# Machine Learning models

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# K-Nearest Neighbour
from sklearn.neighbors import KNeighborsClassifier

# Naive-Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Support Vector classifier
from sklearn.svm import NuSVC

# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier


# Cross validation
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score


In [110]:
# Load training and test dataset
df_titan_train = pd.read_csv("Titanic/df_train.csv")
df_titan_test = pd.read_csv("Titanic/df_test.csv")

In [111]:
# Get basic information from both the datasets
print('_'*40)
print('Training Dataset')
df_titan_train.info()
print('_'*40)
print('_'*40)
print('Test Dataset')
df_titan_test.info()

________________________________________
Training Dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 33 columns):
PassengerId               891 non-null int64
Survived                  891 non-null int64
Pclass_1                  891 non-null int64
Pclass_2                  891 non-null int64
Pclass_3                  891 non-null int64
Embarked_C                891 non-null int64
Embarked_Q                891 non-null int64
Embarked_S                891 non-null int64
Sex_title_female Miss.    891 non-null int64
Sex_title_female Mrs.     891 non-null int64
Sex_title_female rare     891 non-null int64
Sex_title_male Master.    891 non-null int64
Sex_title_male Mr.        891 non-null int64
Sex_title_male rare       891 non-null int64
Agegrp_0                  891 non-null int64
Agegrp_1                  891 non-null int64
Agegrp_2                  891 non-null int64
Agegrp_3                  891 non-null int64
Agegrp_4                  8

In [112]:
# Features and labels
features = df_titan_test.columns
label = 'Survived'
print("Features :{} \n\nLabels :{}".format(features,label))

Features :Index(['PassengerId', 'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C',
       'Embarked_Q', 'Embarked_S', 'Sex_title_female Miss.',
       'Sex_title_female Mrs.', 'Sex_title_female rare',
       'Sex_title_male Master.', 'Sex_title_male Mr.', 'Sex_title_male rare',
       'Agegrp_0', 'Agegrp_1', 'Agegrp_2', 'Agegrp_3', 'Agegrp_4', 'Agegrp_5',
       'Sibsize_0', 'Sibsize_1', 'Sibsize_2', 'Sibsize_3', 'Parsize_0',
       'Parsize_1', 'Parsize_2', 'Faregrp_0', 'Faregrp_1', 'Faregrp_2',
       'Faregrp_3', 'Faregrp_4', 'Faregrp_5'],
      dtype='object') 

Labels :Survived


In [113]:
# Features required for model
model_features = features[1:]
model_features

Index(['Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Sex_title_female Miss.', 'Sex_title_female Mrs.',
       'Sex_title_female rare', 'Sex_title_male Master.', 'Sex_title_male Mr.',
       'Sex_title_male rare', 'Agegrp_0', 'Agegrp_1', 'Agegrp_2', 'Agegrp_3',
       'Agegrp_4', 'Agegrp_5', 'Sibsize_0', 'Sibsize_1', 'Sibsize_2',
       'Sibsize_3', 'Parsize_0', 'Parsize_1', 'Parsize_2', 'Faregrp_0',
       'Faregrp_1', 'Faregrp_2', 'Faregrp_3', 'Faregrp_4', 'Faregrp_5'],
      dtype='object')

In [137]:
# Model - train, test and predict
def classifier(X, Y, mod, params, mode = 'train') :
    
    if mode == 'train' :
        if params == None :
            model = mod()
        else :
            model = mod(**params)
        Info = model.fit(X,Y)
    elif mode == 'test' or mode == 'predict' :
        model = mod
        Info = None
        
    Prediction = model.predict(X)
    
    if mode == 'train' or mode == 'test' :
        Accuracy = round(model.score(X,Y)*100,2)        
    elif mode == 'predict' :
        Accuracy = None
    
    return(Info, model, Prediction, Accuracy)    

In [138]:
# Models to test
models = {
          'Logistic Regression' : LogisticRegression ,
          'K-Nearest Neighbour' : KNeighborsClassifier ,
          'Naive Bayes' : GaussianNB ,
          'Decision Tree' : DecisionTreeClassifier ,
          'Support Vector Machine' : NuSVC ,
          'Random Forest Classifier' : RandomForestClassifier
         }

# Model parameters
model_params = {
          'Logistic Regression' : [None] ,
          'K-Nearest Neighbour' : [None, {'n_neighbors' : 3}, {'n_neighbors' : 5}] ,
          'Naive Bayes' : [None] ,
          'Decision Tree' : [None] ,
          'Support Vector Machine' : [None, {'kernel':'linear'}, {'kernel':'poly'}, {'kernel':'sigmoid'}] ,
          'Random Forest Classifier' : [None, {'n_estimators':50},{'n_estimators':100},{'n_estimators':500}]
                }

In [139]:
# Train all the models
X_train = df_titan_train[model_features]
Y_train = df_titan_train[label]
X_test = df_titan_test[model_features]
Info = defaultdict(dict); Model =defaultdict(dict); Train_prediction = defaultdict(dict)
Train_acc = defaultdict(dict);  parameters = defaultdict(dict)
for name, mod in models.items() :    
    for key, params in enumerate(model_params[name]) :             
        parameters[name][key] = params
        Info[name][key], Model[name][key], Train_prediction[name][key], Train_acc[name][key], \
                    = classifier(X_train, Y_train, mod, params, mode ='train')            
        

In [140]:
# Store the accuracy of the model
i=0
df_model_score = pd.DataFrame(columns = ['Name', 'key','Parameters','Info','Train_acc'])
for name, _ in models.items() :    
    for key, params in enumerate(model_params[name]) :      
        df_model_score.loc[i] = [name,key,params,Info[name][key],Train_acc[name][key]]
        i+=1
                                  
# Print the output in the order of Training accuracy
for index, row in df_model_score.sort_values(['Train_acc'],ascending=False).iterrows() :
    print('*'*90)
    print("Model \t\t\t\t : ",row[0])
    print("Parameters passed \t\t : ",row[2])
    print('Training Accuracy \t\t : ',row[4],'%')
    print('Model info \t\t\t : ',row[3])


******************************************************************************************
Model 				 :  Random Forest Classifier
Parameters passed 		 :  {'n_estimators': 50}
Training Accuracy 		 :  89.23 %
Model info 			 :  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
******************************************************************************************
Model 				 :  Random Forest Classifier
Parameters passed 		 :  {'n_estimators': 100}
Training Accuracy 		 :  89.23 %
Model info 			 :  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max

In [142]:
# Se all the output in tabular form (in the order of training data accuracy)
df_model_score.sort_values(['Train_acc'],ascending=False).iloc[:,[0,1,2,4]]

Unnamed: 0,Name,key,Parameters,Train_acc
8,Random Forest Classifier,1,{'n_estimators': 50},89.23
9,Random Forest Classifier,2,{'n_estimators': 100},89.23
10,Random Forest Classifier,3,{'n_estimators': 500},89.23
12,Decision Tree,0,,89.23
7,Random Forest Classifier,0,,88.66
1,K-Nearest Neighbour,1,{'n_neighbors': 3},85.19
0,K-Nearest Neighbour,0,,83.84
2,K-Nearest Neighbour,2,{'n_neighbors': 5},83.84
5,Support Vector Machine,2,{'kernel': 'poly'},83.73
13,Logistic Regression,0,,82.94


In [149]:
# Lets pick 5 models 
Indexes = [8,12,2,5,13] 
df_model_score.loc[Indexes,['Name']]

Unnamed: 0,Name
8,Random Forest Classifier
12,Decision Tree
2,K-Nearest Neighbour
5,Support Vector Machine
13,Logistic Regression


In [None]:
# We will now do cross validation
Info = {}; Model ={}; Train_prediction = {}; Train_acc = {};  parameters = {}
kfld = StratifiedKFold(n_splits=10,random_state=123)
for trcv_index, tscv_index in skf.split(X_train, Y_train):
    X_trcv, X_tscv = X_train[trcv_index], X_train[tscv_index]
    Y_trcv, y_tscv = Y_train[trcv_index], Y_train[tscv_index]
    for name in Indexes :    
        Info[name], Model[name], Train_prediction[name], Train_acc[name], \
                        = classifier(X_trcv, Y_trcv, Model[name], parameters[name],mode='train')            

    

In [None]:
final_prediction = np.zeros(shape=X_train.shape[0])
for ind in Indexes :
    final_prediction += Train_prediction[ind]/len(Indexes)

final_prediction=1*(final_prediction > 0.5)
sum(final_prediction == Y_train)/Y_train.shape[0]

In [128]:
parameters[][]

SyntaxError: invalid syntax (<ipython-input-128-2c7f2e128275>, line 1)