In [160]:
# Import libraries

# Data analysis and wrangling
import pandas as pd

# For plots
import matplotlib.pyplot as plt
import seaborn as sns

# To have plot inline with jupyter notebook
% matplotlib inline



# Machine Learning models

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# K-Nearest Neighbour
from sklearn.neighbors import KNeighborsClassifier

# Naive-Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# Support Vector classifier
from sklearn.svm import NuSVC

# Random Forest classifier
from sklearn.ensemble import RandomForestClassifier


In [161]:
# Load training and test dataset
df_titan_train = pd.read_csv("Titanic/df_train.csv")
df_titan_test = pd.read_csv("Titanic/df_test.csv")

In [162]:
# Get basic information from both the datasets
print('_'*40)
print('Training Dataset')
df_titan_train.info()
print('_'*40)
print('_'*40)
print('Test Dataset')
df_titan_test.info()

________________________________________
Training Dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 31 columns):
PassengerId               891 non-null int64
Survived                  891 non-null int64
Pclass                    891 non-null int64
SibSp                     891 non-null int64
Parch                     891 non-null int64
Agegrp                    891 non-null int64
Faregrp                   891 non-null int64
Pclass_1                  891 non-null int64
Pclass_2                  891 non-null int64
Pclass_3                  891 non-null int64
Embarked_C                891 non-null int64
Embarked_Q                891 non-null int64
Embarked_S                891 non-null int64
Sex_title_female Miss.    891 non-null int64
Sex_title_female Mrs.     891 non-null int64
Sex_title_female rare     891 non-null int64
Sex_title_male Master.    891 non-null int64
Sex_title_male Mr.        891 non-null int64
Sex_title_male rare       8

In [163]:
# Features and labels
features = df_titan_test.columns
label = 'Survived'
print("Features :{} \n\nLabels :{}".format(features,label))

Features :Index(['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Agegrp', 'Faregrp',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Sex_title_female Miss.', 'Sex_title_female Mrs.',
       'Sex_title_female rare', 'Sex_title_male Master.', 'Sex_title_male Mr.',
       'Sex_title_male rare', 'Agegrp_0', 'Agegrp_1', 'Agegrp_2', 'Agegrp_3',
       'Agegrp_4', 'Agegrp_5', 'Faregrp_0', 'Faregrp_1', 'Faregrp_2',
       'Faregrp_3', 'Faregrp_4', 'Faregrp_5'],
      dtype='object') 

Labels :Survived


In [164]:
# Features required for model
model_features = features[1:]
model_features

Index(['Pclass', 'SibSp', 'Parch', 'Agegrp', 'Faregrp', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Sex_title_female Miss.', 'Sex_title_female Mrs.',
       'Sex_title_female rare', 'Sex_title_male Master.', 'Sex_title_male Mr.',
       'Sex_title_male rare', 'Agegrp_0', 'Agegrp_1', 'Agegrp_2', 'Agegrp_3',
       'Agegrp_4', 'Agegrp_5', 'Faregrp_0', 'Faregrp_1', 'Faregrp_2',
       'Faregrp_3', 'Faregrp_4', 'Faregrp_5'],
      dtype='object')

In [165]:
# Model training and accuracy
def train_model(X_train, X_test, Y_train, mod, params) :
    if params == None :
        model = mod()
    else :
        model = mod(**params)
    Info = model.fit(X_train,Y_train)
    Train_prediction = model.predict(X_train)
    Train_acc = round(model.score(X_train,Y_train)*100,2)
    Test_prediction = model.predict(X_test)
    return(Info, Train_prediction, Train_acc,Test_prediction)
    

In [166]:
# Models to test
models = {
          'Logistic Regression' : LogisticRegression ,
          'K-Nearest Neighbour' : KNeighborsClassifier ,
          'Naive Bayes' : GaussianNB ,
          'Decision Tree' : DecisionTreeClassifier ,
          'Support Vector Machine' : NuSVC ,
          'Random Forest Classifier' : RandomForestClassifier
         }

# Model parameters
model_params = {
          'Logistic Regression' : [None] ,
          'K-Nearest Neighbour' : [None, {'n_neighbors' : 3}, {'n_neighbors' : 5}] ,
          'Naive Bayes' : [None] ,
          'Decision Tree' : [None] ,
          'Support Vector Machine' : [None, {'kernel':'linear'}, {'kernel':'poly'}, {'kernel':'sigmoid'}] ,
          'Random Forest Classifier' : [None, {'n_estimators':20},{'n_estimators':50},{'n_estimators':100}]
                }

In [169]:
# Train all the models
X_train = df_titan_train[model_features]
Y_train = df_titan_train[label]
X_test = df_titan_test[model_features]
Info = {}; Train_prediction = {}; Train_acc = {}; Test_prediction = {}
for mod_name, mod in models.items() :    
    for key, params in enumerate(model_params[mod_name]) :        
        name = mod_name + ' - ' + str(key)
        Info[name], Train_prediction[name], Train_acc[name], Test_prediction[name] =\
                    train_model(X_train, X_test, Y_train, mod, params)            
        

In [170]:
# Print the output in the order of training accuracy
for name, acc in sorted(Train_acc.items(), key=lambda x:x[1], reverse=True) :
    print('*'*90)
    print("Model \t\t\t : ",name)
    print('Training Accuracy \t : ',Train_acc[name],'%')
    print('Model info \t\t : ',Info[name])


******************************************************************************************
Model 			 :  Decision Tree - 0
Training Accuracy 	 :  89.45 %
Model info 		 :  DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
******************************************************************************************
Model 			 :  Random Forest Classifier - 2
Training Accuracy 	 :  89.45 %
Model info 		 :  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf