In [52]:
# Import libraries

# Data analysis and wrangling
import pandas as pd

# For plots
import matplotlib.pyplot as plt
import seaborn as sns

# To have plot inline with jupyter notebook
% matplotlib inline



# Machine Learning models

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# K-Nearest Neighbour
from sklearn.neighbors import KNeighborsClassifier

# Naive-Bayes classifier
from sklearn.naive_bayes import GaussianNB

# Decision tree classifier
from sklearn.tree import DecisionTreeClassifier



In [42]:
# Read the training and test CSVs into their own DataFrames.
# Both paths are relative to the notebook's working directory.
train_csv_path = "Titanic/df_train.csv"
test_csv_path = "Titanic/df_test.csv"

df_titan_train = pd.read_csv(train_csv_path)
df_titan_test = pd.read_csv(test_csv_path)

In [43]:
# Print a structural summary (index range, columns, non-null counts, dtypes)
# for each dataset, using underscore rules to separate the two reports.
rule = '_' * 40

print(rule)
print('Training Dataset')
df_titan_train.info()
print(rule)

print(rule)
print('Test Dataset')
df_titan_test.info()

________________________________________
Training Dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 31 columns):
PassengerId               891 non-null int64
Survived                  891 non-null int64
Pclass                    891 non-null int64
SibSp                     891 non-null int64
Parch                     891 non-null int64
Agegrp                    891 non-null int64
Faregrp                   891 non-null int64
Pclass_1                  891 non-null int64
Pclass_2                  891 non-null int64
Pclass_3                  891 non-null int64
Embarked_C                891 non-null int64
Embarked_Q                891 non-null int64
Embarked_S                891 non-null int64
Sex_title_female Miss.    891 non-null int64
Sex_title_female Mrs.     891 non-null int64
Sex_title_female rare     891 non-null int64
Sex_title_male Master.    891 non-null int64
Sex_title_male Mr.        891 non-null int64
Sex_title_male rare       8

In [44]:
# Target column and candidate feature columns.
# The column list is read from the test frame, whose printed columns
# do not contain the target.
label = 'Survived'
features = df_titan_test.columns
print("Features :{} \n\nLabels :{}".format(features, label))

Features :Index(['PassengerId', 'Pclass', 'SibSp', 'Parch', 'Agegrp', 'Faregrp',
       'Pclass_1', 'Pclass_2', 'Pclass_3', 'Embarked_C', 'Embarked_Q',
       'Embarked_S', 'Sex_title_female Miss.', 'Sex_title_female Mrs.',
       'Sex_title_female rare', 'Sex_title_male Master.', 'Sex_title_male Mr.',
       'Sex_title_male rare', 'Agegrp_0', 'Agegrp_1', 'Agegrp_2', 'Agegrp_3',
       'Agegrp_4', 'Agegrp_5', 'Faregrp_0', 'Faregrp_1', 'Faregrp_2',
       'Faregrp_3', 'Faregrp_4', 'Faregrp_5'],
      dtype='object') 

Labels :Survived


In [45]:
# Features required for model: every column except the passenger identifier.
# Dropping by name (instead of slicing off position 0) stays correct if the
# column order ever changes, and raises KeyError if the column is missing
# rather than silently discarding some other feature.
model_features = features.drop('PassengerId')
model_features

Index(['Pclass', 'SibSp', 'Parch', 'Agegrp', 'Faregrp', 'Pclass_1', 'Pclass_2',
       'Pclass_3', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'Sex_title_female Miss.', 'Sex_title_female Mrs.',
       'Sex_title_female rare', 'Sex_title_male Master.', 'Sex_title_male Mr.',
       'Sex_title_male rare', 'Agegrp_0', 'Agegrp_1', 'Agegrp_2', 'Agegrp_3',
       'Agegrp_4', 'Agegrp_5', 'Faregrp_0', 'Faregrp_1', 'Faregrp_2',
       'Faregrp_3', 'Faregrp_4', 'Faregrp_5'],
      dtype='object')

In [48]:
# Logistic Regression - Model
# Fit on the whole training frame, then predict the training and test rows.
log_train_X = df_titan_train[model_features]
log_train_y = df_titan_train[label]

Log_model = LogisticRegression()
Log_model.fit(log_train_X, log_train_y)
Log_train_prediction = Log_model.predict(log_train_X)

# The score is computed on the same rows the model was fitted on, so it is
# not an estimate of accuracy on unseen data.
log_train_accuracy = round(Log_model.score(log_train_X, log_train_y) * 100, 2)
print('Training Accuracy : ', log_train_accuracy, '%')

Log_test_prediction = Log_model.predict(df_titan_test[model_features])

Training Accuracy :  83.28 %


In [50]:
# K-Nearest Neighbour - Model
# Fit on the whole training frame, then predict the training and test rows.
knn_train_X = df_titan_train[model_features]
knn_train_y = df_titan_train[label]

KNN_model = KNeighborsClassifier()
KNN_model.fit(knn_train_X, knn_train_y)
KNN_train_prediction = KNN_model.predict(knn_train_X)

# The score is computed on the same rows the model was fitted on, so it is
# not an estimate of accuracy on unseen data.
knn_train_accuracy = round(KNN_model.score(knn_train_X, knn_train_y) * 100, 2)
print('Training Accuracy : ', knn_train_accuracy, '%')

KNN_test_prediction = KNN_model.predict(df_titan_test[model_features])

Training Accuracy :  85.07 %


In [51]:
# Naive Bayes - Model
# Fit on the whole training frame, then predict the training and test rows.
gnb_train_X = df_titan_train[model_features]
gnb_train_y = df_titan_train[label]

GNB_model = GaussianNB()
GNB_model.fit(gnb_train_X, gnb_train_y)
GNB_train_prediction = GNB_model.predict(gnb_train_X)

# The score is computed on the same rows the model was fitted on, so it is
# not an estimate of accuracy on unseen data.
gnb_train_accuracy = round(GNB_model.score(gnb_train_X, gnb_train_y) * 100, 2)
print('Training Accuracy : ', gnb_train_accuracy, '%')

GNB_test_prediction = GNB_model.predict(df_titan_test[model_features])

Training Accuracy :  68.91 %


In [53]:
# Decision Tree - Model
# Fit on the whole training frame, then predict the training and test rows.
dt_train_X = df_titan_train[model_features]
dt_train_y = df_titan_train[label]

# random_state is pinned so the fitted tree, and therefore the test
# predictions, are the same on every run. scikit-learn permutes the features
# at each split, so without a seed, ties between equally good splits can be
# broken differently from one run to the next.
DT_model = DecisionTreeClassifier(random_state=42)
DT_model.fit(dt_train_X, dt_train_y)
DT_train_prediction = DT_model.predict(dt_train_X)

# The score is computed on the same rows the model was fitted on, so it is
# not an estimate of accuracy on unseen data.
dt_train_accuracy = round(DT_model.score(dt_train_X, dt_train_y) * 100, 2)
print('Training Accuracy : ', dt_train_accuracy, '%')

DT_test_prediction = DT_model.predict(df_titan_test[model_features])

Training Accuracy :  89.45 %
