# Titanic Dataset

Predicting whether a passenger would survive or not

Attribute Information:
    
* Survived : Survival(0 = No, 1 = Yes)
* Pclass : Ticket class( 1 = 1st, 2 = 2nd, 3 = 3rd)
* Sex : Sex of the passenger (male / female)
* Age : Age in years
* SibSp : Number of siblings / spouses aboard the Titanic
* Parch : Number of parents / children aboard the Titanic
* Ticket : Ticket number
* Fare : Passenger fare
* Cabin : Cabin number
* Embarked : Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

In [67]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [69]:
# Read the training CSV (path is relative to the working directory) and preview it.
df_titanic = pd.read_csv(filepath_or_buffer='titanic_train.csv')
df_titanic.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [None]:
# Drop the columns that are not used as features:
#   'Unnamed: 0'  - the row index that was written into the CSV (see the preview above)
#   'PassengerId' - a running identifier
#   'Name'        - free text
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the cell re-runnable.
df_titanic = df_titanic.drop(columns=['Unnamed: 0', 'Name', 'PassengerId'], errors='ignore')

In [None]:
# Preview the first five rows of the working frame.
df_titanic.head(5)

In [None]:
# Dimensions of the working frame as (rows, columns).
df_titanic.shape

In [None]:
# Storage dtype of every column.
df_titanic.dtypes

In [None]:
# Class balance of the target.
df_titanic['Survived'].value_counts()

In [None]:
# Distinct ticket classes present in the data.
df_titanic['Pclass'].unique()

In [None]:
# Number of passengers per ticket class.
df_titanic['Pclass'].value_counts()

In [None]:
# Count the missing entries in each column.
df_titanic.isna().sum()

In [None]:
# Visualise where the missing cells sit (one row per passenger, one column per feature).
sns.heatmap(df_titanic.isna())

The dataset has missing values in the "Age", "Cabin" and "Embarked" columns.

# Filling missing data

In [None]:
# Impute missing ages with the column mean.
# fillna is used instead of replace(np.NaN, ...): the np.NaN alias was removed in NumPy 2.0.
df_titanic['Age'] = df_titanic['Age'].fillna(df_titanic['Age'].mean())

In [None]:
# Impute missing categorical values with the most frequent value of each column.
# The result is assigned back explicitly: fillna(inplace=True) on a column selection is a
# chained assignment, which is not guaranteed to update df_titanic under pandas Copy-on-Write.
df_titanic['Embarked'] = df_titanic['Embarked'].fillna(df_titanic['Embarked'].mode().iloc[0])
df_titanic['Cabin'] = df_titanic['Cabin'].fillna(df_titanic['Cabin'].mode().iloc[0])

In [None]:
# Re-check the per-column missing counts after imputation.
df_titanic.isna().sum()

In [None]:
# Column names, non-null counts and dtypes in one summary.
df_titanic.info()

In [None]:
# Summary statistics of the columns.
df_titanic.describe()

In [None]:
# Pairwise correlation of the numeric columns.
# Sex, Ticket, Cabin and Embarked are still strings at this point, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0, so restrict to numeric dtypes explicitly.
df_titanic.select_dtypes(include='number').corr()

In [None]:
# Annotated correlation heatmap of the numeric columns only (the string columns would
# make DataFrame.corr() raise on pandas >= 2.0).
sns.heatmap(df_titanic.select_dtypes(include='number').corr(), annot=True)

In [None]:
# Survival against age.
sns.lineplot(data=df_titanic, x='Age', y='Survived')

In [None]:
# Fare paid per ticket number.
sns.scatterplot(data=df_titanic, x='Ticket', y='Fare')

In [None]:
# Survival against fare.
sns.lineplot(data=df_titanic, x='Fare', y='Survived')
plt.show()

In [None]:
# Passenger counts per ticket class, split by survival outcome.
sns.countplot(data=df_titanic, x='Pclass', hue='Survived')
plt.show()

In [None]:
# Mean survival per sex: female passengers survived more often.
sns.barplot(data=df_titanic, x='Sex', y='Survived')

In [None]:
# Mean survival per number of siblings/spouses aboard:
# passengers with one or two siblings or a spouse survived more often.
sns.barplot(data=df_titanic, x='SibSp', y='Survived')

In [None]:
# Mean survival per number of parents/children aboard.
sns.barplot(data=df_titanic, x='Parch', y='Survived')

In [None]:
# Mean survival per port of embarkation.
sns.barplot(data=df_titanic, x='Embarked', y='Survived')
plt.show()

In [None]:
# Distribution of the fare: the data is skewed.
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot/displot are the
# successors - confirm the installed seaborn version before switching.
sns.distplot(a=df_titanic['Fare'])

In [None]:
# Box plot of SibSp: there are a few extreme outliers.
# The series is passed as x= explicitly: the first positional argument of sns.boxplot is `x`
# in older seaborn but `data` in seaborn >= 0.12, so the bare positional call changes meaning
# between versions.
sns.boxplot(x=df_titanic['SibSp'])

In [None]:
# Box plot of Fare: there are many outliers.
# x= is given explicitly because the first positional argument of sns.boxplot is `x` in older
# seaborn but `data` in seaborn >= 0.12.
sns.boxplot(x=df_titanic['Fare'])

In [None]:
# Remove outliers: keep only the rows whose numeric features all lie within 3 standard
# deviations of their column mean.
print('The shape before removing outliers',df_titanic.shape)
from scipy.stats import zscore

# Sex, Ticket, Cabin and Embarked are still strings here and zscore cannot handle them, so
# only numeric columns are scored. The target is left out: it is a label, not a measurement.
numeric_features = df_titanic.select_dtypes(include='number').drop(columns='Survived', errors='ignore')
z = np.abs(zscore(numeric_features.to_numpy(dtype=float)))

# .copy() detaches the filtered frame so that later column assignments do not write into a slice.
# NOTE: re-running this cell filters the already-filtered frame again.
df_titanic = df_titanic[(z < 3).all(axis=1)].copy()
print('The shape after removing outliers',df_titanic.shape)

In [None]:
# Skewness of the numeric columns (the string columns are excluded explicitly, since
# DataFrame.skew() does not skip them on pandas >= 2.0).
df_titanic.select_dtypes(include='number').skew()

In [None]:
# Reduce right-skew with a square-root transform on every numeric feature whose skewness
# exceeds 0.55.
# The skewness is computed once, on numeric columns only: looking up a string column in the
# skew() result fails. The target column is never transformed.
# assumes the skewed columns are non-negative (sqrt of a negative value yields NaN)
skewness = (
    df_titanic.select_dtypes(include='number')
    .drop(columns='Survived', errors='ignore')
    .skew()
)
for col in skewness[skewness > 0.55].index:
    df_titanic[col] = np.sqrt(df_titanic[col])

In [None]:
# Skewness of the numeric columns after the transform (string columns excluded explicitly).
df_titanic.select_dtypes(include='number').skew()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn each string column into integer codes. The single encoder is re-fitted per column,
# so after the loop `le` only remembers the classes of the last column ('Embarked').
le = LabelEncoder()
for col in ('Sex', 'Cabin', 'Ticket', 'Embarked'):
    df_titanic[col] = le.fit_transform(df_titanic[col])

In [None]:
# Further analysis: column means for every distinct age.
df_titanic.groupby('Age').mean()

In [None]:
# Bucket the (truncated) age into life stages with left-closed bins:
#   [0, 6) Infant, [6, 19) Child, [19, 60) Adult, [60, inf) Elderly
# pd.cut replaces the four chained `replace(..., inplace=True)` calls on a column selection,
# which are not guaranteed to update df_titanic under pandas Copy-on-Write.
# The result is cast to object so the column holds plain strings rather than a categorical.
age_bins = [0, 6, 19, 60, np.inf]
age_labels = ['Infant', 'Child', 'Adult', 'Elderly']
df_titanic['Age_group'] = pd.cut(
    df_titanic['Age'].astype(int),
    bins=age_bins,
    labels=age_labels,
    right=False,
).astype(object)

In [None]:
# Number of passengers in each age group.
df_titanic['Age_group'].value_counts()

In [None]:
# Column means per age group.
# Observations: infants show a high survival rate, and the fare of the
# elderly appears higher than that of the other age groups.
df_titanic.groupby('Age_group').mean()

In [None]:
# Column means per sex (Sex is label-encoded by this point).
# Observations: women survive at a higher rate than men; their fare is also higher,
# which may indicate that women travelled in a higher class.
df_titanic.groupby('Sex').mean()

In [None]:
# Age_group was only a helper for the exploratory tables above; remove it again.
df_titanic.drop(columns=['Age_group'], inplace=True)

In [None]:
# Kernel-density estimate of every column, one panel each on a 4x3 grid with independent x axes.
df_titanic.plot.kde(subplots=True, layout=(4, 3), sharex=False)
plt.show()

In [None]:
# Pairwise relationships between all columns.
sns.pairplot(data=df_titanic)
plt.show()

In [None]:
# Split the frame into the feature matrix x and the target y.
# y is kept as a 1-D Series: wrapping it in a one-column DataFrame makes scikit-learn
# estimators emit a DataConversionWarning ("column-vector y was passed") on every fit.
x = df_titanic.drop(columns='Survived')
y = df_titanic['Survived']

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; x becomes a NumPy array from here on.
# NOTE(review): the scaler is fitted on all rows before the train/test split, so statistics
# of the later test rows leak into training - consider fitting on the training part only.
scale = StandardScaler()
x = scale.fit(x).transform(x)

In [None]:
# Shape of the scaled feature matrix.
x.shape

In [None]:
# Shape of the target.
y.shape

In [None]:
#Importing necessary libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,roc_auc_score,roc_curve,auc

In [None]:
# Try every seed from 42 to 100 and remember the one whose 80/20 split gives a default
# LogisticRegression the highest test accuracy.
# NOTE(review): choosing the split by its test score makes the reported accuracy optimistic.
max_accuracy_score = 0
for r_state in range(42, 101):
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=r_state
    )
    lg = LogisticRegression().fit(x_train, y_train)
    pred = lg.predict(x_test)
    acc_scr = accuracy_score(y_test, pred)
    print("accuracy_score corresponding to random state:",r_state, " is: ",acc_scr)
    if acc_scr > max_accuracy_score:
        max_accuracy_score, final_r_state = acc_scr, r_state

print("max accuracy score corresponding to",final_r_state," is ",max_accuracy_score)

In [None]:
# Fix the 80/20 train/test partition used by all the following cells.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=86
)

In [None]:
from sklearn.model_selection import cross_val_score

# Mean 5-fold cross-validated accuracy of a default LogisticRegression on the full data.
cross_val_score(estimator=LogisticRegression(), X=x, y=y, scoring="accuracy", cv=5).mean()

# Using GridSearchCV to select the best parameter

In [None]:
# Grid search over the SVC kernel and the penalty parameter C.
from sklearn.model_selection import GridSearchCV

svc = SVC()
grid_param = dict(kernel=('linear', 'poly', 'rbf'), C=[1, 10])
gd = GridSearchCV(estimator=svc, param_grid=grid_param).fit(x_train, y_train)
gd.best_params_

In [None]:
# Grid search over the number of neighbours (1 to 9) for k-nearest neighbours.
knn = KNeighborsClassifier()
grid_param = dict(n_neighbors=range(1, 10))
gd = GridSearchCV(estimator=knn, param_grid=grid_param).fit(x_train, y_train)
gd.best_params_

In [None]:
# Grid search over the split criterion of the decision tree.
dtc = DecisionTreeClassifier()
grid_param = dict(criterion=['gini', 'entropy'])
gd = GridSearchCV(estimator=dtc, param_grid=grid_param).fit(x_train, y_train)
gd.best_params_

In [None]:
# Grid search over the number of trees in the random forest.
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
grid_param = dict(n_estimators=[10, 100, 500, 1000])
gd = GridSearchCV(estimator=rfc, param_grid=grid_param).fit(x_train, y_train)
gd.best_params_

In [None]:
# Fit and evaluate each candidate model with the hyper-parameters chosen above.
# For every model this prints test accuracy, 5-fold CV accuracy, the confusion matrix,
# the classification report and the ROC AUC, and draws the confusion matrix next to the
# ROC curve. The per-model results are collected in `model`, `score`, `cvs` and `rocscore`.

model=[]
score=[]
cvs=[]
rocscore=[]

candidates = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(criterion='gini'),
    SVC(kernel='rbf',C=10),
    RandomForestClassifier(n_estimators=1000),
    GaussianNB(),
]

for clf in candidates:
    model.append(clf)
    print('\n')
    # np.ravel accepts y both as a Series and as a one-column DataFrame.
    clf.fit(x_train, np.ravel(y_train))
    # The training accuracy used to be computed and thrown away; show it so that
    # over-fitting is visible next to the test accuracy.
    print('Training accuracy of', clf, 'is:', clf.score(x_train, np.ravel(y_train)))
    pred=clf.predict(x_test)
    print(pred)
    ac=accuracy_score(y_test,pred)
    print('Accuracy score of',clf,'is:',ac)
    score.append(ac)
    print('\n')
    cv_score=cross_val_score(clf,x,np.ravel(y),cv=5,scoring='accuracy').mean()
    print('The CV Score is', cv_score)
    cvs.append(cv_score)
    print('\n')
    cm=confusion_matrix(y_test,pred)
    print(cm)
    print('\n')
    print(classification_report(y_test,pred))
    print('\n')

    # A ROC curve needs a continuous score per sample. Feeding it the hard 0/1 predictions
    # collapses the curve to a single operating point and misstates the AUC.
    if hasattr(clf, 'predict_proba'):
        y_score = clf.predict_proba(x_test)[:, 1]
    else:
        # e.g. SVC without probability=True only exposes decision_function
        y_score = clf.decision_function(x_test)
    false_positive_rate,true_positive_rate,thresholds=roc_curve(y_test,y_score)
    roc_auc=auc(false_positive_rate,true_positive_rate)
    print('roc_auc_score',roc_auc)
    rocscore.append(roc_auc)
    print('\n')

    # Two panels side by side instead of two slots of a mostly empty 9-row grid.
    fig, (ax_cm, ax_roc) = plt.subplots(1, 2, figsize=(12, 4))
    sns.heatmap(cm, annot=True, fmt='d', ax=ax_cm)
    ax_cm.set(title='Confusion matrix', xlabel='Predicted', ylabel='Actual')
    ax_roc.plot(false_positive_rate,true_positive_rate,label='AUC = %0.2f'%roc_auc)
    ax_roc.plot([0,1],[0,1],'r--')
    ax_roc.set(title=type(clf).__name__, xlabel='False positive rate', ylabel='True positive rate')
    # Without a legend the AUC label was never displayed.
    ax_roc.legend(loc='lower right')
    plt.show()

In [None]:
# Summary table: one row per evaluated model, with its test accuracy and mean CV accuracy.
# The model names are taken from the fitted estimators collected in `model`.
result = pd.DataFrame({
    'Model': [type(fitted).__name__ for fitted in model],
    'Accuracy_Score': score,
    'Cross_val_score': cvs,
})
result

Since LogisticRegression gives better results than the other models, we finalize it.

In [None]:
# Saving the model
# sklearn.externals.joblib was removed in scikit-learn 0.23; joblib is imported directly.
import joblib

# `lg` was last fitted inside the random-state search loop, i.e. on the split of the last
# seed tried, not on the finalized train/test split. Refit it on the final training data
# so that the persisted model is the one the notebook evaluated.
lg = LogisticRegression()
lg.fit(x_train, np.ravel(y_train))

# NOTE: the model expects features scaled by `scale`; the scaler itself is not persisted here.
joblib.dump(lg,'titanic_lg.pkl')