In [47]:
#### Import Libraries ####

import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from fancyimpute import KNN
from sklearn.metrics import accuracy_score

In [48]:
### Setting up Path ###

os.chdir("/Users/mac/Downloads/")

In [49]:
### Reading the titanic file ###

T1 = pd.read_csv("train.csv")

In [50]:
### Viewing Data ###

T1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [51]:
### Checking number of rows and columns ###

T1.shape

(891, 12)

In [52]:
### Checking for Missing Values if Any (In Percentage) ###

T1.isnull().sum()*100/len(T1)

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [53]:
### Dropping off "cabin" as it has 77% missing values and "Name" & "Ticket" as it does not add any values ###


T1 = T1.drop(['Name','Ticket','Cabin'],axis=1)

In [54]:
### Checking number of rows and columns after dropping some columns ###

T1.shape

(891, 9)

In [55]:
### Viewing the Data again ###

T1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [56]:
### Assigning categorical values to single variable ###

col_names=["Sex","Embarked"]

In [57]:
### Creating Dummies for the categorical Variable ###

for i in col_names:
    temp = pd.get_dummies(T1[i],prefix=i)
    T1 = T1.join(temp)


In [58]:
### Viewing the Data ###

T1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,male,22.0,1,0,7.25,S,0,1,0,0,1
1,2,1,1,female,38.0,1,0,71.2833,C,1,0,1,0,0
2,3,1,3,female,26.0,0,0,7.925,S,1,0,0,0,1
3,4,1,1,female,35.0,1,0,53.1,S,1,0,0,0,1
4,5,0,3,male,35.0,0,0,8.05,S,0,1,0,0,1


In [59]:
### Dropping off the variables for which we created dummies ###

T1 = T1.drop(['Sex','Embarked'],axis=1)

In [60]:
### Viewing the Data ###

T1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,0,1


In [61]:
### Moving the Target Variable(Survived) at the last ###

T1 = T1[['PassengerId','Pclass','Age','SibSp','Parch','Fare','Sex_female','Sex_male','Embarked_C','Embarked_Q','Embarked_S','Survived']]





In [62]:
### Checking Up ###

T1.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,1,3,22.0,1,0,7.25,0,1,0,0,1,0
1,2,1,38.0,1,0,71.2833,1,0,1,0,0,1
2,3,3,26.0,0,0,7.925,1,0,0,0,1,1
3,4,1,35.0,1,0,53.1,1,0,0,0,1,1
4,5,3,35.0,0,0,8.05,0,1,0,0,1,0


In [63]:
### Before building the Model, variable "Age" has some missing values,so we impute with KNN ###

### Imputing KNN for Age- Missing Values as it has 19% missing values ###

T1 = pd.DataFrame(KNN(k=3).fit_transform(T1),columns=T1.columns)

Imputing row 1/891 with 0 missing, elapsed time: 0.109
Imputing row 101/891 with 0 missing, elapsed time: 0.110
Imputing row 201/891 with 0 missing, elapsed time: 0.111
Imputing row 301/891 with 1 missing, elapsed time: 0.112
Imputing row 401/891 with 0 missing, elapsed time: 0.113
Imputing row 501/891 with 0 missing, elapsed time: 0.115
Imputing row 601/891 with 0 missing, elapsed time: 0.116
Imputing row 701/891 with 0 missing, elapsed time: 0.117
Imputing row 801/891 with 0 missing, elapsed time: 0.118


In [64]:
### Importing Library as we are going to implement Logistic Regression ###

import statsmodels.api as sm

In [65]:
### Dividing the data into train & test ###

train, test =train_test_split(T1, test_size=0.2)

In [66]:
### Building the ML Model ###

Model = sm.Logit(train['Survived'],train.iloc[:,0:11]).fit()

         Current function value: 0.440811
         Iterations: 35




In [67]:
### Now Assigning Test Dataset Columns to temp variable "columnsnew" ###

columsnew= test.columns

In [68]:
### Dropping of Target Variable from Test Dataset as we are going to predict it ###

columsnew = columsnew.drop('Survived')

In [69]:
### Checking up the variables once ###

columsnew

Index(['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female',
       'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [70]:
### Storing the predictions in variable called "Predictions" in test dataset ###

test['Predictions']=Model.predict(test[columsnew])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Predictions']=Model.predict(test[columsnew])


In [71]:
### As Predictions are in probabilities, we need to change that into Actual Values like 0 and 1 ###
### For which we create variable called Actual Value in Test Dataset and assign values to 1 for all ###

test['Actual Value'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['Actual Value'] = 1


In [72]:
### But if Prediction value is less than 0.5, then Actual Value will change to 0 ###

test.loc[test.Predictions < 0.5 , 'Actual Value'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [73]:
### Now calculating Accuracy Score ###

ACS = accuracy_score(test['Actual Value'], test['Survived'])

In [75]:
### The Accuracy of this Model for Titanic Survival is below ###

ACS * 100

### It is 82% ###

82.12290502793296