In [1]:
#import the numpy and panda modules to process the data
import pandas as pd
import numpy as np


In [2]:
#Reading the data from csv file to dataframe
titanic_df = pd.read_csv('titanic.csv')

In [3]:
#first 5 rows of dataframe
titanic_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare,Unnamed: 8,Unnamed: 9
0,0,3.0,Mr. Owen Harris Braund,male,22.0,1.0,0,7.25,,
1,1,1.0,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1.0,0,71.2833,,
2,1,3.0,Miss. Laina Heikkinen,female,26.0,0.0,0,7.925,,
3,1,1.0,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1.0,0,53.1,,
4,0,3.0,Mr. William Henry Allen,male,35.0,0.0,0,8.05,,0.0


In [4]:
#getting the information of datatypes of all columns
titanic_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 888 entries, 0 to 887
Data columns (total 10 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 888 non-null    object 
 1   Pclass                   887 non-null    float64
 2   Name                     887 non-null    object 
 3   Sex                      887 non-null    object 
 4   Age                      888 non-null    float64
 5   Siblings/Spouses Aboard  887 non-null    float64
 6   Parents/Children Aboard  888 non-null    int64  
 7   Fare                     888 non-null    float64
 8   Unnamed: 8               0 non-null      float64
 9   Unnamed: 9               6 non-null      float64
dtypes: float64(6), int64(1), object(3)
memory usage: 69.5+ KB


As we can see there are total of 888 rows in the dataset out of which Unnamed: 8 and Unnamed: 9 are having almost Null values. Hence those columns will be of no use. We can drop those 2 columns.

In [6]:
titanic_df=titanic_df.drop(['Unnamed: 8','Unnamed: 9'], axis=1)

In [7]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3.0,Mr. Owen Harris Braund,male,22.0,1.0,0,7.25
1,1,1.0,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1.0,0,71.2833
2,1,3.0,Miss. Laina Heikkinen,female,26.0,0.0,0,7.925
3,1,1.0,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1.0,0,53.1
4,0,3.0,Mr. William Henry Allen,male,35.0,0.0,0,8.05


Here our aim is to classify whether a passenger survived or not. So the column 'Name' that indicates the name of passenger will not contribute for module. We can drop that column. 

In [8]:
titanic_df=titanic_df.drop(['Name'], axis=1)

In [9]:
titanic_df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3.0,male,22.0,1.0,0,7.25
1,1,1.0,female,38.0,1.0,0,71.2833
2,1,3.0,female,26.0,0.0,0,7.925
3,1,1.0,female,35.0,1.0,0,53.1
4,0,3.0,male,35.0,0.0,0,8.05


As we can see from info that few columns has 1 missing value we can consider dropping them or imputing them

In [10]:
titanic_df.dropna(inplace=True)

In [11]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 887 entries, 0 to 886
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    object 
 1   Pclass                   887 non-null    float64
 2   Sex                      887 non-null    object 
 3   Age                      887 non-null    float64
 4   Siblings/Spouses Aboard  887 non-null    float64
 5   Parents/Children Aboard  887 non-null    int64  
 6   Fare                     887 non-null    float64
dtypes: float64(4), int64(1), object(2)
memory usage: 55.4+ KB


We have two columns, Survived and Sex which are in categorical form. We can convert those 2 columns into numerical format

In [12]:
# Converting categorical feature to numeric
#encoding female as 0 and male as 1
titanic_df.Sex=titanic_df.Sex.map({'female':0, 'male':1})

In [13]:
#converting string type to int
titanic_df['Survived'] = titanic_df['Survived'].astype('int')

In [14]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 887 entries, 0 to 886
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int32  
 1   Pclass                   887 non-null    float64
 2   Sex                      887 non-null    int64  
 3   Age                      887 non-null    float64
 4   Siblings/Spouses Aboard  887 non-null    float64
 5   Parents/Children Aboard  887 non-null    int64  
 6   Fare                     887 non-null    float64
dtypes: float64(4), int32(1), int64(2)
memory usage: 52.0 KB


Now all the columns are in numeric data types.

We can now split the dataset into 2. 
-> One containing independent variables
-> Another containing only dependent variable/ Target Variable.
In this case 'Survived' will be target variable as we are trying to predict that column

In [15]:

X = titanic_df.drop('Survived',axis=1)

y = titanic_df.pop('Survived')

In [16]:
X.head()

Unnamed: 0,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,3.0,1,22.0,1.0,0,7.25
1,1.0,0,38.0,1.0,0,71.2833
2,3.0,0,26.0,0.0,0,7.925
3,1.0,0,35.0,1.0,0,53.1
4,3.0,1,35.0,0.0,0,8.05


In [17]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int32

Now we will split the data into training and testing sets wilth 80:20 ratio. To do that we need to import sklearn.model_selection

In [18]:
from sklearn.model_selection import train_test_split,cross_val_score

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [20]:
#checking the shape of train set
X_train.shape

(709, 6)

In [21]:
#checking the shape of test set
X_test.shape

(178, 6)

We will now try building a model using Logistic Regression and check the scores for train and test data

In [22]:
#importing Logistic regression module from sklearn
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train, y_train)

LogisticRegression()

To check the score we need to import sklearn.metrics module

In [24]:
from sklearn.metrics import accuracy_score

#checking score for train set
Y_pred = LR.predict(X_train)
accuracy_score(y_train, Y_pred)

0.8067700987306065

accuracy score for training set is seem to be 80.68%

In [25]:

#checking score for test set
Y_pred = LR.predict(X_test)
accuracy_score(y_test, Y_pred)

0.7865168539325843

accuracy score for testing set is 78.65%. 
Let us also try other models such as Decision tree

In [26]:
from sklearn import tree


DT_model= tree.DecisionTreeClassifier(random_state=1)
DT_model.fit(X_train, y_train)

DecisionTreeClassifier(random_state=1)

In [27]:

#checking score for train set
Y_pred = DT_model.predict(X_train)
accuracy_score(y_train, Y_pred)

0.9816643159379408

In [33]:
#checking score for test set
Y_pred = DT_model.predict(X_test)
accuracy_score(y_test, Y_pred)
model_score = DT_model.score(X_test, y_test)
print(model_score)

0.7696629213483146


In [29]:
from sklearn.ensemble import RandomForestClassifier

RF_model = RandomForestClassifier(n_estimators=200, random_state=0)
RF_model.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [30]:

#checking score for train set
Y_pred = DT_model.predict(X_train)
accuracy_score(y_train, Y_pred)

0.9816643159379408

In [34]:
#checking score for test set
Y_pred = DT_model.predict(X_test)
accuracy_score(y_test, Y_pred)

0.7696629213483146