In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [10]:
df = pd.read_csv('titanic.csv')
df.shape

(891, 12)

In [34]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [53]:
df.isnull().any()

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

## Handling Missing values:

### Drop the NaN rows in Age column, however, causing lost of 177 instances 

In [59]:
df_ = df.drop(['Cabin', 'Embarked'], axis = 'columns')
df_.shape

(891, 10)

In [60]:
df_ = df_.dropna()
df_.shape

(714, 10)

In [61]:
X = df_[['Pclass', 'Sex', 'Age', 'Fare']]
X.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [62]:
X.shape

(714, 4)

In [63]:
X.isnull().any()

Pclass    False
Sex       False
Age       False
Fare      False
dtype: bool

In [64]:
le = LabelEncoder()
X['Sex_n'] = le.fit_transform(X['Sex'])
X.head()

# male: 1; female: 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex_n'] = le.fit_transform(X['Sex'])


Unnamed: 0,Pclass,Sex,Age,Fare,Sex_n
0,3,male,22.0,7.25,1
1,1,female,38.0,71.2833,0
2,3,female,26.0,7.925,0
3,1,female,35.0,53.1,0
4,3,male,35.0,8.05,1


In [65]:
X = X.drop('Sex', axis = 'columns')
X.head()

Unnamed: 0,Pclass,Age,Fare,Sex_n
0,3,22.0,7.25,1
1,1,38.0,71.2833,0
2,3,26.0,7.925,0
3,1,35.0,53.1,0
4,3,35.0,8.05,1


In [69]:
y = df_.Survived

In [70]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [71]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [72]:
model.score(X_test, y_test)

0.7762237762237763

## Handling Missing values:

### Using Multivariate Imputation By Chained Equations(MICE) to impute missing values 

In [137]:
df.shape

(891, 12)

In [138]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [139]:
X = df[['Pclass', 'Age', 'Fare']]
X.shape

(891, 3)

In [141]:
X.isna().sum()

Pclass      0
Age       177
Fare        0
dtype: int64

In [115]:
X.corr()
# not direct linear regression relationship 

Unnamed: 0,Pclass,Age,Fare
Pclass,1.0,-0.369226,-0.5495
Age,-0.369226,1.0,0.096067
Fare,-0.5495,0.096067,1.0


In [150]:
from sklearn.experimental import enable_iterative_imputer # required to use IterativeImputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(verbose=2, max_iter=30, tol=1e-10, imputation_order='ascending' )

In [152]:
imp.fit(X)
X_imputed = imp.transform(X)

[IterativeImputer] Completing matrix with shape (891, 3)
[IterativeImputer] Ending imputation round 1/30, elapsed time 0.01
[IterativeImputer] Change: 11.063066625639827, scaled tolerance: 5.123292e-08 
[IterativeImputer] Ending imputation round 2/30, elapsed time 0.02
[IterativeImputer] Change: 0.0, scaled tolerance: 5.123292e-08 
[IterativeImputer] Early stopping criterion reached.
[IterativeImputer] Completing matrix with shape (891, 3)
[IterativeImputer] Ending imputation round 1/2, elapsed time 0.00
[IterativeImputer] Ending imputation round 2/2, elapsed time 0.00


In [153]:
X_imputed

array([[ 3.        , 22.        ,  7.25      ],
       [ 1.        , 38.        , 71.2833    ],
       [ 3.        , 26.        ,  7.925     ],
       ...,
       [ 3.        , 24.23762903, 23.45      ],
       [ 1.        , 26.        , 30.        ],
       [ 3.        , 32.        ,  7.75      ]])

In [154]:
X.Age = X_imputed[:, 1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [155]:
X.isna().sum()

Pclass    0
Age       0
Fare      0
dtype: int64

In [156]:
X['Sex'] = df.Sex 
X.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex'] = df.Sex


(891, 4)

In [157]:
X.isnull().any()

Pclass    False
Age       False
Fare      False
Sex       False
dtype: bool

In [158]:
le = LabelEncoder()
X['Sex_n'] = le.fit_transform(X['Sex'])
X = X.drop('Sex', axis = 'columns')
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex_n'] = le.fit_transform(X['Sex'])


Unnamed: 0,Pclass,Age,Fare,Sex_n
0,3,22.0,7.25,1
1,1,38.0,71.2833,0
2,3,26.0,7.925,0
3,1,35.0,53.1,0
4,3,35.0,8.05,1


In [159]:
y = df.Survived
y.shape

(891,)

In [160]:
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [161]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [162]:
model.score(X_test, y_test)

0.776536312849162

## Handling Missing values:

### Using SimpleImputer to impute mean or median values - Worse performance as expected, since average or median age does not fit the scenario of this dataset

In [170]:
X = df[['Pclass', 'Age', 'Fare']]
X.isna().sum()

Pclass      0
Age       177
Fare        0
dtype: int64

In [171]:
from sklearn.impute import SimpleImputer

# Mean Imputation
mean_imputer = SimpleImputer(strategy='mean')
mean_imputed = mean_imputer.fit_transform(X)

# Median Imputation
median_imputer = SimpleImputer(strategy='median')
median_imputed = median_imputer.fit_transform(X)

In [172]:
# train model when na age is imputed by mean value
X['Age_mean'] = mean_imputed[:, 1]
X['Age_median'] = median_imputed[:, 1]
X.isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Age_mean'] = mean_imputed[:, 1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Age_median'] = median_imputed[:, 1]


Pclass          0
Age           177
Fare            0
Age_mean        0
Age_median      0
dtype: int64

In [173]:
# one-hot-encoding Sex feature
X['Sex'] = df.Sex 
le = LabelEncoder()
X['Sex_n'] = le.fit_transform(X['Sex'])
X = X.drop('Sex', axis = 'columns')

# target variable
y = df.Survived

# Feature varibales 
X_mean = X[['Pclass', 'Age_mean', 'Fare']]
X_median = X[['Pclass', 'Age_median', 'Fare']]

from sklearn.model_selection import train_test_split 
X_mean_train, X_mean_test, y_train, y_test = train_test_split(X_mean, y, test_size = 0.2)
X_median_train, X_median_test, y_train, y_test = train_test_split(X_mean, y, test_size = 0.2)

from sklearn.tree import DecisionTreeClassifier
model_mean = DecisionTreeClassifier()
model_median = DecisionTreeClassifier()
model_mean.fit(X_mean_train, y_train)
model_median.fit(X_median_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex'] = df.Sex
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex_n'] = le.fit_transform(X['Sex'])


DecisionTreeClassifier()

In [174]:
model_mean.score(X_mean_test, y_test)

0.5083798882681564

In [175]:
model_median.score(X_median_test, y_test)

0.6983240223463687