In [2]:

# load libraries 

In [68]:
import warnings
warnings.filterwarnings("ignore")

In [40]:
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 

In [41]:
# filling missing data 
# models from sklearn package
from sklearn.experimental  import enable_iterative_imputer
from sklearn import (impute, model_selection) 

In [42]:
# read data from an excel file directly from a folder
# the original link to data in the book (ML pocket reference) is not working
df = pd.read_excel('titanic3.xlsx')
orig_df = df 
orig_df.head() # check the first few rows

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [43]:
# dimension of the data table 
df.shape

(1309, 14)

In [44]:
# check for missing data 
df.isnull().mean()*100

pclass        0.000000
survived      0.000000
name          0.000000
sex           0.000000
age          20.091673
sibsp         0.000000
parch         0.000000
ticket        0.000000
fare          0.076394
cabin        77.463713
embarked      0.152788
boat         62.872422
body         90.756303
home.dest    43.086325
dtype: float64

In [None]:
# check the first two columns
df.describe().iloc[:, :2]

Unnamed: 0,pclass,survived
count,1309.0,1309.0
mean,2.294882,0.381971
std,0.837836,0.486055
min,1.0,0.0
25%,2.0,0.0
50%,3.0,0.0
75%,3.0,1.0
max,3.0,1.0


In [8]:
# how many missing values or NA
df.isnull().sum()

pclass          0
survived        0
name            0
sex             0
age           263
sibsp           0
parch           0
ticket          0
fare            1
cabin        1014
embarked        2
boat          823
body         1188
home.dest     564
dtype: int64

In [45]:
df.isnull().sum(axis=1).loc[:2]

0    1
1    1
2    2
dtype: int64

In [46]:
# drop the 'useless' columns
df = df.drop(
    columns = ["name", "ticket", "home.dest", "boat", "body", "cabin"]
)

In [47]:
df.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked
0,1,1,female,29.0,0,0,211.3375,S
1,1,1,male,0.9167,1,2,151.55,S
2,1,0,female,2.0,1,2,151.55,S
3,1,0,male,30.0,1,2,151.55,S
4,1,0,female,25.0,1,2,151.55,S


In [48]:
df.pclass.value_counts()

pclass
3    709
1    323
2    277
Name: count, dtype: int64

In [49]:
# one-hot encoding
df = pd.get_dummies(df)

In [50]:
# check the number of columns, and the way it is coded 
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,True,False,False,False,True
1,1,1,0.9167,1,2,151.55,False,True,False,False,True
2,1,0,2.0,1,2,151.55,True,False,False,False,True
3,1,0,30.0,1,2,151.55,False,True,False,False,True
4,1,0,25.0,1,2,151.55,True,False,False,False,True


In [51]:
df = df.drop(columns = "sex_male")

In [52]:
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,True,False,False,True
1,1,1,0.9167,1,2,151.55,False,False,False,True
2,1,0,2.0,1,2,151.55,True,False,False,True
3,1,0,30.0,1,2,151.55,False,False,False,True
4,1,0,25.0,1,2,151.55,True,False,False,True


In [53]:
# reduce degree of freedom by one
df = pd.get_dummies(df, drop_first=True)

In [None]:
# not sure the embarked_C is still here
df.head()

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
0,1,1,29.0,0,0,211.3375,True,False,False,True
1,1,1,0.9167,1,2,151.55,False,False,False,True
2,1,0,2.0,1,2,151.55,True,False,False,True
3,1,0,30.0,1,2,151.55,False,False,False,True
4,1,0,25.0,1,2,151.55,True,False,False,True


In [55]:
# prepare the data input and label / target columns 
y = df.survived
X = df.drop(columns = "survived")
X.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
0,1,29.0,0,0,211.3375,True,False,False,True
1,1,0.9167,1,2,151.55,False,False,False,True
2,1,2.0,1,2,151.55,True,False,False,True
3,1,30.0,1,2,151.55,False,False,False,True
4,1,25.0,1,2,151.55,True,False,False,True


In [22]:
# split the data using sklearn functions 
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train.shape

(1047, 9)

In [23]:
# check how many rows of test data set
X_test.shape

(262, 9)

In [56]:
# deal with missing values
from sklearn import impute 
num_cols = [
    "pclass", "age", "sibsp", "parch", "fare", "sex_female"
]

In [None]:
imputer = impute.IterativeImputer() 
imputed = imputer.fit_transform(X_train[num_cols])

# apply the same method to test data set too
X_train.loc[:, num_cols] = imputed 
imputed = imputer.fit_transform(X_test[num_cols])
X_test.loc[:, num_cols] = imputed

In [58]:
X_test.head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,embarked_C,embarked_Q,embarked_S
0,0.840359,0.447694,-0.495964,-0.442432,-0.509695,-0.739276,-0.499403,-0.324956,0.648555
1,0.840359,-0.685925,0.456833,0.676472,-0.343259,-0.739276,2.002391,-0.324956,-1.541888
2,0.840359,-0.137134,-0.495964,-0.442432,-0.494806,-0.739276,-0.499403,-0.324956,0.648555
3,0.840359,-0.136842,-0.495964,-0.442432,-0.491828,-0.739276,-0.499403,-0.324956,0.648555
4,0.840359,-0.13741,-0.495964,-0.442432,-0.497622,-0.739276,-0.499403,3.077337,-1.541888


In [59]:
# no missing values anymore
X_test.isnull().sum()

pclass        0
age           0
sibsp         0
parch         0
fare          0
sex_female    0
embarked_C    0
embarked_Q    0
embarked_S    0
dtype: int64

In [28]:
# select features / colmns as input data 
cols = ['pclass', 'age', 'sibsp', 'parch', 'fare', 'sex_female', 'embarked_C',
   'embarked_Q', 'embarked_S']
sca = preprocessing.StandardScaler()
X_train = sca.fit_transform(X_train)
X_train = pd.DataFrame(X_train, columns=cols)
X_test = sca.transform(X_test)
X_test = pd.DataFrame(X_test, columns=cols)

In [60]:
# select dummy model as baseline
from sklearn.dummy import DummyClassifier 
dc = DummyClassifier()
dc.fit(X_train, y_train) 
dc.score(X_test, y_test)

0.549618320610687

In [61]:
from sklearn.metrics import (confusion_matrix, 
                             accuracy_score, 
                             classification_report)

In [62]:
# predict on test data
y_pred = dc.predict(X_test)

In [63]:
# check the confusion matrix
print("confusion matrix: \n", confusion_matrix(y_test, y_pred))

confusion matrix: 
 [[144   0]
 [118   0]]


In [69]:
print("report: \n", classification_report(y_test, y_pred))

report: 
               precision    recall  f1-score   support

           0       0.55      1.00      0.71       144
           1       0.00      0.00      0.00       118

    accuracy                           0.55       262
   macro avg       0.27      0.50      0.35       262
weighted avg       0.30      0.55      0.39       262



In [65]:
# check the NaN in the data frame 
mask = orig_df.isnull().any(axis = 1)
mask

0       True
1       True
2       True
3       True
4       True
        ... 
1304    True
1305    True
1306    True
1307    True
1308    True
Length: 1309, dtype: bool

In [66]:
orig_df[mask].body.head()

0      NaN
1      NaN
2      NaN
3    135.0
4      NaN
Name: body, dtype: float64

In [67]:
orig_df[mask].head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
