In [67]:
import pandas as pd
import numpy as np

In [68]:
from sklearn.model_selection import train_test_split

In [69]:
df = pd.read_csv('./titanic/train.csv')
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [70]:
df_t = pd.read_csv('./titanic/test.csv')
df_t

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


Comparing the train data with the test data, Survived feature is the target to be predicted.

Select the features to predict target using in ML.

## Data Dictionary

Variable	Definition	Key

survival	Survival	0 = No, 1 = Yes

pclass	Ticket class	1 = 1st, 2 = 2nd, 3 = 3rd

sex	Sex	

Age	Age in years	

sibsp	# of siblings / spouses aboard the Titanic	

parch	# of parents / children aboard the Titanic	

ticket	Ticket number	

fare	Passenger fare	

cabin	Cabin number	

embarked	Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton

Variable Notes

pclass: A proxy for socio-economic status (SES)

1st = Upper

2nd = Middle

3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way...

Sibling = brother, sister, stepbrother, stepsister

Spouse = husband, wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way...

Parent = mother, father

Child = daughter, son, stepdaughter, stepson

Some children travelled only with a nanny, therefore parch=0 for them.


In [71]:
# select features(pclass, sex, fare, cabin), target(survived)

X = df.loc[:,['Pclass', 'Sex', 'Cabin']]
y = df.loc[:, 'Survived']

In [72]:
X.isna().sum()

Pclass      0
Sex         0
Cabin     687
dtype: int64

In [73]:
X.fillna(0, inplace=True)
X['Cabin'] = np.where(X['Cabin'] == 0, 0, 1)

In [74]:
X.isna().sum()

Pclass    0
Sex       0
Cabin     0
dtype: int64

In [75]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Pclass  891 non-null    int64 
 1   Sex     891 non-null    object
 2   Cabin   891 non-null    int32 
dtypes: int32(1), int64(1), object(1)
memory usage: 17.5+ KB


In [76]:
y.info()

<class 'pandas.core.series.Series'>
RangeIndex: 891 entries, 0 to 890
Series name: Survived
Non-Null Count  Dtype
--------------  -----
891 non-null    int64
dtypes: int64(1)
memory usage: 7.1 KB


In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [78]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer

In [79]:
pipe = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    DecisionTreeClassifier(random_state=42, criterion='entropy')
)


In [80]:
X_train

Unnamed: 0,Pclass,Sex,Cabin
445,1,male,1
650,3,male,0
172,3,female,0
450,2,male,0
314,2,male,0
...,...,...,...
106,3,female,0
270,1,male,0
860,3,male,0
435,1,female,1


In [81]:
pipe.fit(X_train, y_train)

print(pipe.score(X_train, y_train))

0.8009630818619583


In [82]:
X_test

Unnamed: 0,Pclass,Sex,Cabin
709,3,male,0
439,2,male,0
840,3,male,0
720,2,female,0
39,3,female,0
...,...,...,...
821,3,male,0
633,1,male,0
456,1,male,1
500,3,male,0


In [83]:
from sklearn.metrics import classification_report

In [84]:
y_pred = pipe.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.98      0.86       392
           1       0.95      0.49      0.65       231

    accuracy                           0.80       623
   macro avg       0.86      0.74      0.75       623
weighted avg       0.83      0.80      0.78       623



In [85]:
pipe_fin = make_pipeline(
    OrdinalEncoder(),
    SimpleImputer(),
    DecisionTreeClassifier(random_state=42, criterion='entropy')
)
pipe_fin.fit(X, y)

print(pipe_fin.score(X, y))

0.7912457912457912


In [86]:
X_fin = df_t.loc[:,['Pclass', 'Sex', 'Cabin']]
X_fin.fillna(0, inplace=True)
X_fin['Cabin'] = np.where(X_fin['Cabin'] == 0, 0, 1)

In [87]:
X_fin

Unnamed: 0,Pclass,Sex,Cabin
0,3,male,0
1,3,female,0
2,2,male,0
3,3,male,0
4,3,female,0
...,...,...,...
413,3,male,0
414,1,female,1
415,3,male,0
416,3,male,0


In [88]:
pipe_fin

In [89]:
y_fin_pred = pipe_fin.predict(X_fin)

In [93]:
len(y_fin_pred)

418

In [97]:
submission = pd.DataFrame({'PassengerId' : df_t['PassengerId'], 'Survived' : y_fin_pred})
submission.to_csv('submission.csv', index=False)