In [26]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

from sklearn.preprocessing import Binarizer

In [2]:
df = pd.read_csv('train.csv', usecols = ['Age','Fare','SibSp','Parch','Survived'])
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,1,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,0,35.0,0,0,8.05


In [3]:
df.isnull().sum()

Survived      0
Age         177
SibSp         0
Parch         0
Fare          0
dtype: int64

In [4]:
df.shape

(891, 5)

In [5]:
df.dropna(inplace = True)

In [6]:
df.isnull().sum()

Survived    0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [7]:
df.shape

(714, 5)

In [9]:
df['Family'] = df['SibSp'] + df['Parch']

In [10]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Family
0,0,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,1
2,1,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,1
4,0,35.0,0,0,8.05,0


In [12]:
df.drop(columns = ['SibSp','Parch'], inplace = True)

In [13]:
df.head()

Unnamed: 0,Survived,Age,Fare,Family
0,0,22.0,7.25,1
1,1,38.0,71.2833,1
2,1,26.0,7.925,0
3,1,35.0,53.1,1
4,0,35.0,8.05,0


In [16]:
X = df.drop(columns = ['Survived'])
y = df['Survived']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [18]:
X_train.head()

Unnamed: 0,Age,Fare,Family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1
719,33.0,7.775,0
666,25.0,13.0,0


In [19]:
X_test.head()

Unnamed: 0,Age,Fare,Family
149,42.0,13.0,0
407,3.0,18.75,2
53,29.0,26.0,1
369,24.0,69.3,0
818,43.0,6.45,0


In [20]:
y_train.head()

328    1
73     0
253    0
719    0
666    0
Name: Survived, dtype: int64

In [21]:
y_test.head()

149    0
407    1
53     1
369    1
818    0
Name: Survived, dtype: int64

## Without Binarization

In [28]:
clf = DecisionTreeClassifier()

clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

accuracy_score(y_test, y_pred)

0.6293706293706294

In [29]:
np.mean(cross_val_score(DecisionTreeClassifier(), X,y, scoring='accuracy'))

np.float64(0.6596473948586625)

## Apply Binarization

In [33]:
trf = ColumnTransformer(transformers=[
    ('bin', Binarizer(copy = False), ['Family'])
], remainder = 'passthrough')

In [34]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [35]:
pd.DataFrame(X_train_trf, columns = ['Family','Age', 'Fare'])

Unnamed: 0,Family,Age,Fare
0,1.0,31.0,20.5250
1,1.0,26.0,14.4542
2,1.0,30.0,16.1000
3,0.0,33.0,7.7750
4,0.0,25.0,13.0000
...,...,...,...
566,1.0,46.0,61.1750
567,0.0,25.0,13.0000
568,0.0,41.0,134.5000
569,1.0,33.0,20.5250


In [36]:
clf = DecisionTreeClassifier()

clf.fit(X_train_trf, y_train)
y_pred2 = clf.predict(X_test_trf)

accuracy_score(y_pred2, y_test)

0.6433566433566433

In [37]:
x_transform = trf.fit_transform(X)

np.mean(cross_val_score(DecisionTreeClassifier(), x_transform, y))

np.float64(0.6218654584851768)