In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer 

In [2]:
df = pd.read_csv('train.csv',usecols=['Age','Fare','Survived'])

In [3]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [4]:
df.dropna(inplace=True)

In [5]:
df.isnull().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [6]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [7]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [9]:
X_train.head()

Unnamed: 0,Age,Fare
328,31.0,20.525
73,26.0,14.4542
253,30.0,16.1
719,33.0,7.775
666,25.0,13.0


In [10]:
clf = DecisionTreeClassifier()

In [11]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)

In [12]:
accuracy_score(y_test,y_pred)

0.6153846153846154

In [13]:
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

0.6358763693270735

In [33]:
kbin_age = KBinsDiscretizer(n_bins=15,encode = 'ordinal',strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=15,encode = 'ordinal',strategy='quantile')

In [15]:
trf = ColumnTransformer([
	('first',kbin_age,[0]),
	('second',kbin_fare,[1])
])

In [17]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [24]:
X_train_trf

array([[5., 5.],
       [4., 4.],
       [5., 5.],
       ...,
       [7., 9.],
       [6., 5.],
       [6., 2.]])

In [18]:
trf.named_transformers_

{'first': KBinsDiscretizer(encode='ordinal', n_bins=10),
 'second': KBinsDiscretizer(encode='ordinal', n_bins=10)}

In [19]:
trf.named_transformers_['first'].n_bins_

array([10])

In [20]:
trf.named_transformers_['second'].n_bins_

array([10])

In [22]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42, 14.  , 19.  , 22.  , 25.  , 28.5 , 32.  , 36.  , 42.  ,
              50.  , 80.  ])                                                ],
      dtype=object)

In [23]:
trf.named_transformers_['second'].bin_edges_

array([array([  0.    ,   7.75  ,   7.8958,   9.225 ,  13.    ,  15.75  ,
               26.    ,  29.125 ,  51.4792,  82.1708, 512.3292])         ],
      dtype=object)

In [25]:
output = pd.DataFrame({
	'age':X_train['Age'],
	'age_trf':X_train_trf[:,0], 
	'fare':X_train['Fare'], 
	'fare_trf':X_train_trf[:,1]
})

In [28]:
output['age_labels'] = pd.cut(x = X_train['Age'], 
							 bins = trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x = X_train['Fare'], 
							  bins = trf.named_transformers_['second'].bin_edges_[0].tolist())

In [29]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
701,35.0,6.0,26.2875,6.0,"(32.0, 36.0]","(26.0, 29.125]"
345,24.0,3.0,13.0,4.0,"(22.0, 25.0]","(9.225, 13.0]"
626,57.0,9.0,12.35,3.0,"(50.0, 80.0]","(9.225, 13.0]"
535,7.0,0.0,26.25,6.0,"(0.42, 14.0]","(26.0, 29.125]"
706,45.0,8.0,13.5,4.0,"(42.0, 50.0]","(13.0, 15.75]"


In [34]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf,y_train)
y_pred2 = clf.predict(X_test_trf)

In [35]:
accuracy_score(y_test,y_pred2)

0.6223776223776224

In [36]:
X_trf = trf.fit_transform(X)
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

0.630281690140845

In [37]:
df = pd.read_csv('train.csv')[['Age','Fare','SibSp','Parch','Survived']]

In [38]:
df.dropna(inplace=True)

In [39]:
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived
0,22.0,7.25,1,0,0
1,38.0,71.2833,1,0,1
2,26.0,7.925,0,0,1
3,35.0,53.1,1,0,1
4,35.0,8.05,0,0,0


In [40]:
df['Family'] = df['SibSp'] + df['Parch']

In [41]:
df.head()

Unnamed: 0,Age,Fare,SibSp,Parch,Survived,Family
0,22.0,7.25,1,0,0,1
1,38.0,71.2833,1,0,1,1
2,26.0,7.925,0,0,1,0
3,35.0,53.1,1,0,1,1
4,35.0,8.05,0,0,0,0


In [43]:
df.drop(columns=['SibSp','Parch'],inplace=True)

In [44]:
df.head()

Unnamed: 0,Age,Fare,Survived,Family
0,22.0,7.25,0,1
1,38.0,71.2833,1,1
2,26.0,7.925,1,0
3,35.0,53.1,1,1
4,35.0,8.05,0,0


In [45]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [46]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [47]:
X_train.head()

Unnamed: 0,Age,Fare,Family
328,31.0,20.525,2
73,26.0,14.4542,1
253,30.0,16.1,1
719,33.0,7.775,0
666,25.0,13.0,0


In [48]:
# Without binarization
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)

0.6083916083916084

In [49]:
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

0.654186228482003

In [50]:
# Applying Binarization
from sklearn.preprocessing import Binarizer

In [53]:
trf = ColumnTransformer([
	('bin',Binarizer(copy = False),['Family'])
],remainder='passthrough')

In [54]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [56]:
pd.DataFrame(X_train_trf,columns=['Family','Age','Fare'])

Unnamed: 0,Family,Age,Fare
0,1.0,31.0,20.5250
1,1.0,26.0,14.4542
2,1.0,30.0,16.1000
3,0.0,33.0,7.7750
4,0.0,25.0,13.0000
...,...,...,...
566,1.0,46.0,61.1750
567,0.0,25.0,13.0000
568,0.0,41.0,134.5000
569,1.0,33.0,20.5250


In [57]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf,y_train)
y_pred2 = clf.predict(X_test_trf)

accuracy_score(y_test,y_pred2)

0.6433566433566433

In [58]:
X_trf = trf.fit_transform(X)
np.mean(cross_val_score(DecisionTreeClassifier(),X_trf,y,cv=10,scoring='accuracy'))

0.6233568075117372