In [1]:
import pandas as pd
import numpy as np

In [3]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [5]:
# Read only the target and the two numeric features used in this notebook.
df = pd.read_csv(
    'train.csv',
    usecols=['Age', 'Fare', 'Survived'],
)
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [8]:
# Discard every row that has a missing value in any of the loaded columns.
df = df.dropna()

In [9]:
# Row/column count after the rows with missing values were dropped.
df.shape

(714, 3)

In [10]:
# Select the feature columns by name rather than by position. read_csv keeps
# the file's own column order when `usecols` is given, so a positional slice
# would silently change (and could even pull in the 'Survived' target) if the
# CSV were laid out differently. The explicit list also fixes the order
# Age, Fare for anything downstream that indexes the features by position.
x = df[['Age', 'Fare']]
x

Unnamed: 0,Age,Fare
0,22.0,7.2500
1,38.0,71.2833
2,26.0,7.9250
3,35.0,53.1000
4,35.0,8.0500
...,...,...
885,39.0,29.1250
886,27.0,13.0000
887,19.0,30.0000
889,26.0,30.0000


In [11]:
# Target vector: the 'Survived' column for every remaining row.
y = df.loc[:, 'Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
885    0
886    0
887    1
889    1
890    0
Name: Survived, Length: 714, dtype: int64

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [18]:
# Peek at the first two rows of the training features.
x_train.head(2)

Unnamed: 0,Age,Fare
328,31.0,20.525
73,26.0,14.4542


In [19]:
# Baseline model: a decision tree trained on the raw (un-binned) Age and Fare.
# NOTE(review): no random_state is passed, so the fitted tree may differ
# slightly between runs -- confirm whether a fixed seed is wanted here.
clf = DecisionTreeClassifier()
clf.fit(X=x_train, y=y_train)

In [20]:
# Predict the held-out rows with the tree fitted on the raw features.
y_pred = clf.predict(x_test)

In [21]:
# Fraction of held-out rows the baseline tree classified correctly.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6293706293706294

In [22]:
# Mean accuracy of a fresh decision tree over 10 cross-validation folds,
# computed on the full raw feature set.
cross_val_score(
    DecisionTreeClassifier(),
    x,
    y,
    cv=10,
    scoring='accuracy',
).mean()

0.6303012519561815

In [23]:
# One discretizer per feature so each learns its own edges: 5 bins, integer
# (ordinal) bin codes, edges chosen by the 'kmeans' strategy.
kbin_age = KBinsDiscretizer(
    n_bins=5,
    encode='ordinal',
    strategy='kmeans',
)
kbin_fare = KBinsDiscretizer(
    n_bins=5,
    encode='ordinal',
    strategy='kmeans',
)

In [24]:
# Route each feature column, addressed by position, to its own discretizer:
# column 0 goes to kbin_age and column 1 goes to kbin_fare.
trf = ColumnTransformer(
    transformers=[
        ('first', kbin_age, [0]),
        ('second', kbin_fare, [1]),
    ]
)

In [25]:
# Fit the discretizers on the training split only, then apply the already
# fitted transformer to the test split (no refitting on test data).
x_train_trf = trf.fit_transform(x_train)
x_test_trf = trf.transform(x_test)

In [27]:
# Number of bins fitted by the 'first' transformer (the one applied to column 0).
trf.named_transformers_['first'].n_bins_

array([5])

In [28]:
# Bin edges fitted by the 'first' transformer (the one applied to column 0).
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42      , 12.69636862, 27.02765256, 39.35418895, 54.21464646,
              80.        ])                                                   ],
      dtype=object)

In [29]:
# Number of bins fitted by the 'second' transformer (the one applied to column 1).
trf.named_transformers_['second'].n_bins_

array([5])

In [32]:
# Side-by-side view of each training value and the bin code it was mapped to,
# keeping the training rows' original index labels.
output = pd.DataFrame(
    {
        'age': x_train['Age'].to_numpy(),
        'age_trf': x_train_trf[:, 0],
        'fare': x_train['Fare'].to_numpy(),
        'fare_trf': x_train_trf[:, 1],
    },
    index=x_train.index,
)

In [33]:
# Human-readable interval for each training age, cut on the edges that the
# 'first' discretizer learned.
# include_lowest=True is required: the first edge is the smallest training
# age, and pd.cut's default intervals are open on the left, so without it that
# row would fall outside every interval and be labelled NaN.
# NOTE(review): pd.cut closes intervals on the right, while the discretizer
# appears to close bins on the left; a value lying exactly on an interior edge
# could therefore show a label one interval below its age_trf code -- confirm.
output['age_labels'] = pd.cut(
    x=x_train['Age'],
    bins=trf.named_transformers_['first'].bin_edges_[0].tolist(),
    include_lowest=True,
)

In [34]:
# Human-readable interval for each training fare, cut on the edges that the
# 'second' discretizer learned.
# include_lowest=True is required: the first edge is the smallest training
# fare, and pd.cut's default intervals are open on the left, so without it
# every row at that minimum fare would be labelled NaN.
# NOTE(review): pd.cut closes intervals on the right, while the discretizer
# appears to close bins on the left; a value lying exactly on an interior edge
# could therefore show a label one interval below its fare_trf code -- confirm.
output['fare_labels'] = pd.cut(
    x=x_train['Fare'],
    bins=trf.named_transformers_['second'].bin_edges_[0].tolist(),
    include_lowest=True,
)

In [39]:
# Spot-check five randomly drawn rows; no random_state is passed, so the rows
# shown differ from run to run.
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
594,37.0,2.0,26.0,0.0,"(27.028, 39.354]","(0.0, 42.424]"
177,50.0,3.0,28.7125,0.0,"(39.354, 54.215]","(0.0, 42.424]"
787,8.0,0.0,29.125,0.0,"(0.42, 12.696]","(0.0, 42.424]"
4,35.0,2.0,8.05,0.0,"(27.028, 39.354]","(0.0, 42.424]"
550,17.0,1.0,110.8833,2.0,"(12.696, 27.028]","(100.624, 186.5]"


In [40]:
# Train a separate tree on the binned features and predict the binned test
# split with that same tree. The new estimator (clf1) must be the one that is
# fitted and used for prediction: fitting `clf` here would leave clf1 unused
# and overwrite the baseline model trained on the raw features.
clf1 = DecisionTreeClassifier()
clf1.fit(x_train_trf, y_train)
y_pred2 = clf1.predict(x_test_trf)

In [42]:
# Fraction of held-out rows classified correctly from the binned features.
accuracy_score(y_true=y_test, y_pred=y_pred2)

0.6223776223776224