In [84]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [85]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer

In [86]:
df = pd.read_csv('train.csv', usecols = ['Age','Fare','Survived'])

In [87]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [88]:
df.dropna(inplace = True)

In [89]:
df

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.2500
1,1,38.0,71.2833
2,1,26.0,7.9250
3,1,35.0,53.1000
4,0,35.0,8.0500
...,...,...,...
885,0,39.0,29.1250
886,0,27.0,13.0000
887,1,19.0,30.0000
889,1,26.0,30.0000


In [90]:
x = df.drop(columns = ['Survived'])
y = df['Survived']

In [91]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size = 0.2)

In [92]:
X_train.head()

Unnamed: 0,Age,Fare
876,20.0,9.8458
53,29.0,26.0
243,22.0,7.125
782,29.0,30.0
331,45.5,28.5


In [93]:
X_train.describe()

Unnamed: 0,Age,Fare
count,571.0,571.0
mean,29.968196,34.528619
std,14.586918,52.438305
min,0.42,0.0
25%,20.25,8.05
50%,28.0,15.2458
75%,38.5,33.25
max,80.0,512.3292


In [94]:
clf1 = DecisionTreeClassifier()

In [98]:
clf.fit(X_train,y_train)
y_predict = clf.predict(X_test)

In [99]:
accuracy_score(y_predict,y_test)

0.6083916083916084

# Now by applying the Binning on the Age and Fare 

In [25]:
Kbin_age = KBinsDiscretizer(n_bins = 10, strategy = 'quantile', encode = 'ordinal')
Kbin_fare = KBinsDiscretizer(n_bins = 10, strategy = 'quantile', encode = 'ordinal')

In [30]:
trf = ColumnTransformer(transformers = [ ('first', Kbin_age,['Age']), ('second', Kbin_fare, ['Fare']) ])

In [32]:
X_train_transform = trf.fit_transform(X_train)
X_test_transform = trf.transform(X_test)

In [45]:
X_train_transform

array([[5., 6.],
       [2., 1.],
       [3., 2.],
       ...,
       [4., 0.],
       [5., 2.],
       [8., 0.]])

In [56]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42, 13.  , 18.  , 22.  , 25.  , 28.  , 32.  , 36.  , 41.  ,
              50.  , 80.  ])                                                ],
      dtype=object)

In [41]:
trf.named_transformers_['second'].bin_edges_

array([array([  0.    ,   7.75  ,   7.925 ,   9.5   ,  13.    ,  16.1   ,
               26.    ,  30.    ,  50.4958,  79.65  , 512.3292])         ],
      dtype=object)

In [57]:
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_lable_no': X_train_transform[:,0],
    'fare': X_train['Fare'],
    'fare_lable_no':X_train_transform[:,1]
})

In [64]:
output 

Unnamed: 0,age,age_lable_no,fare,fare_lable_no,age_labels,fare_labels
801,31.0,5.0,26.2500,6.0,"(28.0, 32.0]","(26.0, 30.0]"
192,19.0,2.0,7.8542,1.0,"(18.0, 22.0]","(7.75, 7.925]"
225,22.0,3.0,9.3500,2.0,"(18.0, 22.0]","(7.925, 9.5]"
489,9.0,0.0,15.9000,4.0,"(0.42, 13.0]","(13.0, 16.1]"
581,39.0,7.0,110.8833,9.0,"(36.0, 41.0]","(79.65, 512.329]"
...,...,...,...,...,...,...
449,52.0,9.0,30.5000,7.0,"(50.0, 80.0]","(30.0, 50.496]"
406,51.0,9.0,7.7500,1.0,"(50.0, 80.0]","(0.0, 7.75]"
271,25.0,4.0,0.0000,0.0,"(22.0, 25.0]",
713,29.0,5.0,9.4833,2.0,"(28.0, 32.0]","(7.925, 9.5]"


In [63]:
output['age_labels'] = pd.cut(x = X_train['Age'], bins = trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x= X_train['Fare'],bins = trf.named_transformers_['second'].bin_edges_[0].tolist())

In [68]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transform,y_train)
y_predict = clf.predict(X_test_transform)

In [69]:
accuracy_score(y_predict,y_test)

0.7342657342657343

In [70]:
from sklearn.model_selection import cross_val_score

In [72]:
X_trf = trf.transform(x)

In [83]:
np.mean(cross_val_score(clf,X_trf,y,scoring ='accuracy', cv = 100))

0.7023214285714284