### Binning

In [35]:
import pandas as pd
import numpy as np

In [36]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [37]:
url ="https://raw.githubusercontent.com/campusx-official/100-days-of-machine-learning/main/day30-function-transformer/train.csv"
df = pd.read_csv(url, usecols=['Age','Fare','Survived'])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [38]:
df.dropna(inplace=True)

In [39]:
df.shape

(714, 3)

In [40]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42)

In [42]:
X_train.head(3)

Unnamed: 0,Age,Fare
328,31.0,20.525
73,26.0,14.4542
253,30.0,16.1


In [43]:
clf = DecisionTreeClassifier()

In [44]:
clf.fit(X_train, y_train)

In [45]:
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6363636363636364

In [46]:
np.mean(cross_val_score(clf, X, y, cv=10, scoring='accuracy'))

0.6289123630672926

In [47]:
kbin_age = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')
kbin_fare = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='kmeans')

In [48]:
kbin_age = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')

In [49]:
trf = ColumnTransformer([
    ('first', kbin_age,[0]),
    ('second', kbin_fare, [1])
])

In [50]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [51]:
trf.named_transformers_['first'].n_bins_

array([10])

In [52]:
trf.named_transformers_

{'first': KBinsDiscretizer(encode='ordinal', n_bins=10),
 'second': KBinsDiscretizer(encode='ordinal', n_bins=10)}

In [53]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42, 14.  , 19.  , 22.  , 25.  , 28.5 , 32.  , 36.  , 42.  ,
              50.  , 80.  ])                                                ],
      dtype=object)

In [54]:
trf.named_transformers_['second'].n_bins_

array([10])

In [55]:
trf.named_transformers_['second'].bin_edges_

array([array([  0.    ,   7.75  ,   7.8958,   9.225 ,  13.    ,  15.75  ,
               26.    ,  29.125 ,  51.4792,  82.1708, 512.3292])         ],
      dtype=object)

In [56]:
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_trf':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_trf': X_train_trf[:,1]
})

In [58]:
output['age_labels'] = pd.cut(x =X_train['Age'], bins = trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels']  = pd.cut(x = X_train['Fare'], bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [59]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
236,44.0,8.0,26.0,6.0,"(42.0, 50.0]","(15.75, 26.0]"
430,28.0,4.0,26.55,6.0,"(25.0, 28.5]","(26.0, 29.125]"
681,27.0,4.0,76.7292,8.0,"(25.0, 28.5]","(51.479, 82.171]"
580,25.0,4.0,30.0,7.0,"(22.0, 25.0]","(29.125, 51.479]"
305,0.92,0.0,151.55,9.0,"(0.42, 14.0]","(82.171, 512.329]"


In [60]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred2 = clf.predict(X_test_trf)



In [61]:
accuracy_score(y_test, y_pred)

0.6363636363636364

In [62]:
X_trf = trf.fit_transform(X)
np.mean(cross_val_score(DecisionTreeClassifier(), X, y, cv=10, scoring='accuracy'))

0.6275234741784037

### Binarization

In [64]:
df = pd.read_csv(url, usecols=['Age','Fare','SibSp','Parch' ,'Survived'])
df.dropna(inplace=True)

In [65]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,1,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,0,35.0,0,0,8.05


In [66]:
df['family'] = df['SibSp'] + df['Parch']

In [67]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,family
0,0,22.0,1,0,7.25,1
1,1,38.0,1,0,71.2833,1
2,1,26.0,0,0,7.925,0
3,1,35.0,1,0,53.1,1
4,0,35.0,0,0,8.05,0


In [69]:
df.drop(columns=['SibSp', 'Parch'], inplace= True)

In [70]:
df.head()

Unnamed: 0,Survived,Age,Fare,family
0,0,22.0,7.25,1
1,1,38.0,71.2833,1
2,1,26.0,7.925,0
3,1,35.0,53.1,1
4,0,35.0,8.05,0


In [71]:
X = df.drop(columns=['Survived'])
y = df['Survived']

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=42)

In [73]:
# Without Binarization

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.6293706293706294

In [74]:
np.mean(cross_val_score(clf, X, y, cv=10, scoring='accuracy'))

0.6512910798122066

In [75]:
# Applying binarix=zation

from sklearn.preprocessing import Binarizer

In [76]:
trf = ColumnTransformer([
    ('bin', Binarizer(copy=False), ['family'])
], remainder='passthrough')

In [77]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [78]:
pd.DataFrame(X_train_trf, columns= ['family', 'Age', 'Fare'])

Unnamed: 0,family,Age,Fare
0,1.0,31.0,20.5250
1,1.0,26.0,14.4542
2,1.0,30.0,16.1000
3,0.0,33.0,7.7750
4,0.0,25.0,13.0000
...,...,...,...
566,1.0,46.0,61.1750
567,0.0,25.0,13.0000
568,0.0,41.0,134.5000
569,1.0,33.0,20.5250


In [79]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf, y_train)

y_pred2 = clf.predict(X_test_trf)
accuracy_score(y_test, y_pred2)

0.6363636363636364

In [80]:
X_trf = trf.fit_transform(X)
np.mean(cross_val_score(clf, X_trf, y, cv=10, scoring = 'accuracy'))

0.6275821596244132