### Equal Width Binning Example

In [8]:
import pandas as pd
import numpy as np

In [4]:
age = [23, 44, 52, 67, 12, 20, 43, 66, 89, 54, 33]
df = pd.DataFrame({"Age":age})
df

Unnamed: 0,Age
0,23
1,44
2,52
3,67
4,12
5,20
6,43
7,66
8,89
9,54


In [7]:
df['Age_Binned'] = pd.cut(df["Age"], bins=3, labels=["young", "middle-ages", "senior"])
df

Unnamed: 0,Age,Age_Binned
0,23,young
1,44,middle-ages
2,52,middle-ages
3,67,senior
4,12,young
5,20,young
6,43,middle-ages
7,66,senior
8,89,senior
9,54,middle-ages


In [9]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [12]:
df = pd.read_csv('titanic.csv',usecols=['Age','Fare','Survived'])

In [14]:
df.dropna(inplace=True)
df.shape

(714, 3)

In [16]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [18]:
X = df.iloc[:,1:]
y = df.iloc[:,0]

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.head(2)

Unnamed: 0,Age,Fare
328,31.0,20.525
73,26.0,14.4542


In [22]:
# using decision tree as the model
clf = DecisionTreeClassifier()

# training the model
clf.fit(X_train,y_train)

#predicting
y_pred = clf.predict(X_test)

# testing the accuracy
accuracy_score(y_test,y_pred)

0.6293706293706294

In [47]:
# Equal Frequency / Quantile Binning
kbin_age = KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')

In [48]:
trf = ColumnTransformer([
    ('first',kbin_age,[0]),
    ('second',kbin_fare,[1])
])

In [49]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [50]:
trf.named_transformers_

{'first': KBinsDiscretizer(encode='ordinal', n_bins=10),
 'second': KBinsDiscretizer(encode='ordinal', n_bins=10)}

In [52]:
trf.named_transformers_["first"].bin_edges_

array([array([ 0.42, 14.  , 19.  , 22.  , 25.  , 28.5 , 32.  , 36.  , 42.  ,
              50.  , 80.  ])                                                ],
      dtype=object)

In [57]:
output = pd.DataFrame({
    'age':X_train['Age'],
    'age_trf':X_train_trf[:,0],
    'fare':X_train['Fare'],
    'fare_trf':X_train_trf[:,1]
})
output.sample(7)

Unnamed: 0,age,age_trf,fare,fare_trf
322,30.0,5.0,12.35,3.0
113,20.0,2.0,9.825,3.0
432,42.0,8.0,26.0,6.0
681,27.0,4.0,76.7292,8.0
255,29.0,5.0,15.2458,4.0
372,19.0,2.0,8.05,2.0
348,3.0,0.0,15.9,5.0


In [60]:
trf.named_transformers_['first'].bin_edges_[0].tolist()

[0.42, 14.0, 19.0, 22.0, 25.0, 28.5, 32.0, 36.0, 42.0, 50.0, 80.0]

In [58]:
output['age_labels'] = pd.cut(x=X_train['Age'],
                                    bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=X_train['Fare'],
                                    bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [59]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_labels,fare_labels
44,19.0,2.0,7.8792,1.0,"(14.0, 19.0]","(7.75, 7.896]"
37,21.0,2.0,8.05,2.0,"(19.0, 22.0]","(7.896, 9.225]"
342,28.0,4.0,13.0,4.0,"(25.0, 28.5]","(9.225, 13.0]"
519,32.0,6.0,7.8958,2.0,"(28.5, 32.0]","(7.75, 7.896]"
116,70.5,9.0,7.75,1.0,"(50.0, 80.0]","(0.0, 7.75]"
