In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import scipy.stats as stats

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer


In [3]:
df = pd.read_csv('train.csv', usecols=['Age', 'Fare', 'Survived'])

df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [5]:
x = df.iloc[:,1:]
y = df.iloc[:,0]

In [9]:
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state = 42 )

In [12]:
x_train.head()

Unnamed: 0,Age,Fare
331,45.5,28.5
733,23.0,13.0
382,32.0,7.925
704,26.0,7.8542
813,6.0,31.275


In [15]:
clf = DecisionTreeClassifier()

In [17]:
clf.fit(x_train,y_train)
y_pred = clf.predict(x_test)

In [20]:
accuracy_score(y_test,y_pred)

0.659217877094972

In [23]:
np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring='accuracy'))

0.6599875156054932

In [24]:
kbin_age = KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')
kbin_fare = KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')

In [27]:
trf = ColumnTransformer([
    ('first',kbin_age,[0]),
    ('second',kbin_fare,[1])
])

In [31]:
from sklearn.impute import SimpleImputer

# NaN ko fill karne ke liye
imputer = SimpleImputer(strategy='mean')  
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

x_train_trf = trf.fit_transform(x_train)
x_test_trf = trf.transform(x_test)



In [32]:
trf.named_transformers_['first'].n_bins_

array([9])

In [33]:
trf.named_transformers_['first'].bin_edges_

array([array([ 0.42      , 16.        , 21.        , 24.        , 28.        ,
              29.49884615, 32.        , 38.        , 47.        , 80.        ])],
      dtype=object)

In [39]:
from sklearn.preprocessing import KBinsDiscretizer

# x_train ek numpy array hai, to indexing number se karenge
x_train_selected = x_train[:, [0, 1]]   # suppose 0 = Age, 1 = Fare

# Transformer define karo
transformer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# Fit and transform
x_train_trf = transformer.fit_transform(x_train_selected)

# Output banao
output = pd.DataFrame({
    'age': x_train_selected[:, 0],
    'age_trf': x_train_trf[:, 0],
    'fare': x_train_selected[:, 1],
    'fare_trf': x_train_trf[:, 1]
})

output.head()


Unnamed: 0,age,age_trf,fare,fare_trf
0,45.5,2.0,28.5,0.0
1,23.0,1.0,13.0,0.0
2,32.0,1.0,7.925,0.0
3,26.0,1.0,7.8542,0.0
4,6.0,0.0,31.275,0.0


In [42]:
clf = DecisionTreeClassifier()
clf.fit(x_train_trf,y_train)
y_pred2 = clf.predict(x_test_trf)

In [43]:
accuracy_score(y_test,y_pred2)

0.547486033519553

In [50]:
from sklearn.impute import SimpleImputer

# Pehle NaN ko fill karlo
imputer = SimpleImputer(strategy='mean')  # ya 'median' ya 'most_frequent'
x_filled = imputer.fit_transform(x)

# Ab discretizer lagao
x_trf = trf.fit_transform(x_filled)
np.mean(cross_val_score(DecisionTreeClassifier(), x_trf, y, cv=10, scoring="accuracy"))




0.6825468164794009

In [51]:
def discretize(bins,strategy):
    kbin_age = KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    kbin_fare = KBinsDiscretizer(n_bins=bins,encode='ordinal',strategy=strategy)
    
    trf = ColumnTransformer([
        ('first',kbin_age,[0]),
        ('second',kbin_fare,[1])
    ])
    
    x_trf = trf.fit_transform(x)
    print(np.mean(cross_val_score(DecisionTreeClassifier(),x,y,cv=10,scoring='accuracy')))
    
    plt.figure(figsize=(14,4))
    plt.subplot(121)
    plt.hist(x['Age'])
    plt.title("Before")

    plt.subplot(122)
    plt.hist(x_trf[:,0],color='red')
    plt.title("After")

    plt.show()
    
    plt.figure(figsize=(14,4))
    plt.subplot(121)
    plt.hist(x['Fare'])
    plt.title("Before")

    plt.subplot(122)
    plt.hist(x_trf[:,1],color='red')
    plt.title("Fare")

    plt.show()

In [57]:

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer
import numpy as np

def discretize(bins, strategy):
    # Create an imputer to handle missing values
    imputer = SimpleImputer(strategy='mean')  # You can change the strategy to 'median' or 'most_frequent' if needed

    # Apply the imputer to fill missing values in your data
    x_filled = imputer.fit_transform(x)  # 'x' is your input data

    # Create KBinsDiscretizer
    kbin_age = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy)
    kbin_fare = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy=strategy)

    # Apply KBinsDiscretizer after imputing missing values
    trf = ColumnTransformer([
        ('first', kbin_age, [0]),
        ('second', kbin_fare, [1])
    ])

    # Transform the data using ColumnTransformer
    x_trf = trf.fit_transform(x_filled)

    print(np.mean(cross_val_score(DecisionTreeClassifier(), x_trf, y, cv=10, scoring='accuracy')))

# Example call to the function
discretize(10, 'quantile')


0.6825468164794009


