In [39]:
import numpy as np
import pandas as pd

import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer

from sklearn.metrics import r2_score

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier

In [7]:
df = pd.read_csv('Social_Network_Ads.csv')

In [8]:
df.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0
2,15668575,Female,26,43000,0
3,15603246,Female,27,57000,0
4,15804002,Male,19,76000,0


In [25]:
# train test split
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.15, random_state=42)


In [15]:
df.describe()

Unnamed: 0,User ID,Age,EstimatedSalary,Purchased
count,400.0,400.0,400.0,400.0
mean,15691540.0,37.655,69742.5,0.3575
std,71658.32,10.482877,34096.960282,0.479864
min,15566690.0,18.0,15000.0,0.0
25%,15626760.0,29.75,43000.0,0.0
50%,15694340.0,37.0,70000.0,0.0
75%,15750360.0,46.0,88000.0,1.0
max,15815240.0,60.0,150000.0,1.0


In [62]:
# binning of age and salary and transforming gender

trf = ColumnTransformer([
    ('age_encoded', KBinsDiscretizer(n_bins=10, encode='onehot', strategy='quantile'), ['Age']),
    ('salary_encoded', KBinsDiscretizer(n_bins=10, encode='onehot', strategy='quantile'), ['EstimatedSalary']),
    ('gender_encoded', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), ['Gender'])
], remainder='passthrough')

In [63]:
# transforming the cols

x_train_transformed = trf.fit_transform(x_train)
x_test_transformed = trf.transform(x_test)

In [64]:
clf = DecisionTreeClassifier()

clf.fit(x_train_transformed, y_train)
pred = clf.predict(x_test_transformed)
print(accuracy_score(y_test, pred))
print(np.mean(cross_val_score(estimator=clf, X=x_train_transformed, y=y_train, cv=10)))

0.8666666666666667
0.8382352941176471


### using reg model or scoring or both on classification problem - bad score  like below

In [50]:
clf = LinearRegression()

clf.fit(x_train_transformed, y_train)
pred = clf.predict(x_test_transformed)
print(r2_score(y_test, pred))
print(np.mean(cross_val_score(estimator=clf, X=x_train_transformed, y=y_train, cv=10)))

0.7456333240700953
0.5535184343735079


In [77]:
trf.named_transformers_['age_encoded'].n_bins_

array([10])

In [78]:
trf.named_transformers_['age_encoded'].bin_edges_

array([array([18., 25., 28., 32., 35., 37., 40., 42., 47., 52., 60.])],
      dtype=object)