In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,VotingRegressor,AdaBoostRegressor,GradientBoostingRegressor,StackingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import r2_score
import pickle as pkl

In [2]:
df = pd.read_csv("prepared_data.csv")

In [3]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,228,229,230,231,232,233,234,235,236,price
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,2.0,1250.0,40.0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,2.0,1200.0,83.0
2,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,2.0,1170.0,40.0
3,3,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,3.0,1425.0,65.0
4,4,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,2.0,947.0,43.0
5,5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,2.0,1130.0,36.0
6,6,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,3.0,1417.0,76.0
7,7,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,2.0,1230.0,58.0
8,8,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,3.0,3.0,1705.0,75.0
9,9,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,2.0,2.0,1116.0,47.0


In [4]:
x = df.drop(columns=['Unnamed: 0','price'])
y = df.price

In [5]:
x.shape

(6965, 237)

In [6]:
y.shape

(6965,)

In [7]:
# import seaborn as sns

In [8]:
# sns.scatterplot(data=df,x='area',y='price',hue='bed')

In [9]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [10]:
x_test.shape

(1393, 237)

In [11]:
ss = StandardScaler()

x_train_scaled = ss.fit_transform(x_train.values)
x_test_scaled = ss.transform(x_test.values)

In [12]:
x_scaled = ss.transform(x.values)

In [13]:
def adj_r2(r2):
    return 1-(1-r2)*(y_test.shape[0]-1)/(y_test.shape[0]-1-x_train.shape[1])

### LinearRegression

In [15]:
lr = LinearRegression()

# 1
# lr.fit(x_train_scaled,y_train)

# ypred_train = lr.predict(x_train_scaled)
# ypred_test = lr.predict(x_test_scaled)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
scores = cross_val_score(lr,x.values,y.values,cv=cv)

a = np.array(list(map(adj_r2,scores)))

print(f"R2 scores : {scores}")
print(f"Adjusted R2 scores : {a}")


R2 scores : [0.8229865  0.82083143 0.83474364 0.82530795 0.83380624]
Adjusted R2 scores : [0.78666425 0.78406697 0.80083389 0.78946205 0.79970414]


### Ridge

In [21]:
ridge = Ridge()

# 1
# ridge.fit(x_train_scaled,y_train)

# ypred_train = ridge.predict(x_train_scaled)
# ypred_test = ridge.predict(x_test_scaled)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(ridge,x_scaled,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

param = {
    'alpha' : [10,1,0.1,0.01],
    'solver' : ['lsqr','sag','svd','auto'],
    'max_iter' : [5000,8000]
}

model = GridSearchCV(ridge,param,cv=3,scoring='r2',n_jobs=-1,verbose=10)

model.fit(x_train_scaled,y_train)


Fitting 3 folds for each of 32 candidates, totalling 96 fits


In [22]:
print(model.best_score_)
print(model.best_params_)

0.8412956740504497
{'alpha': 10, 'max_iter': 5000, 'solver': 'svd'}


In [27]:
new_ridge = Ridge(alpha= 10, max_iter= 5000, solver= 'svd')

new_ridge.fit(x_train_scaled,y_train)

ypred_train = new_ridge.predict(x_train_scaled)
ypred_test = new_ridge.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.8395289626518142
r2_score for test data : 0.8095142995105532


### Lasso

In [34]:
lasso = Lasso()

# 1
# lasso.fit(x_train_scaled,y_train)

# ypred_train = lasso.predict(x_train_scaled)
# ypred_test = lasso.predict(x_test_scaled)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(lasso,x_scaled,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

# 3
param = {
    'alpha' : [10,1,0.1,0.01],
    'selection' : ['random','cyclic'],
    'max_iter' : [5000,8000]
}

model = GridSearchCV(lasso,param,cv=3,scoring='r2',n_jobs=-1,verbose=10)

model.fit(x_train_scaled,y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [35]:
print(model.best_score_)
print(model.best_params_)

0.8420092144218949
{'alpha': 0.1, 'max_iter': 5000, 'selection': 'random'}


In [41]:
new_lasso = Lasso(alpha= 0.1, max_iter= 5000,selection='random')

new_lasso.fit(x_train_scaled,y_train)

ypred_train = new_lasso.predict(x_train_scaled)
ypred_test = new_lasso.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.8379534315354071
r2_score for test data : 0.8079245514218018


### ElasticNet

In [39]:
en = ElasticNet()

# 1
# en.fit(x_train_scaled,y_train)

# ypred_train = en.predict(x_train_scaled)
# ypred_test = en.predict(x_test_scaled)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(en,x_scaled,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

param = {
    'alpha' : [10,1,0.1,0.01],
    'selection' : ['random','cyclic'],
    'l1_ratio' : [0.1,0.3,0.5,0.8],
    'max_iter' : [5000,8000]
}

model = GridSearchCV(en,param,cv=3,scoring='r2',n_jobs=-1,verbose=10)

model.fit(x_train_scaled,y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


In [40]:
print(model.best_score_)
print(model.best_params_)

0.8424404657884624
{'alpha': 0.1, 'l1_ratio': 0.8, 'max_iter': 8000, 'selection': 'random'}


In [42]:
new_en = ElasticNet(alpha= 0.1,l1_ratio=0.8, max_iter= 8000, selection= 'random')

new_en.fit(x_train_scaled,y_train)

ypred_train = new_en.predict(x_train_scaled)
ypred_test = new_en.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.8323294568914517
r2_score for test data : 0.801929030998412


### SGDRegressor

In [45]:
sgd = SGDRegressor(verbose=10)
# 1
# sgd.fit(x_train_scaled,y_train)

# ypred_train = sgd.predict(x_train_scaled)
# ypred_test = sgd.predict(x_test_scaled)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(sgd,x.values,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

# 3
param = {
    'loss' : ['huber','squared_epsilon_insensitive'],
    'max_iter' : [8000,10000],
    'alpha' : [0.0001,0.01,0.001],
    'penalty' : ['l2','elasticnet'],
    'learning_rate' : ['optimal','constant','invscaling'],
    'eta0' : [0.01,0.1,0.001]
}

model = GridSearchCV(sgd,param,cv=3,scoring='r2',n_jobs=-1,verbose=10)

model.fit(x_train_scaled,y_train)

Fitting 3 folds for each of 216 candidates, totalling 648 fits
-- Epoch 1
Norm: 55.70, NNZs: 236, Bias: 86.777366, T: 5572, Avg. loss: 4833.150056
Total training time: 0.01 seconds.
-- Epoch 2
Norm: 63.86, NNZs: 237, Bias: 99.162496, T: 11144, Avg. loss: 1391.661876
Total training time: 0.02 seconds.
-- Epoch 3
Norm: 66.67, NNZs: 237, Bias: 102.455951, T: 16716, Avg. loss: 1164.905405
Total training time: 0.03 seconds.
-- Epoch 4
Norm: 68.73, NNZs: 237, Bias: 103.508484, T: 22288, Avg. loss: 1114.452129
Total training time: 0.04 seconds.
-- Epoch 5
Norm: 70.58, NNZs: 236, Bias: 104.105290, T: 27860, Avg. loss: 1092.221967
Total training time: 0.04 seconds.
-- Epoch 6
Norm: 71.04, NNZs: 237, Bias: 104.128108, T: 33432, Avg. loss: 1083.989006
Total training time: 0.06 seconds.
-- Epoch 7
Norm: 71.12, NNZs: 236, Bias: 104.066425, T: 39004, Avg. loss: 1076.294173
Total training time: 0.07 seconds.
-- Epoch 8
Norm: 71.78, NNZs: 237, Bias: 104.147374, T: 44576, Avg. loss: 1075.739328
Total t

In [47]:
print(f"best score : {model.best_score_}")
print(f"best params : {model.best_params_}")

best score : 0.84162325328104
best params : {'alpha': 0.01, 'eta0': 0.001, 'learning_rate': 'invscaling', 'loss': 'squared_epsilon_insensitive', 'max_iter': 8000, 'penalty': 'elasticnet'}


In [48]:
new_sgd = SGDRegressor(alpha= 0.01,eta0=0.001,learning_rate='invscaling',loss='squared_epsilon_insensitive', max_iter= 8000, penalty= 'elasticnet')

new_sgd.fit(x_train_scaled,y_train)

ypred_train = new_sgd.predict(x_train_scaled)
ypred_test = new_sgd.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.8389004220308391
r2_score for test data : 0.8092156063469297


### KNeighborsRegressor

In [52]:
knn = KNeighborsRegressor()

# 1
# knn.fit(x_train_scaled,y_train)

# ypred_train = knn.predict(x_train_scaled)
# ypred_test = knn.predict(x_test_scaled)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(knn,x_scaled,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

param = {
    'n_neighbors' : [5,10,15,2],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree'],
    'weights' : ['uniform','distance']      
}

model = GridSearchCV(knn,param,cv=3,scoring='r2',n_jobs=-1,verbose=10)

model.fit(x_train_scaled,y_train)

Fitting 3 folds for each of 24 candidates, totalling 72 fits


In [53]:
print(f"best score : {model.best_score_}")
print(f"best params : {model.best_params_}")

best score : 0.7826828588374369
best params : {'algorithm': 'kd_tree', 'n_neighbors': 2, 'weights': 'distance'}


In [63]:
new_knn = KNeighborsRegressor(algorithm='kd_tree',n_neighbors=2,weights='distance',n_jobs=-1)

new_knn.fit(x_train_scaled,y_train)

ypred_train = new_knn.predict(x_train_scaled)
ypred_test = new_knn.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.9859083589338933
r2_score for test data : 0.8443713860507089


### DecisionTreeRegressor

In [23]:
dt = DecisionTreeRegressor()

# 1
# dt.fit(x_train.values,y_train)

# ypred_train = dt.predict(x_train.values)
# ypred_test = dt.predict(x_test.values)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(dt,x.values,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

param = {
    'criterion' : ['friedman_mse','squared_error','absolute_error'],
    'splitter' : ['best','random'],
    'max_depth':[10,15,20,30],
    'min_samples_split' : [5,10,15],
    'max_features' : [150,200,237],
}

model = GridSearchCV(dt,param,cv=3,scoring='r2',n_jobs=-1,verbose=10)

model.fit(x.values,y)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


In [None]:
print(f"best score : {model.best_score_}")
print(f"best params : {model.best_params_}")

In [11]:
new_dt = DecisionTreeRegressor(criterion='absolute_error',max_depth=20,max_features=237,min_samples_split=15,splitter='best')

new_dt.fit(x_train.values,y_train)

ypred_train = new_dt.predict(x_train.values)
ypred_test = new_dt.predict(x_test.values)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.9282087145507211
r2_score for test data : 0.7826261366627937


### Voting Regressor

In [19]:
vr = VotingRegressor(estimators=[
    ('sgd',SGDRegressor(alpha= 0.01,eta0=0.001,learning_rate='invscaling',loss='squared_epsilon_insensitive', max_iter= 8000, penalty= 'elasticnet')),
    ('knn',KNeighborsRegressor(algorithm='kd_tree',n_neighbors=2,weights='distance',n_jobs=-1)),
    ('dt',DecisionTreeRegressor(criterion='absolute_error',max_depth=20,max_features=237,min_samples_split=15,splitter='best'))
],n_jobs=-1,verbose=True)

# 1
vr.fit(x_train_scaled,y_train)

ypred_train = vr.predict(x_train_scaled)
ypred_test = vr.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# # 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(vr,x_scaled,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")


r2_score for train data : 0.9558437081117209
r2_score for test data : 0.8634832446479119


### AdaBoostRegressor

In [25]:
ab = AdaBoostRegressor()

# 1
# ab.fit(x_train.values,y_train)

# ypred_train = ab.predict(x_train.values)
# ypred_test = ab.predict(x_test.values)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(ab,x.values,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

param = {
    'estimator' : [DecisionTreeRegressor(max_depth=10),DecisionTreeRegressor(max_depth=8),DecisionTreeRegressor(max_depth=6)],
    'n_estimators' : [50,100,150,200],
    'loss' : ['linear','square','exponential'],
    'learning_rate' : [1,0.1,0.01]
}

model = GridSearchCV(ab,param,scoring='r2',cv=3,n_jobs=-1,verbose=10)

model.fit(x.values,y.values)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


In [26]:
print(f"best score : {model.best_score_}")
print(f"best params : {model.best_params_}")

best score : 0.6965254319730659
best params : {'estimator': DecisionTreeRegressor(max_depth=6), 'learning_rate': 0.1, 'loss': 'exponential', 'n_estimators': 150}


In [33]:
new_ab = AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=6),learning_rate=0.1,loss='exponential',n_estimators=150)

new_ab.fit(x_train.values,y_train)

ypred_train = new_ab.predict(x_train.values)
ypred_test = new_ab.predict(x_test.values)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.8110148510374596
r2_score for test data : 0.7532932568755784


### GradientBoostingRegressor

In [37]:
gb = GradientBoostingRegressor()

# 1
# gb.fit(x_train.values,y_train)

# ypred_train = gb.predict(x_train.values)
# ypred_test = gb.predict(x_test.values)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(gb,x.values,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

param = {
    'n_estimators' : [50,100,150,200],
    'loss' : ['squared_error','absolute_error','huber'],
    'learning_rate' : [1,0.1,0.01],
    'criterion' : ['friedman_mse','squared_error'],
    'max_depth' : [3,5,2],
    'max_features' : ['sqrt','log2']
}

model = GridSearchCV(gb,param,scoring='r2',cv=3,n_jobs=-1,verbose=10)

model.fit(x.values,y.values)

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


In [38]:
print(f"best score : {model.best_score_}")
print(f"best params : {model.best_params_}")

best score : 0.6656471606857979
best params : {'criterion': 'squared_error', 'learning_rate': 1, 'loss': 'huber', 'max_depth': 5, 'max_features': 'sqrt', 'n_estimators': 200}


In [44]:
new_gb = GradientBoostingRegressor(criterion='squared_error',loss='huber',learning_rate=1,max_depth=4,max_features='sqrt',n_estimators=160)

new_gb.fit(x_train_scaled,y_train)

ypred_train = new_gb.predict(x_train_scaled)
ypred_test = new_gb.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.9231149273825147
r2_score for test data : 0.8679075758796653


### StackingRegressor

In [14]:
sr = StackingRegressor(estimators=[
    ('sgd',SGDRegressor(alpha= 0.01,eta0=0.001,learning_rate='invscaling',loss='squared_epsilon_insensitive', max_iter= 8000, penalty= 'elasticnet')),
    ('knn',KNeighborsRegressor(algorithm='kd_tree',n_neighbors=2,weights='distance',n_jobs=-1)),
    ('dt',DecisionTreeRegressor(criterion='absolute_error',max_depth=20,max_features=237,min_samples_split=15,splitter='best'),
    ('ab',AdaBoostRegressor(estimator=DecisionTreeRegressor(max_depth=6),learning_rate=0.1,loss='exponential',n_estimators=150)))],
    final_estimator=GradientBoostingRegressor(),n_jobs=-1,verbose=True)

# 1
sr.fit(x_train_scaled,y_train)

ypred_train = sr.predict(x_train_scaled)
ypred_test = sr.predict(x_test_scaled)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(sr,x_scaled,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

r2_score for train data : 0.9382615649941005
r2_score for test data : 0.8954708862225214


### RandomForestRegressor

In [66]:
rf = RandomForestRegressor(n_jobs=-1)

# 1
# rf.fit(x_train.values,y_train)

# ypred_train = rf.predict(x_train.values)
# ypred_test = rf.predict(x_test.values)

# print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
# print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

# 2
# cv = ShuffleSplit(n_splits=5,test_size=0.2,random_state=10)
# scores = cross_val_score(rf,x.values,y.values,cv=cv)

# a = np.array(list(map(adj_r2,scores)))

# print(f"R2 scores : {scores}")
# print(f"Adjusted R2 scores : {a}")

param = {
    'n_estimators' : [100,150,200,250,300],
    'criterion' : ['squared_error','friedman_mse','poisson'],
    'max_features' : ['sqrt','log2'],
    'max_depth' : [10,15,20,25]
}

model = GridSearchCV(rf,param,scoring='r2',cv=3,n_jobs=-1,verbose=10)

model.fit(x.values,y.values)

Fitting 3 folds for each of 120 candidates, totalling 360 fits


In [67]:
print(f"best score : {model.best_score_}")
print(f"best params : {model.best_params_}")

best score : 0.6434984482339166
best params : {'criterion': 'squared_error', 'max_depth': 25, 'max_features': 'sqrt', 'n_estimators': 100}


In [73]:
new_rf = RandomForestRegressor(criterion='squared_error',max_depth=25,max_features='sqrt',n_estimators=100)

new_rf.fit(x_train.values,y_train)

ypred_train = new_rf.predict(x_train.values)
ypred_test = new_rf.predict(x_test.values)

print(f"r2_score for train data : {r2_score(ypred_train,y_train)}")
print(f"r2_score for test data : {r2_score(ypred_test,y_test)}")

r2_score for train data : 0.8756713371653065
r2_score for test data : 0.7544009889404104


In [15]:
from sklearn.pipeline import make_pipeline
from sklearn import set_config
set_config(display='diagram')

In [16]:
trf1 = pkl.load(open('ohe.pkl','rb'))

In [17]:
trf2 = ss

In [18]:
trf3 = sr

In [19]:
pipe = make_pipeline(trf1,trf2,trf3)

In [20]:
pipe.named_steps

{'columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe',
                                  OneHotEncoder(dtype=<class 'numpy.int32'>,
                                                sparse_output=False),
                                  [0])]),
 'standardscaler': StandardScaler(),
 'stackingregressor': StackingRegressor(estimators=[('sgd',
                                SGDRegressor(alpha=0.01, eta0=0.001,
                                             loss='squared_epsilon_insensitive',
                                             max_iter=8000,
                                             penalty='elasticnet')),
                               ('knn',
                                KNeighborsRegressor(algorithm='kd_tree',
                                                    n_jobs=-1, n_neighbors=2,
                                                    weights='distance')),
                               ('dt',
                           

In [21]:
pkl.dump(pipe,open('model.pkl','wb'))