In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
## To see all columns of the dataset

pd.set_option("display.max_columns", None)

In [3]:
Table = pd.DataFrame(columns=['Model_Name','Adjusted_R2_Train','Adjusted_R2_Test','Parameters'])
def insertData(Table, model_name, train_R2, test_R2, parameter='default'):
    temp = pd.DataFrame()
    temp['Model_Name'] = [model_name]
    temp['Adjusted_R2_Train'] = train_R2
    temp['Adjusted_R2_Test'] = test_R2
    temp['Parameters'] = parameter
    Table = pd.concat([Table,temp],ignore_index=True)
    
    return Table

In [18]:
train_df = pd.read_csv('original_data_set/preprocessed_data.csv')
train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Outlet_Age,Item_Fat_Content_regular,Item_Type_breads,Item_Type_breakfast,Item_Type_canned,Item_Type_dairy,Item_Type_frozen foods,Item_Type_fruits and vegetables,Item_Type_hard drinks,Item_Type_health and hygiene,Item_Type_household,Item_Type_meat,Item_Type_others,Item_Type_seafood,Item_Type_snack foods,Item_Type_soft drinks,Item_Type_starchy foods,Outlet_Type_supermarket type1,Outlet_Type_supermarket type2,Outlet_Type_supermarket type3
0,9.3,249.8092,1,2,11.794838,23,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,5.92,48.2692,1,0,6.223654,13,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,17.5,141.618,1,2,9.919649,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,19.2,182.095,0,0,7.234707,24,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.93,53.8614,2,0,7.930641,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Model Building

<ol>
    <li>Linear Regression</li>
    <li>Ridge Regression</li>
    <li>Lasso Regression</li>
    <li>Decision Tree Regressor</li>
    <li>Random Forest Regressor</li>
    <li>KNN Regressor</li>
    <li>Bagging</li>
    <li>Boosting</li>
    <li>Stacking</li>
</ol>

In [4]:
# Let's create a function to create adjusted R-Squared
def adj_R2(x,y,regression):
    r2 = regression.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [5]:
# Standarization only the numerical freature using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()

df_standard = minmax.fit_transform(train_df[['Item_Weight','Item_MRP','Outlet_Age']])

In [6]:
df_standard

array([[0.28252456, 0.92750715, 0.41666667],
       [0.08127419, 0.0720684 , 0.        ],
       [0.77076511, 0.46828841, 0.41666667],
       ...,
       [0.35992855, 0.22849221, 0.20833333],
       [0.15808276, 0.30493925, 0.        ],
       [0.61000298, 0.18750976, 0.5       ]])

In [7]:
train_df['Item_Weight'] = df_standard[:,0]
train_df['Item_MRP'] = df_standard[:,1]
train_df['Outlet_Age'] = df_standard[:,2]

train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Outlet_Age,Item_Fat_Content_regular,Item_Type_breads,Item_Type_breakfast,Item_Type_canned,Item_Type_dairy,Item_Type_frozen foods,Item_Type_fruits and vegetables,Item_Type_hard drinks,Item_Type_health and hygiene,Item_Type_household,Item_Type_meat,Item_Type_others,Item_Type_seafood,Item_Type_snack foods,Item_Type_soft drinks,Item_Type_starchy foods,Outlet_Type_supermarket type1,Outlet_Type_supermarket type2,Outlet_Type_supermarket type3
0,0.282525,0.927507,1,2,11.794838,0.416667,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.081274,0.072068,1,0,6.223654,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.770765,0.468288,1,2,9.919649,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.871986,0.640093,0,0,7.234707,0.458333,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.260494,0.095805,2,0,7.930641,0.916667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## X, y Split

In [8]:
X = train_df.drop(['Item_Outlet_Sales'],axis = 1)
Y = train_df['Item_Outlet_Sales']

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state = 42)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(6818, 24) (1705, 24) (6818,) (1705,)


In [10]:
X_train.head()

Unnamed: 0,Item_Weight,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Age,Item_Fat_Content_regular,Item_Type_breads,Item_Type_breakfast,Item_Type_canned,Item_Type_dairy,Item_Type_frozen foods,Item_Type_fruits and vegetables,Item_Type_hard drinks,Item_Type_health and hygiene,Item_Type_household,Item_Type_meat,Item_Type_others,Item_Type_seafood,Item_Type_snack foods,Item_Type_soft drinks,Item_Type_starchy foods,Outlet_Type_supermarket type1,Outlet_Type_supermarket type2,Outlet_Type_supermarket type3
549,0.294433,0.594464,1,2,0.416667,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7757,0.800536,0.591057,0,1,0.291667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
764,0.776719,0.341387,0,2,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6867,0.224472,0.043819,0,1,0.291667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2716,0.493897,0.527478,0,2,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X_train,Y_train)

LinearRegression()

In [12]:
print("R2 for train data: ",linear_reg.score(X_train,Y_train))
print("Adjusted R2 for train data: ",adj_R2(X_train,Y_train,linear_reg),'\n')

print("R2 for test data: ",linear_reg.score(X_test,Y_test))
print("Adjusted R2 for test data: ",adj_R2(X_test,Y_test,linear_reg))

R2 for train data:  0.6852926233835146
Adjusted R2 for train data:  0.6841807468872985 

R2 for test data:  0.7010591657640124
Adjusted R2 for test data:  0.696788582417784


In [14]:
#from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

model = LinearRegression()
scores_lr = cross_val_score(model, X=X_train , y=Y_train, cv=10)

print(scores_lr)
print(scores_lr.mean())

[0.67245647 0.70427537 0.68509999 0.66443675 0.69246958 0.66214653
 0.66762952 0.71363006 0.67419217 0.68085858]
0.6817195004951055


In [15]:
Table = insertData(Table, 'Linear Regression', adj_R2(X_train,Y_train,linear_reg), adj_R2(X_test,Y_test,linear_reg))
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default


## Lasso Regression

In [16]:
# Lasso Regularization
from sklearn.linear_model import LassoCV

lasscv = LassoCV(alphas=None,cv=10, max_iter=100000)
lasscv.fit(X_train,Y_train)

LassoCV(cv=10, max_iter=100000)

In [17]:
alpha = lasscv.alpha_
print(alpha)

0.002421773986147522


In [18]:
#now that we have best parameter, let's use Lasso regression and see how well our data has fitted before
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha)
lasso_reg.fit(X_train,Y_train)

Lasso(alpha=0.002421773986147522)

In [19]:
print("r2 for train data: ",lasso_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lasso_reg),'\n')

print("r2 for test data: ",lasso_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lasso_reg))

r2 for train data:  0.6850161188813926
adjusted_r2 for train data:  0.6839032654812974 

r2 for test data:  0.7010664233582903
adjusted_r2 for test data:  0.6967959436919802


In [20]:
Table = insertData(Table, 'Lasso Regression', adj_R2(X_train,Y_train,lasso_reg), adj_R2(X_test,Y_test,lasso_reg), [{'alpha=0.002421773986147522'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}


## Ridge Regression

In [29]:
# Ridge Regularization
from sklearn.linear_model import RidgeCV

# alphas = [0.01,0.1,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]
alphas = [0.01, 0.1, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3, 3.1, 3.2]
ridgecv = RidgeCV(alphas, cv =10)
ridgecv.fit(X_train,Y_train)

RidgeCV(alphas=array([0.01, 0.1 , 1.  , 1.1 , 1.2 , 1.3 , 1.4 , 1.5 , 1.6 , 1.7 , 1.8 ,
       1.9 , 2.  , 2.1 , 2.2 , 2.3 , 2.4 , 2.5 , 2.6 , 2.7 , 2.8 , 2.9 ,
       3.  , 3.1 , 3.2 ]),
        cv=10)

In [30]:
alpha = ridgecv.alpha_
print(alpha)

1.0


In [31]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=alpha)
ridge_reg.fit(X_train,Y_train)

Ridge()

In [32]:
print("r2 for train data: ",ridge_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,ridge_reg),'\n')

print("r2 for test data: ",ridge_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,ridge_reg))

r2 for train data:  0.6852792412915403
adjusted_r2 for train data:  0.6841673175157412 

r2 for test data:  0.7010054461838371
adjusted_r2 for test data:  0.6967340954150347


In [33]:
Table = insertData(Table, 'Ridge Regression', adj_R2(X_train,Y_train,ridge_reg), adj_R2(X_test,Y_test,ridge_reg), [{'alpha = 1.0'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}


## Decision Tree Regressor

In [34]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10]}

In [45]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

decision_tree_reg = GridSearchCV(DecisionTreeRegressor(), param_grid = HyperParameters, cv = 5, n_jobs = -1, verbose = 3)

decision_tree_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 810 candidates, totalling 4050 fits


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [8, 10, 12, 14, 16, 18],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18,
                                               20, 22, 24, 26, 28, 30]},
             verbose=3)

In [46]:
print(decision_tree_reg.best_score_)
print(decision_tree_reg.best_params_)

0.6792281719446445
{'max_depth': 8, 'min_samples_leaf': 9, 'min_samples_split': 30}


In [47]:
dt_reg = DecisionTreeRegressor(criterion='mse', max_depth=8, min_samples_leaf=9, min_samples_split=30)
dt_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, min_samples_leaf=9, min_samples_split=30)

In [48]:
print("r2 for train data: ",dt_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_reg),'\n')

print("r2 for test data: ",dt_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_reg))

r2 for train data:  0.7262002988237294
adjusted_r2 for train data:  0.7252329511381368 

r2 for test data:  0.6948204327124701
adjusted_r2 for test data:  0.6904607246083625


In [49]:
## Cross Validation
scores_lm = cross_val_score(dt_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.66790327 0.70605272 0.68074118 0.65766022 0.68741728 0.64990071
 0.67345449 0.71035618 0.67756406 0.67801629]
0.6789066414015178


In [50]:
Table = insertData(Table, 'Decision Tree Regressor', adj_R2(X_train,Y_train,dt_reg), adj_R2(X_test,Y_test,dt_reg), [{'max_depth=8, min_samples_leaf=9, min_samples_split=30'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."


## Random Forest Regressor

In [51]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
                   'max_features':["auto", "sqrt", "log2"]}

In [52]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

random_forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

random_forest_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                             8, 9, 10],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12, 14, 16, 18,
                                                              20, 22, 24, 26,
                                                              28, 30]},
                   random_state=42, verbose=3)

In [53]:
print(random_forest_reg.best_score_)
print(random_forest_reg.best_params_)

0.6950450172375143
{'min_samples_split': 30, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 8}


In [54]:
rf_reg = DecisionTreeRegressor(min_samples_split=30, min_samples_leaf=10, max_features='auto', max_depth=8)
rf_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, max_features='auto', min_samples_leaf=10,
                      min_samples_split=30)

In [55]:
print("r2 for train data: ",rf_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,rf_reg),'\n')

print("r2 for test data: ",rf_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,rf_reg))

r2 for train data:  0.7258037308715828
adjusted_r2 for train data:  0.7248349820920919 

r2 for test data:  0.6954559941247114
adjusted_r2 for test data:  0.6911053654693501


In [56]:
## Cross Validation
scores_lm = cross_val_score(rf_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.66930499 0.70679349 0.67983731 0.65946438 0.69095402 0.65313889
 0.67119077 0.71129357 0.67712072 0.68129802]
0.6800396159989323


In [57]:
Table = insertData(Table,'Random Forest Regressor',adj_R2(X_train,Y_train,rf_reg),adj_R2(X_test,Y_test,rf_reg),[{'min_samples_split=30, min_samples_leaf=10, max_features=auto, max_depth=8'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."


## KNN Regressor

In [58]:
HyperParameters = {'weights':['uniform','distance'],
                   'algorithm':['auto','ball_tree','kd_tree','brute'],
                   'leaf_size':[20,25,30,35,40,45,50]}

# HyperParameters = {'weights':['uniform','distance'],
#                    'algorithm':['ball_tree','kd_tree'],
#                    'leaf_size':[20,25,30,35,40,45,50]}

In [59]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

knn_reg = RandomizedSearchCV(KNeighborsRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

knn_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=100, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'leaf_size': [20, 25, 30, 35, 40, 45,
                                                      50],
                                        'weights': ['uniform', 'distance']},
                   random_state=42, verbose=3)

In [60]:
print(knn_reg.best_score_)
print(knn_reg.best_params_)

0.5924506699152486
{'weights': 'uniform', 'leaf_size': 20, 'algorithm': 'auto'}


In [61]:
knn_reg = KNeighborsRegressor(weights='uniform', algorithm='auto',leaf_size=20)
knn_reg.fit(X_train,Y_train)

KNeighborsRegressor(leaf_size=20)

In [62]:
print("r2 for train data: ",knn_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_reg),'\n')

print("r2 for test data: ",knn_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_reg))

r2 for train data:  0.7380307880480925
adjusted_r2 for train data:  0.7371052380573895 

r2 for test data:  0.6250283972503565
adjusted_r2 for test data:  0.6196716600682187


In [63]:
Table = insertData(Table,'KNN Regressor',adj_R2(X_train,Y_train,knn_reg),adj_R2(X_test,Y_test,knn_reg),[{'weights=uniform, algorithm=auto, leaf_size=20'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"


## Bagging Regressor

In [64]:
# Linear Regression
from sklearn.ensemble import BaggingRegressor

lr_bagging = BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
lr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.7s finished


BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, n_jobs=-1,
                 verbose=2)

In [65]:
print("r2 for train data: ",lr_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lr_bagging),'\n')

print("r2 for test data: ",lr_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lr_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.6852816539269742
adjusted_r2 for train data:  0.6841697386751336 

r2 for test data:  0.7010578578988098
adjusted_r2 for test data:  0.6967872558687929


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


In [66]:
Table = insertData(Table,'Bag Linear Regression',adj_R2(X_train,Y_train,lr_bagging),adj_R2(X_test,Y_test,lr_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default


In [67]:
# Decision Tree Regressor
from sklearn.ensemble import BaggingRegressor

dt_bagging = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=100, bootstrap=True,verbose=2, n_jobs=-1)
dt_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.4s finished


BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=100,
                 n_jobs=-1, verbose=2)

In [68]:
print("r2 for train data: ",dt_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_bagging),'\n')

print("r2 for test data: ",dt_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.9514019790763765


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


adjusted_r2 for train data:  0.9512302799004355 



[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for test data:  0.6697299195744597
adjusted_r2 for test data:  0.6650117755683805


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


In [69]:
Table = insertData(Table,'Bag Decision Tree Regressor',adj_R2(X_train,Y_train,dt_bagging),adj_R2(X_test,Y_test,dt_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default
7,Bag Decision Tree Regressor,0.95123,0.665012,default


In [116]:
from sklearn.svm import SVR

In [117]:
# Decision Tree Regressor
from sklearn.ensemble import BaggingRegressor

svr_bagging = BaggingRegressor(base_estimator=SVR(), n_estimators=100, bootstrap=True,verbose=2)
svr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building estimator 1 of 100 for this parallel run (total 100)...
Building estimator 2 of 100 for this parallel run (total 100)...
Building estimator 3 of 100 for this parallel run (total 100)...
Building estimator 4 of 100 for this parallel run (total 100)...
Building estimator 5 of 100 for this parallel run (total 100)...
Building estimator 6 of 100 for this parallel run (total 100)...
Building estimator 7 of 100 for this parallel run (total 100)...
Building estimator 8 of 100 for this parallel run (total 100)...
Building estimator 9 of 100 for this parallel run (total 100)...
Building estimator 10 of 100 for this parallel run (total 100)...
Building estimator 11 of 100 for this parallel run (total 100)...
Building estimator 12 of 100 for this parallel run (total 100)...
Building estimator 13 of 100 for this parallel run (total 100)...
Building estimator 14 of 100 for this parallel run (total 100)...
Building estimator 15 of 100 for this parallel run (total 100)...
Building estimator 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min finished


BaggingRegressor(base_estimator=SVR(), n_estimators=100, verbose=2)

In [118]:
print("r2 for train data: ",svr_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,svr_bagging),'\n')

print("r2 for test data: ",svr_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,svr_bagging))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for train data:  -0.007774727114581603


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


adjusted_r2 for train data:  -0.01118638721520493 



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for test data:  0.01819860776469384
adjusted_r2 for test data:  0.0047652752118014785


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished


In [70]:
# KNN Regressor
from sklearn.ensemble import BaggingRegressor

knn_bagging = BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
knn_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.6s finished


BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100,
                 n_jobs=-1, verbose=2)

In [71]:
print("r2 for train data: ",knn_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_bagging),'\n')

print("r2 for test data: ",knn_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   42.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.7524607153772473


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   45.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


adjusted_r2 for train data:  0.7515861470229199 



[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for test data:  0.6411040722782019
adjusted_r2 for test data:  0.6359769875964619


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.5s finished


In [72]:
Table = insertData(Table,'Bag KNN Regressor',adj_R2(X_train,Y_train,knn_bagging),adj_R2(X_test,Y_test,knn_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   46.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.9s finished


Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default
7,Bag Decision Tree Regressor,0.95123,0.665012,default
8,Bag KNN Regressor,0.751586,0.635977,default


## Gradient Boosting

In [73]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(random_state=42)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(random_state=42)

In [74]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.7203810320369426
adjusted_r2 for train data:  0.719393124598239 

r2 for test data:  0.7123295783486463
adjusted_r2 for test data:  0.7082200008964841


In [75]:
Table = insertData(Table,'Gradient Boosting',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg))
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default
7,Bag Decision Tree Regressor,0.95123,0.665012,default
8,Bag KNN Regressor,0.751586,0.635977,default
9,Gradient Boosting,0.719393,0.70822,default


In [76]:
HyperParameters = {'learning_rate':[0.01,0.02,0.03,0.4,0.05,0.06,0.1],
                   'n_estimators':[50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
                   'max_leaf_nodes':[2, 5, 10, 20, 50, 100],
                   'max_depth':[8,10,12,14,16,18]
                  }

# HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
#                    'max_depth':[8,10,12,14,16,18],
#                    }

# param_distributions = {
#     "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
#     "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
#     "learning_rate": loguniform(0.01, 1),
# }

In [77]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

gradient_boost_reg = RandomizedSearchCV(GradientBoostingRegressor(random_state=0), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

gradient_boost_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.02, 0.03, 0.4,
                                                          0.05, 0.06, 0.1],
                                        'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_leaf_nodes': [2, 5, 10, 20, 50,
                                                           100],
                                        'n_estimators': [50, 100, 150, 200, 250,
                                                         300, 350, 400, 450,
                                                         500]},
                   random_state=42, verbose=3)

In [78]:
print(gradient_boost_reg.best_score_)
print(gradient_boost_reg.best_params_)

0.698677355169951
{'n_estimators': 100, 'max_leaf_nodes': 10, 'max_depth': 18, 'learning_rate': 0.05}


In [79]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(n_estimators=100, max_leaf_nodes=10, max_depth=18, learning_rate=0.05)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(learning_rate=0.05, max_depth=18, max_leaf_nodes=10)

In [80]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.7147174607412863
adjusted_r2 for train data:  0.7137095436292284 

r2 for test data:  0.714764189579383
adjusted_r2 for test data:  0.7106893922876599


In [81]:
Table = insertData(Table,'Gradient Boosting_HyperParameters',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg),[{'n_estimators=50, max_leaf_nodes=20, max_depth=8, learning_rate=0.1'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default
7,Bag Decision Tree Regressor,0.95123,0.665012,default
8,Bag KNN Regressor,0.751586,0.635977,default
9,Gradient Boosting,0.719393,0.70822,default


## XGBRegressor

In [82]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 10, seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=10, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [83]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.7225885908333601
adjusted_r2 for train data:  0.7216084828074512 

r2 for test data:  0.7021995487698147
adjusted_r2 for test data:  0.6979452566093836


In [84]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg))
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default
7,Bag Decision Tree Regressor,0.95123,0.665012,default
8,Bag KNN Regressor,0.751586,0.635977,default
9,Gradient Boosting,0.719393,0.70822,default


In [85]:
HyperParameters = {'n_estimators': [15,17,18,19,20,22,25,26,27,28,29,30],
                   'booster': ['gbtree'],
                   'max_depth': [3,4,5,6],
                   'min_child_weight': [74,75,78,79,80,81]
                  }

In [86]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

XGB_reg = RandomizedSearchCV(XGBRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

XGB_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          predictor=None, random_state=None,
                                          reg_alpha=None, reg_lambda=None,
                                       

In [87]:
print(XGB_reg.best_score_)
print(XGB_reg.best_params_)

0.6969120849803735
{'n_estimators': 15, 'min_child_weight': 74, 'max_depth': 5, 'booster': 'gbtree'}


In [88]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 15, min_child_weight=74, max_depth=5, booster='gbtree',seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=5, min_child_weight=74, missing=nan,
             monotone_constraints='()', n_estimators=15, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [89]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.7174652810768977
adjusted_r2 for train data:  0.7164670721479776 

r2 for test data:  0.7133100151084982
adjusted_r2 for test data:  0.7092144438957625


In [90]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg),[{'objective=reg:linear,n_estimators = 15, min_child_weight=74, max_depth=5, booster=gbtree, seed = 42'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default
7,Bag Decision Tree Regressor,0.95123,0.665012,default
8,Bag KNN Regressor,0.751586,0.635977,default
9,Gradient Boosting,0.719393,0.70822,default


## Cat Boosting

In [91]:
from catboost import CatBoostRegressor

cat_reg = CatBoostRegressor()
cat_reg.fit(X_train,Y_train)

Learning rate set to 0.055449
0:	learn: 2.4380700	total: 191ms	remaining: 3m 10s
1:	learn: 2.3579602	total: 193ms	remaining: 1m 36s
2:	learn: 2.2851289	total: 195ms	remaining: 1m 4s
3:	learn: 2.2127791	total: 197ms	remaining: 49.1s
4:	learn: 2.1425734	total: 199ms	remaining: 39.7s
5:	learn: 2.0775284	total: 202ms	remaining: 33.4s
6:	learn: 2.0196424	total: 204ms	remaining: 28.9s
7:	learn: 1.9672450	total: 206ms	remaining: 25.6s
8:	learn: 1.9188899	total: 209ms	remaining: 23s
9:	learn: 1.8726386	total: 211ms	remaining: 20.9s
10:	learn: 1.8293327	total: 213ms	remaining: 19.2s
11:	learn: 1.7944759	total: 215ms	remaining: 17.7s
12:	learn: 1.7603179	total: 218ms	remaining: 16.5s
13:	learn: 1.7254425	total: 220ms	remaining: 15.5s
14:	learn: 1.6932551	total: 223ms	remaining: 14.6s
15:	learn: 1.6656431	total: 225ms	remaining: 13.8s
16:	learn: 1.6391298	total: 227ms	remaining: 13.1s
17:	learn: 1.6149271	total: 229ms	remaining: 12.5s
18:	learn: 1.6002447	total: 231ms	remaining: 11.9s
19:	learn: 

227:	learn: 1.3204227	total: 724ms	remaining: 2.45s
228:	learn: 1.3201757	total: 726ms	remaining: 2.44s
229:	learn: 1.3198514	total: 729ms	remaining: 2.44s
230:	learn: 1.3196096	total: 732ms	remaining: 2.44s
231:	learn: 1.3193705	total: 734ms	remaining: 2.43s
232:	learn: 1.3189176	total: 737ms	remaining: 2.43s
233:	learn: 1.3186327	total: 740ms	remaining: 2.42s
234:	learn: 1.3182162	total: 743ms	remaining: 2.42s
235:	learn: 1.3178507	total: 746ms	remaining: 2.41s
236:	learn: 1.3177079	total: 749ms	remaining: 2.41s
237:	learn: 1.3175083	total: 751ms	remaining: 2.4s
238:	learn: 1.3172717	total: 753ms	remaining: 2.4s
239:	learn: 1.3169166	total: 756ms	remaining: 2.39s
240:	learn: 1.3166142	total: 758ms	remaining: 2.39s
241:	learn: 1.3161700	total: 760ms	remaining: 2.38s
242:	learn: 1.3158435	total: 763ms	remaining: 2.38s
243:	learn: 1.3154937	total: 765ms	remaining: 2.37s
244:	learn: 1.3150947	total: 767ms	remaining: 2.36s
245:	learn: 1.3148521	total: 770ms	remaining: 2.36s
246:	learn: 1.

440:	learn: 1.2657490	total: 1.24s	remaining: 1.57s
441:	learn: 1.2655195	total: 1.25s	remaining: 1.57s
442:	learn: 1.2652388	total: 1.25s	remaining: 1.57s
443:	learn: 1.2649967	total: 1.25s	remaining: 1.57s
444:	learn: 1.2649060	total: 1.25s	remaining: 1.56s
445:	learn: 1.2647276	total: 1.25s	remaining: 1.56s
446:	learn: 1.2644800	total: 1.26s	remaining: 1.56s
447:	learn: 1.2642254	total: 1.26s	remaining: 1.55s
448:	learn: 1.2639846	total: 1.26s	remaining: 1.55s
449:	learn: 1.2638539	total: 1.27s	remaining: 1.55s
450:	learn: 1.2636421	total: 1.27s	remaining: 1.54s
451:	learn: 1.2633617	total: 1.27s	remaining: 1.54s
452:	learn: 1.2631091	total: 1.27s	remaining: 1.54s
453:	learn: 1.2628304	total: 1.28s	remaining: 1.54s
454:	learn: 1.2626953	total: 1.28s	remaining: 1.53s
455:	learn: 1.2625617	total: 1.28s	remaining: 1.53s
456:	learn: 1.2622858	total: 1.29s	remaining: 1.53s
457:	learn: 1.2621296	total: 1.29s	remaining: 1.52s
458:	learn: 1.2618836	total: 1.29s	remaining: 1.52s
459:	learn: 

636:	learn: 1.2256070	total: 1.76s	remaining: 1s
637:	learn: 1.2252405	total: 1.76s	remaining: 1s
638:	learn: 1.2250752	total: 1.77s	remaining: 999ms
639:	learn: 1.2249254	total: 1.77s	remaining: 996ms
640:	learn: 1.2247993	total: 1.77s	remaining: 993ms
641:	learn: 1.2246385	total: 1.78s	remaining: 990ms
642:	learn: 1.2244626	total: 1.78s	remaining: 988ms
643:	learn: 1.2242844	total: 1.78s	remaining: 985ms
644:	learn: 1.2241307	total: 1.78s	remaining: 982ms
645:	learn: 1.2238884	total: 1.79s	remaining: 979ms
646:	learn: 1.2237603	total: 1.79s	remaining: 978ms
647:	learn: 1.2235583	total: 1.79s	remaining: 975ms
648:	learn: 1.2234808	total: 1.8s	remaining: 972ms
649:	learn: 1.2232902	total: 1.8s	remaining: 970ms
650:	learn: 1.2230712	total: 1.8s	remaining: 967ms
651:	learn: 1.2228052	total: 1.81s	remaining: 965ms
652:	learn: 1.2226112	total: 1.81s	remaining: 962ms
653:	learn: 1.2223359	total: 1.81s	remaining: 959ms
654:	learn: 1.2222041	total: 1.81s	remaining: 956ms
655:	learn: 1.2220534

847:	learn: 1.1889909	total: 2.29s	remaining: 410ms
848:	learn: 1.1889206	total: 2.29s	remaining: 408ms
849:	learn: 1.1888000	total: 2.29s	remaining: 405ms
850:	learn: 1.1885054	total: 2.3s	remaining: 402ms
851:	learn: 1.1883030	total: 2.3s	remaining: 400ms
852:	learn: 1.1881707	total: 2.3s	remaining: 397ms
853:	learn: 1.1880037	total: 2.31s	remaining: 394ms
854:	learn: 1.1878333	total: 2.31s	remaining: 392ms
855:	learn: 1.1877238	total: 2.31s	remaining: 389ms
856:	learn: 1.1875759	total: 2.31s	remaining: 386ms
857:	learn: 1.1874784	total: 2.33s	remaining: 385ms
858:	learn: 1.1873169	total: 2.34s	remaining: 384ms
859:	learn: 1.1870713	total: 2.35s	remaining: 383ms
860:	learn: 1.1869829	total: 2.36s	remaining: 381ms
861:	learn: 1.1867791	total: 2.36s	remaining: 378ms
862:	learn: 1.1866640	total: 2.37s	remaining: 376ms
863:	learn: 1.1865976	total: 2.37s	remaining: 373ms
864:	learn: 1.1864304	total: 2.38s	remaining: 371ms
865:	learn: 1.1862918	total: 2.38s	remaining: 368ms
866:	learn: 1.1

<catboost.core.CatBoostRegressor at 0x22f8005edf0>

In [92]:
print("r2 for train data: ",cat_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,cat_reg),'\n')

print("r2 for test data: ",cat_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,cat_reg))

r2 for train data:  0.7884397676854085
adjusted_r2 for train data:  0.7876923150760238 

r2 for test data:  0.7010303909613225
adjusted_r2 for test data:  0.6967593965464842


In [93]:
HyperParameters = {'depth':[2,3,4,5,6,7],
                   'iterations':[145,150,155,156,157],
                   'learning_rate':[0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6]}

In [94]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

cat_reg = RandomizedSearchCV(CatBoostRegressor(), param_distributions=HyperParameters, n_iter=100, cv=5, n_jobs=-1, verbose=2, 
                            return_train_score=False)

cat_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
0:	learn: 2.4499716	total: 25.4ms	remaining: 3.65s
1:	learn: 2.3737849	total: 34.1ms	remaining: 2.44s
2:	learn: 2.3076276	total: 41.7ms	remaining: 1.97s
3:	learn: 2.2389394	total: 49.1ms	remaining: 1.73s
4:	learn: 2.1757221	total: 56.5ms	remaining: 1.58s
5:	learn: 2.1177500	total: 63.9ms	remaining: 1.48s
6:	learn: 2.0629468	total: 68.8ms	remaining: 1.36s
7:	learn: 2.0125758	total: 72.5ms	remaining: 1.24s
8:	learn: 1.9663182	total: 76.1ms	remaining: 1.15s
9:	learn: 1.9210421	total: 79ms	remaining: 1.07s
10:	learn: 1.8801132	total: 81.9ms	remaining: 998ms
11:	learn: 1.8428281	total: 84.4ms	remaining: 935ms
12:	learn: 1.8065207	total: 86.9ms	remaining: 882ms
13:	learn: 1.7729572	total: 89.2ms	remaining: 835ms
14:	learn: 1.7425787	total: 91.3ms	remaining: 791ms
15:	learn: 1.7170531	total: 93.3ms	remaining: 752ms
16:	learn: 1.6898885	total: 95.4ms	remaining: 718ms
17:	learn: 1.6661080	total: 97.2ms	remaining: 686ms
18:	learn: 1.

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostRegressor object at 0x0000022F80102250>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'depth': [2, 3, 4, 5, 6, 7],
                                        'iterations': [145, 150, 155, 156, 157],
                                        'learning_rate': [0.01, 0.05, 0.1, 0.2,
                                                          0.3, 0.4, 0.5, 0.6]},
                   verbose=2)

In [95]:
print(cat_reg.best_score_)
print(cat_reg.best_params_)

0.7009537178642946
{'learning_rate': 0.05, 'iterations': 145, 'depth': 5}


In [13]:
from catboost import CatBoostRegressor

cat_reg = CatBoostRegressor(learning_rate=0.05, iterations=145, depth=5)
cat_reg.fit(X_train,Y_train)

0:	learn: 2.4499716	total: 145ms	remaining: 20.8s
1:	learn: 2.3737849	total: 155ms	remaining: 11.1s
2:	learn: 2.3076276	total: 164ms	remaining: 7.74s
3:	learn: 2.2389394	total: 173ms	remaining: 6.11s
4:	learn: 2.1757221	total: 182ms	remaining: 5.1s
5:	learn: 2.1177500	total: 191ms	remaining: 4.42s
6:	learn: 2.0629468	total: 199ms	remaining: 3.92s
7:	learn: 2.0125758	total: 206ms	remaining: 3.53s
8:	learn: 1.9663182	total: 214ms	remaining: 3.23s
9:	learn: 1.9210421	total: 222ms	remaining: 3s
10:	learn: 1.8801132	total: 230ms	remaining: 2.8s
11:	learn: 1.8428281	total: 238ms	remaining: 2.63s
12:	learn: 1.8065207	total: 246ms	remaining: 2.49s
13:	learn: 1.7729572	total: 254ms	remaining: 2.37s
14:	learn: 1.7425787	total: 262ms	remaining: 2.27s
15:	learn: 1.7170531	total: 267ms	remaining: 2.15s
16:	learn: 1.6898885	total: 271ms	remaining: 2.04s
17:	learn: 1.6661080	total: 275ms	remaining: 1.94s
18:	learn: 1.6425784	total: 278ms	remaining: 1.85s
19:	learn: 1.6207731	total: 281ms	remaining: 1

<catboost.core.CatBoostRegressor at 0x2c0df9b7580>

In [97]:
print("r2 for train data: ",cat_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,cat_reg),'\n')

print("r2 for test data: ",cat_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,cat_reg))

r2 for train data:  0.7133008715467718
adjusted_r2 for train data:  0.7122879495560641 

r2 for test data:  0.7172647852164244
adjusted_r2 for test data:  0.7132257107195162


In [98]:
Table = insertData(Table,'Cat Boost Regressor',adj_R2(X_train,Y_train,cat_reg),adj_R2(X_test,Y_test,cat_reg),[{'learning_rate=0.05, iterations=145, depth=5'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.684181,0.696789,default
1,Lasso Regression,0.683903,0.696796,{alpha=0.002421773986147522}
2,Ridge Regression,0.684167,0.696734,{alpha = 1.0}
3,Decision Tree Regressor,0.725233,0.690461,"{max_depth=8, min_samples_leaf=9, min_samples_..."
4,Random Forest Regressor,0.724835,0.691105,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.737105,0.619672,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.68417,0.696787,default
7,Bag Decision Tree Regressor,0.95123,0.665012,default
8,Bag KNN Regressor,0.751586,0.635977,default
9,Gradient Boosting,0.719393,0.70822,default


## Save required models in pickle files

In [94]:
# Table.to_csv("original_data_set/Result.csv",index = False)

In [16]:
# import pickle
# file = open('models/catboost_model.pickle','wb')

# pickle.dump(cat_reg, file)
# file.close()

In [97]:
# import pickle
# file = open('models/Gradientboost_model.pickle','wb')

# pickle.dump(gradient_boost_reg, file)
# file.close()

In [11]:
# import pickle
# file = open('models/minmax_transformation.pickle','wb')

# pickle.dump(minmax, file)
# file.close()