In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
## To see all columns of the dataset

pd.set_option("display.max_columns", None)

In [3]:
Table = pd.DataFrame(columns=['Model_Name','Adjusted_R2_Train','Adjusted_R2_Test','Parameters'])
def insertData(Table, model_name, train_R2, test_R2, parameter='default'):
    temp = pd.DataFrame()
    temp['Model_Name'] = [model_name]
    temp['Adjusted_R2_Train'] = train_R2
    temp['Adjusted_R2_Test'] = test_R2
    temp['Parameters'] = parameter
    Table = pd.concat([Table,temp],ignore_index=True)
    
    return Table

In [4]:
train_df = pd.read_csv('Data_Set/preprocessed_data_v1_p5.csv')
train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Outlet_Age,Item_Fat_Content_regular,Item_Type_breads,Item_Type_breakfast,Item_Type_canned,Item_Type_dairy,Item_Type_frozen foods,Item_Type_fruits and vegetables,Item_Type_hard drinks,Item_Type_health and hygiene,Item_Type_household,Item_Type_meat,Item_Type_others,Item_Type_seafood,Item_Type_snack foods,Item_Type_soft drinks,Item_Type_starchy foods,Outlet_Type_supermarket type1,Outlet_Type_supermarket type2,Outlet_Type_supermarket type3
0,9.3,249.8092,1,2,3735.138,23,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,5.92,48.2692,1,0,443.4228,13,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,17.5,141.618,1,2,2097.27,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,19.2,182.095,0,0,732.38,24,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8.93,53.8614,2,0,994.7052,35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# Model Building

<ol>
    <li>Linear Regression</li>
    <li>Ridge Regression</li>
    <li>Lasso Regression</li>
    <li>Decision Tree Regressor</li>
    <li>Random Forest Regressor</li>
    <li>KNN Regressor</li>
    <li>Bagging</li>
    <li>Boosting</li>
    <li>Stacking</li>
</ol>

In [5]:
# Let's create a function to create adjusted R-Squared
def adj_R2(x,y,regression):
    r2 = regression.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [6]:
# Standarization only the numerical freature using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()

df_standard = minmax.fit_transform(train_df[['Item_Weight','Item_MRP','Outlet_Age']])

In [7]:
df_standard

array([[0.28252456, 0.92750715, 0.41666667],
       [0.08127419, 0.0720684 , 0.        ],
       [0.77076511, 0.46828841, 0.41666667],
       ...,
       [0.35992855, 0.22849221, 0.20833333],
       [0.15808276, 0.30493925, 0.        ],
       [0.61000298, 0.18750976, 0.5       ]])

In [8]:
train_df['Item_Weight'] = df_standard[:,0]
train_df['Item_MRP'] = df_standard[:,1]
train_df['Outlet_Age'] = df_standard[:,2]

train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Outlet_Size,Outlet_Location_Type,Item_Outlet_Sales,Outlet_Age,Item_Fat_Content_regular,Item_Type_breads,Item_Type_breakfast,Item_Type_canned,Item_Type_dairy,Item_Type_frozen foods,Item_Type_fruits and vegetables,Item_Type_hard drinks,Item_Type_health and hygiene,Item_Type_household,Item_Type_meat,Item_Type_others,Item_Type_seafood,Item_Type_snack foods,Item_Type_soft drinks,Item_Type_starchy foods,Outlet_Type_supermarket type1,Outlet_Type_supermarket type2,Outlet_Type_supermarket type3
0,0.282525,0.927507,1,2,3735.138,0.416667,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.081274,0.072068,1,0,443.4228,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.770765,0.468288,1,2,2097.27,0.416667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.871986,0.640093,0,0,732.38,0.458333,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.260494,0.095805,2,0,994.7052,0.916667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


## X, y Split

In [9]:
X = train_df.drop(['Item_Outlet_Sales'],axis = 1)
Y = train_df['Item_Outlet_Sales']

## Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state = 42)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(6818, 24) (1705, 24) (6818,) (1705,)


In [11]:
X_train.head()

Unnamed: 0,Item_Weight,Item_MRP,Outlet_Size,Outlet_Location_Type,Outlet_Age,Item_Fat_Content_regular,Item_Type_breads,Item_Type_breakfast,Item_Type_canned,Item_Type_dairy,Item_Type_frozen foods,Item_Type_fruits and vegetables,Item_Type_hard drinks,Item_Type_health and hygiene,Item_Type_household,Item_Type_meat,Item_Type_others,Item_Type_seafood,Item_Type_snack foods,Item_Type_soft drinks,Item_Type_starchy foods,Outlet_Type_supermarket type1,Outlet_Type_supermarket type2,Outlet_Type_supermarket type3
549,0.294433,0.594464,1,2,0.416667,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7757,0.800536,0.591057,0,1,0.291667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
764,0.776719,0.341387,0,2,0.5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6867,0.224472,0.043819,0,1,0.291667,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2716,0.493897,0.527478,0,2,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0


## Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X_train,Y_train)

LinearRegression()

In [13]:
print("R2 for train data: ",linear_reg.score(X_train,Y_train))
print("Adjusted R2 for train data: ",adj_R2(X_train,Y_train,linear_reg),'\n')

print("R2 for test data: ",linear_reg.score(X_test,Y_test))
print("Adjusted R2 for test data: ",adj_R2(X_test,Y_test,linear_reg))

R2 for train data:  0.5585930934721199
Adjusted R2 for train data:  0.5570335813630858 

R2 for test data:  0.5786372566489811
Adjusted R2 for test data:  0.5726177888868237


In [14]:
#from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

model = LinearRegression()
scores_lr = cross_val_score(model, X=X_train , y=Y_train, cv=10)

print(scores_lr)
print(scores_lr.mean())

[0.53201443 0.5836182  0.56473095 0.55294255 0.57324926 0.53112289
 0.54116949 0.57834649 0.541208   0.54365657]
0.5542058832955463


In [15]:
Table = insertData(Table, 'Linear Regression', adj_R2(X_train,Y_train,linear_reg), adj_R2(X_test,Y_test,linear_reg))
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default


## Lasso Regression

In [16]:
# Lasso Regularization
from sklearn.linear_model import LassoCV

lasscv = LassoCV(alphas=None,cv=10, max_iter=100000)
lasscv.fit(X_train,Y_train)

LassoCV(cv=10, max_iter=100000)

In [17]:
alpha = lasscv.alpha_
print(alpha)

2.4070688744491906


In [18]:
#now that we have best parameter, let's use Lasso regression and see how well our data has fitted before
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha)
lasso_reg.fit(X_train,Y_train)

Lasso(alpha=2.4070688744491906)

In [19]:
print("r2 for train data: ",lasso_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lasso_reg),'\n')

print("r2 for test data: ",lasso_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lasso_reg))

r2 for train data:  0.5580776570299448
adjusted_r2 for train data:  0.556516323858845 

r2 for test data:  0.5793967727383067
adjusted_r2 for test data:  0.5733881552059968


In [20]:
Table = insertData(Table, 'Lasso Regression', adj_R2(X_train,Y_train,lasso_reg), adj_R2(X_test,Y_test,lasso_reg), [{'alpha=2.4070688744491906'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}


## Ridge Regression

In [21]:
# Ridge Regularization
from sklearn.linear_model import RidgeCV

# alphas = [0.01,0.1,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]
alphas = [0.01, 0.1, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3, 3.1, 3.2]
ridgecv = RidgeCV(alphas, cv =10)
ridgecv.fit(X_train,Y_train)

RidgeCV(alphas=array([0.01, 0.1 , 1.  , 1.1 , 1.2 , 1.3 , 1.4 , 1.5 , 1.6 , 1.7 , 1.8 ,
       1.9 , 2.  , 2.1 , 2.2 , 2.3 , 2.4 , 2.5 , 2.6 , 2.7 , 2.8 , 2.9 ,
       3.  , 3.1 , 3.2 ]),
        cv=10)

In [22]:
alpha = ridgecv.alpha_
print(alpha)

1.9


In [23]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=alpha)
ridge_reg.fit(X_train,Y_train)

Ridge(alpha=1.9)

In [24]:
print("r2 for train data: ",ridge_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,ridge_reg),'\n')

print("r2 for test data: ",ridge_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,ridge_reg))

r2 for train data:  0.5585682798167964
adjusted_r2 for train data:  0.5570086800399088 

r2 for test data:  0.5787104221402233
adjusted_r2 for test data:  0.5726919995993693


In [25]:
Table = insertData(Table, 'Ridge Regression', adj_R2(X_train,Y_train,ridge_reg), adj_R2(X_test,Y_test,ridge_reg), [{'alpha = 1.9'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}


## Decision Tree Regressor

In [26]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10]}

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

decision_tree_reg = GridSearchCV(DecisionTreeRegressor(), param_grid = HyperParameters, cv = 5, n_jobs = -1, verbose = 3)

decision_tree_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 810 candidates, totalling 4050 fits


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [8, 10, 12, 14, 16, 18],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18,
                                               20, 22, 24, 26, 28, 30]},
             verbose=3)

In [28]:
print(decision_tree_reg.best_score_)
print(decision_tree_reg.best_params_)

0.5696265247168996
{'max_depth': 8, 'min_samples_leaf': 10, 'min_samples_split': 30}


In [29]:
dt_reg = DecisionTreeRegressor(criterion='mse', max_depth=8, min_samples_leaf=10, min_samples_split=30)
dt_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, min_samples_leaf=10, min_samples_split=30)

In [30]:
print("r2 for train data: ",dt_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_reg),'\n')

print("r2 for test data: ",dt_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_reg))

r2 for train data:  0.6387442593455788
adjusted_r2 for train data:  0.6374679252110718 

r2 for test data:  0.5750669371349899
adjusted_r2 for test data:  0.568996464808347


In [31]:
## Cross Validation
scores_lm = cross_val_score(dt_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.54650613 0.5903399  0.58928793 0.56327718 0.58367801 0.52331815
 0.5376002  0.61324752 0.55109135 0.58179448]
0.5680140844387973


In [32]:
Table = insertData(Table, 'Decision Tree Regressor', adj_R2(X_train,Y_train,dt_reg), adj_R2(X_test,Y_test,dt_reg), [{'max_depth=8, min_samples_leaf=10, min_samples_split=30'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."


## Random Forest Regressor

In [33]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
                   'max_features':["auto", "sqrt", "log2"]}

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

random_forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

random_forest_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                             8, 9, 10],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12, 14, 16, 18,
                                                              20, 22, 24, 26,
                                                              28, 30]},
                   random_state=42, verbose=3)

In [35]:
print(random_forest_reg.best_score_)
print(random_forest_reg.best_params_)

0.5903148710885169
{'min_samples_split': 30, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 8}


In [36]:
rf_reg = DecisionTreeRegressor(min_samples_split=30, min_samples_leaf=10, max_features='auto', max_depth=8)
rf_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, max_features='auto', min_samples_leaf=10,
                      min_samples_split=30)

In [37]:
print("r2 for train data: ",rf_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,rf_reg),'\n')

print("r2 for test data: ",rf_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,rf_reg))

r2 for train data:  0.6387442593455788
adjusted_r2 for train data:  0.6374679252110718 

r2 for test data:  0.5750669371349899
adjusted_r2 for test data:  0.568996464808347


In [38]:
## Cross Validation
scores_lm = cross_val_score(rf_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.54650613 0.5903399  0.58928793 0.56327718 0.58367801 0.52331815
 0.5376002  0.61324752 0.55109135 0.58179448]
0.5680140844387974


In [39]:
Table = insertData(Table,'Random Forest Regressor',adj_R2(X_train,Y_train,rf_reg),adj_R2(X_test,Y_test,rf_reg),[{'min_samples_split=30, min_samples_leaf=10, max_features=auto, max_depth=8'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."


## KNN Regressor

In [40]:
HyperParameters = {'weights':['uniform','distance'],
                   'algorithm':['auto','ball_tree','kd_tree','brute'],
                   'leaf_size':[20,25,30,35,40,45,50]}

# HyperParameters = {'weights':['uniform','distance'],
#                    'algorithm':['ball_tree','kd_tree'],
#                    'leaf_size':[20,25,30,35,40,45,50]}

In [41]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

knn_reg = RandomizedSearchCV(KNeighborsRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

knn_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=100, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'leaf_size': [20, 25, 30, 35, 40, 45,
                                                      50],
                                        'weights': ['uniform', 'distance']},
                   random_state=42, verbose=3)

In [42]:
print(knn_reg.best_score_)
print(knn_reg.best_params_)

0.47428811901578616
{'weights': 'uniform', 'leaf_size': 20, 'algorithm': 'auto'}


In [43]:
knn_reg = KNeighborsRegressor(weights='uniform', algorithm='auto',leaf_size=20)
knn_reg.fit(X_train,Y_train)

KNeighborsRegressor(leaf_size=20)

In [44]:
print("r2 for train data: ",knn_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_reg),'\n')

print("r2 for test data: ",knn_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_reg))

r2 for train data:  0.6572316918500145
adjusted_r2 for train data:  0.656020674715376 

r2 for test data:  0.505997504464887
adjusted_r2 for test data:  0.4989403259572426


In [45]:
Table = insertData(Table,'KNN Regressor',adj_R2(X_train,Y_train,knn_reg),adj_R2(X_test,Y_test,knn_reg),[{'weights=uniform, algorithm=auto, leaf_size=20'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"


## Bagging Regressor

In [46]:
# Linear Regression
from sklearn.ensemble import BaggingRegressor

lr_bagging = BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
lr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.5s finished


BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, n_jobs=-1,
                 verbose=2)

In [47]:
print("r2 for train data: ",lr_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lr_bagging),'\n')

print("r2 for test data: ",lr_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lr_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.5585721264892962


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


adjusted_r2 for train data:  0.55701254030289 

r2 for test data:  0.5789445260047792
adjusted_r2 for test data:  0.5729294478048474


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


In [48]:
Table = insertData(Table,'Bag Linear Regression',adj_R2(X_train,Y_train,lr_bagging),adj_R2(X_test,Y_test,lr_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default


In [51]:
# Decision Tree Regressor
from sklearn.ensemble import BaggingRegressor

dt_bagging = BaggingRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=8, min_samples_leaf=10, min_samples_split=30), 
                              n_estimators=100, bootstrap=True,verbose=2, n_jobs=-1)
dt_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.6s finished


BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=8,
                                                      min_samples_leaf=10,
                                                      min_samples_split=30),
                 n_estimators=100, n_jobs=-1, verbose=2)

In [52]:
print("r2 for train data: ",dt_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_bagging),'\n')

print("r2 for test data: ",dt_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


r2 for train data:  0.6416866483219377


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


adjusted_r2 for train data:  0.6404207097910568 

r2 for test data:  0.6063540055530663
adjusted_r2 for test data:  0.6007304913466815


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


In [53]:
Table = insertData(Table,'Bag Decision Tree Regressor',adj_R2(X_train,Y_train,dt_bagging),adj_R2(X_test,Y_test,dt_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default
7,Bag Decision Tree Regressor,0.640421,0.60073,default


In [116]:
from sklearn.svm import SVR

In [117]:
# # Decision Tree Regressor
# from sklearn.ensemble import BaggingRegressor

# svr_bagging = BaggingRegressor(base_estimator=SVR(), n_estimators=100, bootstrap=True,verbose=2)
# svr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building estimator 1 of 100 for this parallel run (total 100)...
Building estimator 2 of 100 for this parallel run (total 100)...
Building estimator 3 of 100 for this parallel run (total 100)...
Building estimator 4 of 100 for this parallel run (total 100)...
Building estimator 5 of 100 for this parallel run (total 100)...
Building estimator 6 of 100 for this parallel run (total 100)...
Building estimator 7 of 100 for this parallel run (total 100)...
Building estimator 8 of 100 for this parallel run (total 100)...
Building estimator 9 of 100 for this parallel run (total 100)...
Building estimator 10 of 100 for this parallel run (total 100)...
Building estimator 11 of 100 for this parallel run (total 100)...
Building estimator 12 of 100 for this parallel run (total 100)...
Building estimator 13 of 100 for this parallel run (total 100)...
Building estimator 14 of 100 for this parallel run (total 100)...
Building estimator 15 of 100 for this parallel run (total 100)...
Building estimator 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min finished


BaggingRegressor(base_estimator=SVR(), n_estimators=100, verbose=2)

In [118]:
# print("r2 for train data: ",svr_bagging.score(X_train,Y_train))
# print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,svr_bagging),'\n')

# print("r2 for test data: ",svr_bagging.score(X_test,Y_test))
# print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,svr_bagging))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for train data:  -0.007774727114581603


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


adjusted_r2 for train data:  -0.01118638721520493 



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for test data:  0.01819860776469384
adjusted_r2 for test data:  0.0047652752118014785


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished


In [54]:
# KNN Regressor
from sklearn.ensemble import BaggingRegressor

knn_bagging = BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
knn_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.7s finished


BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100,
                 n_jobs=-1, verbose=2)

In [55]:
print("r2 for train data: ",knn_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_bagging),'\n')

print("r2 for test data: ",knn_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   53.8s finished


r2 for train data:  0.6767020353930275


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   46.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


adjusted_r2 for train data:  0.6755598079308507 



[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for test data:  0.524096644518812
adjusted_r2 for test data:  0.517298025154795


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.8s finished


In [56]:
Table = insertData(Table,'Bag KNN Regressor',adj_R2(X_train,Y_train,knn_bagging),adj_R2(X_test,Y_test,knn_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   44.8s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.5s finished


Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default
7,Bag Decision Tree Regressor,0.640421,0.60073,default
8,Bag KNN Regressor,0.67556,0.517298,default


## Gradient Boosting

In [57]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(random_state=42)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(random_state=42)

In [58]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.6298570616637339
adjusted_r2 for train data:  0.6285493286267738 

r2 for test data:  0.6001202228785218
adjusted_r2 for test data:  0.5944076546339294


In [59]:
Table = insertData(Table,'Gradient Boosting',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg))
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default
7,Bag Decision Tree Regressor,0.640421,0.60073,default
8,Bag KNN Regressor,0.67556,0.517298,default
9,Gradient Boosting,0.628549,0.594408,default


In [60]:
HyperParameters = {'learning_rate':[0.01,0.02,0.03,0.4,0.05,0.06,0.1],
                   'n_estimators':[50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
                   'max_leaf_nodes':[2, 5, 10, 20, 50, 100],
                   'max_depth':[8,10,12,14,16,18]
                  }

# HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
#                    'max_depth':[8,10,12,14,16,18],
#                    }

# param_distributions = {
#     "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
#     "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
#     "learning_rate": loguniform(0.01, 1),
# }

In [61]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

gradient_boost_reg = RandomizedSearchCV(GradientBoostingRegressor(random_state=0), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

gradient_boost_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'learning_rate': [0.01, 0.02, 0.03, 0.4,
                                                          0.05, 0.06, 0.1],
                                        'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_leaf_nodes': [2, 5, 10, 20, 50,
                                                           100],
                                        'n_estimators': [50, 100, 150, 200, 250,
                                                         300, 350, 400, 450,
                                                         500]},
                   random_state=42, verbose=3)

In [62]:
print(gradient_boost_reg.best_score_)
print(gradient_boost_reg.best_params_)

0.5951882007741295
{'n_estimators': 200, 'max_leaf_nodes': 5, 'max_depth': 16, 'learning_rate': 0.03}


In [63]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(n_estimators=200, max_leaf_nodes=5, max_depth=16, learning_rate=0.03)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(learning_rate=0.03, max_depth=16, max_leaf_nodes=5,
                          n_estimators=200)

In [64]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.6100408935903007
adjusted_r2 for train data:  0.608663149065962 

r2 for test data:  0.6100232359610533
adjusted_r2 for test data:  0.6044521393319255


In [65]:
Table = insertData(Table,'Gradient Boosting_HyperParameters',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg),[{'n_estimators=200, max_leaf_nodes=5, max_depth=16, learning_rate=0.03'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default
7,Bag Decision Tree Regressor,0.640421,0.60073,default
8,Bag KNN Regressor,0.67556,0.517298,default
9,Gradient Boosting,0.628549,0.594408,default


## XGBRegressor

In [66]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 10, seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=10, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [67]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.6516452152134038
adjusted_r2 for train data:  0.650414460784598 

r2 for test data:  0.597499855132712
adjusted_r2 for test data:  0.5917498530631793


In [68]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg))
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default
7,Bag Decision Tree Regressor,0.640421,0.60073,default
8,Bag KNN Regressor,0.67556,0.517298,default
9,Gradient Boosting,0.628549,0.594408,default


In [69]:
HyperParameters = {'n_estimators': [15,17,18,19,20,22,25,26,27,28,29,30],
                   'booster': ['gbtree'],
                   'max_depth': [3,4,5,6],
                   'min_child_weight': [74,75,78,79,80,81]
                  }

In [70]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

XGB_reg = RandomizedSearchCV(XGBRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

XGB_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          predictor=None, random_state=None,
                                          reg_alpha=None, reg_lambda=None,
                                       

In [71]:
print(XGB_reg.best_score_)
print(XGB_reg.best_params_)

0.5936731725813602
{'n_estimators': 15, 'min_child_weight': 79, 'max_depth': 4, 'booster': 'gbtree'}


In [72]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 15, min_child_weight=79, max_depth=4, booster='gbtree',seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=4, min_child_weight=79, missing=nan,
             monotone_constraints='()', n_estimators=15, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [73]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.6152396118602799
adjusted_r2 for train data:  0.613880234660905 

r2 for test data:  0.6129509304515728
adjusted_r2 for test data:  0.6074216580294525


In [74]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg),[{'objective=reg:linear,n_estimators = 15, min_child_weight=79, max_depth=4, booster=gbtree, seed = 42'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default
7,Bag Decision Tree Regressor,0.640421,0.60073,default
8,Bag KNN Regressor,0.67556,0.517298,default
9,Gradient Boosting,0.628549,0.594408,default


## Cat Boosting

In [75]:
from catboost import CatBoostRegressor

cat_reg = CatBoostRegressor()
cat_reg.fit(X_train,Y_train)

Learning rate set to 0.055449
0:	learn: 1666.8787623	total: 214ms	remaining: 3m 34s
1:	learn: 1620.5981381	total: 217ms	remaining: 1m 48s
2:	learn: 1575.8710870	total: 219ms	remaining: 1m 12s
3:	learn: 1533.0061194	total: 222ms	remaining: 55.2s
4:	learn: 1495.7482044	total: 224ms	remaining: 44.5s
5:	learn: 1461.8143376	total: 226ms	remaining: 37.4s
6:	learn: 1429.6589111	total: 229ms	remaining: 32.4s
7:	learn: 1400.3174239	total: 231ms	remaining: 28.6s
8:	learn: 1373.8713258	total: 233ms	remaining: 25.7s
9:	learn: 1348.4856340	total: 236ms	remaining: 23.3s
10:	learn: 1324.1824925	total: 238ms	remaining: 21.4s
11:	learn: 1302.0139053	total: 240ms	remaining: 19.8s
12:	learn: 1283.9568576	total: 242ms	remaining: 18.4s
13:	learn: 1266.2803661	total: 245ms	remaining: 17.3s
14:	learn: 1250.0638437	total: 248ms	remaining: 16.3s
15:	learn: 1237.3436617	total: 250ms	remaining: 15.4s
16:	learn: 1224.1773284	total: 252ms	remaining: 14.6s
17:	learn: 1210.6989756	total: 255ms	remaining: 13.9s
18:	l

154:	learn: 1050.0369588	total: 580ms	remaining: 3.16s
155:	learn: 1049.7854003	total: 584ms	remaining: 3.16s
156:	learn: 1049.4025210	total: 587ms	remaining: 3.15s
157:	learn: 1049.2236737	total: 590ms	remaining: 3.14s
158:	learn: 1048.7925109	total: 592ms	remaining: 3.13s
159:	learn: 1048.5461102	total: 595ms	remaining: 3.12s
160:	learn: 1048.2462343	total: 598ms	remaining: 3.12s
161:	learn: 1047.9600906	total: 601ms	remaining: 3.11s
162:	learn: 1047.7149763	total: 603ms	remaining: 3.1s
163:	learn: 1047.4501564	total: 606ms	remaining: 3.09s
164:	learn: 1047.2412369	total: 608ms	remaining: 3.08s
165:	learn: 1046.9392177	total: 611ms	remaining: 3.07s
166:	learn: 1046.5898788	total: 613ms	remaining: 3.06s
167:	learn: 1046.5796108	total: 615ms	remaining: 3.05s
168:	learn: 1046.2801035	total: 618ms	remaining: 3.04s
169:	learn: 1046.0966850	total: 620ms	remaining: 3.03s
170:	learn: 1045.8137719	total: 622ms	remaining: 3.02s
171:	learn: 1045.5854267	total: 624ms	remaining: 3s
172:	learn: 10

373:	learn: 994.8515861	total: 1.1s	remaining: 1.85s
374:	learn: 994.6795532	total: 1.11s	remaining: 1.85s
375:	learn: 994.5001301	total: 1.11s	remaining: 1.84s
376:	learn: 994.1565568	total: 1.11s	remaining: 1.84s
377:	learn: 993.8283760	total: 1.12s	remaining: 1.84s
378:	learn: 993.6347822	total: 1.12s	remaining: 1.83s
379:	learn: 993.5466592	total: 1.12s	remaining: 1.83s
380:	learn: 993.4148447	total: 1.12s	remaining: 1.82s
381:	learn: 993.2362783	total: 1.13s	remaining: 1.82s
382:	learn: 993.1519006	total: 1.13s	remaining: 1.82s
383:	learn: 992.9266164	total: 1.13s	remaining: 1.82s
384:	learn: 992.5481379	total: 1.14s	remaining: 1.81s
385:	learn: 992.1923391	total: 1.14s	remaining: 1.81s
386:	learn: 991.8311959	total: 1.14s	remaining: 1.81s
387:	learn: 991.7477524	total: 1.14s	remaining: 1.8s
388:	learn: 991.5478895	total: 1.15s	remaining: 1.8s
389:	learn: 991.3911090	total: 1.15s	remaining: 1.79s
390:	learn: 991.2377169	total: 1.15s	remaining: 1.79s
391:	learn: 991.0823113	total: 

584:	learn: 953.0333572	total: 1.63s	remaining: 1.16s
585:	learn: 952.9397770	total: 1.63s	remaining: 1.15s
586:	learn: 952.7018581	total: 1.64s	remaining: 1.15s
587:	learn: 952.5168872	total: 1.64s	remaining: 1.15s
588:	learn: 952.2092537	total: 1.64s	remaining: 1.15s
589:	learn: 952.1221478	total: 1.64s	remaining: 1.14s
590:	learn: 951.9410733	total: 1.65s	remaining: 1.14s
591:	learn: 951.8494196	total: 1.65s	remaining: 1.14s
592:	learn: 951.7278523	total: 1.65s	remaining: 1.13s
593:	learn: 951.5197113	total: 1.66s	remaining: 1.13s
594:	learn: 951.3722884	total: 1.66s	remaining: 1.13s
595:	learn: 951.0968448	total: 1.66s	remaining: 1.13s
596:	learn: 950.8601853	total: 1.66s	remaining: 1.12s
597:	learn: 950.6235128	total: 1.67s	remaining: 1.12s
598:	learn: 950.2136397	total: 1.67s	remaining: 1.12s
599:	learn: 949.9229812	total: 1.67s	remaining: 1.11s
600:	learn: 949.8202993	total: 1.67s	remaining: 1.11s
601:	learn: 949.6598338	total: 1.68s	remaining: 1.11s
602:	learn: 949.5823909	tota

772:	learn: 922.6699630	total: 2.17s	remaining: 636ms
773:	learn: 922.5543978	total: 2.17s	remaining: 633ms
774:	learn: 922.3633667	total: 2.17s	remaining: 630ms
775:	learn: 922.2371760	total: 2.17s	remaining: 627ms
776:	learn: 922.1193970	total: 2.17s	remaining: 624ms
777:	learn: 922.0049738	total: 2.18s	remaining: 622ms
778:	learn: 921.7896954	total: 2.18s	remaining: 619ms
779:	learn: 921.4795266	total: 2.18s	remaining: 616ms
780:	learn: 921.0860322	total: 2.19s	remaining: 613ms
781:	learn: 920.8330195	total: 2.19s	remaining: 610ms
782:	learn: 920.6834869	total: 2.19s	remaining: 607ms
783:	learn: 920.4342601	total: 2.19s	remaining: 604ms
784:	learn: 920.3494232	total: 2.2s	remaining: 602ms
785:	learn: 920.2112188	total: 2.2s	remaining: 599ms
786:	learn: 920.1597008	total: 2.2s	remaining: 596ms
787:	learn: 920.0313195	total: 2.2s	remaining: 593ms
788:	learn: 919.8576245	total: 2.21s	remaining: 590ms
789:	learn: 919.5817322	total: 2.21s	remaining: 587ms
790:	learn: 919.3699451	total: 2

980:	learn: 893.2688510	total: 2.68s	remaining: 52ms
981:	learn: 893.1176647	total: 2.69s	remaining: 49.3ms
982:	learn: 892.9826027	total: 2.69s	remaining: 46.5ms
983:	learn: 892.7657790	total: 2.69s	remaining: 43.8ms
984:	learn: 892.6250358	total: 2.69s	remaining: 41ms
985:	learn: 892.4997678	total: 2.7s	remaining: 38.3ms
986:	learn: 892.3284432	total: 2.7s	remaining: 35.6ms
987:	learn: 892.1873565	total: 2.7s	remaining: 32.8ms
988:	learn: 892.1094365	total: 2.71s	remaining: 30.1ms
989:	learn: 891.9701707	total: 2.71s	remaining: 27.4ms
990:	learn: 891.8360545	total: 2.71s	remaining: 24.6ms
991:	learn: 891.7488540	total: 2.71s	remaining: 21.9ms
992:	learn: 891.5515895	total: 2.71s	remaining: 19.1ms
993:	learn: 891.3688012	total: 2.72s	remaining: 16.4ms
994:	learn: 891.2522944	total: 2.72s	remaining: 13.7ms
995:	learn: 891.1926835	total: 2.72s	remaining: 10.9ms
996:	learn: 891.0611475	total: 2.73s	remaining: 8.2ms
997:	learn: 890.8996232	total: 2.73s	remaining: 5.46ms
998:	learn: 890.76

<catboost.core.CatBoostRegressor at 0x17492e493a0>

In [76]:
print("r2 for train data: ",cat_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,cat_reg),'\n')

print("r2 for test data: ",cat_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,cat_reg))

r2 for train data:  0.7318385598976221
adjusted_r2 for train data:  0.7308911324631371 

r2 for test data:  0.5754376925963387
adjusted_r2 for test data:  0.5693725167762864


In [77]:
HyperParameters = {'depth':[2,3,4,5,6,7],
                   'iterations':[145,150,155,156,157],
                   'learning_rate':[0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6]}

In [78]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

cat_reg = RandomizedSearchCV(CatBoostRegressor(), param_distributions=HyperParameters, n_iter=100, cv=5, n_jobs=-1, verbose=2, 
                            return_train_score=False)

cat_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
0:	learn: 1677.8023676	total: 6.76ms	remaining: 1.04s
1:	learn: 1639.0911030	total: 13.9ms	remaining: 1.06s
2:	learn: 1602.4521507	total: 20.7ms	remaining: 1.05s
3:	learn: 1570.7511978	total: 27.3ms	remaining: 1.03s
4:	learn: 1538.4105504	total: 33.7ms	remaining: 1.01s
5:	learn: 1508.9288886	total: 39.9ms	remaining: 991ms
6:	learn: 1480.7881192	total: 45.1ms	remaining: 953ms
7:	learn: 1454.5359239	total: 49.2ms	remaining: 904ms
8:	learn: 1430.6323314	total: 52.2ms	remaining: 846ms
9:	learn: 1408.3932106	total: 55.1ms	remaining: 800ms
10:	learn: 1387.6161084	total: 57.6ms	remaining: 754ms
11:	learn: 1367.3419743	total: 59.9ms	remaining: 714ms
12:	learn: 1348.1309959	total: 62.3ms	remaining: 681ms
13:	learn: 1331.3689433	total: 64.3ms	remaining: 648ms
14:	learn: 1315.0022069	total: 66.3ms	remaining: 619ms
15:	learn: 1299.2765403	total: 68.4ms	remaining: 594ms
16:	learn: 1285.0475534	total: 70.1ms	remaining: 569ms
17:	learn: 1

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostRegressor object at 0x000001749481B4C0>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'depth': [2, 3, 4, 5, 6, 7],
                                        'iterations': [145, 150, 155, 156, 157],
                                        'learning_rate': [0.01, 0.05, 0.1, 0.2,
                                                          0.3, 0.4, 0.5, 0.6]},
                   verbose=2)

In [79]:
print(cat_reg.best_score_)
print(cat_reg.best_params_)

0.5970678342850675
{'learning_rate': 0.05, 'iterations': 155, 'depth': 4}


In [80]:
from catboost import CatBoostRegressor

cat_reg = CatBoostRegressor(learning_rate=0.05, iterations=145, depth=5)
cat_reg.fit(X_train,Y_train)

0:	learn: 1673.6673078	total: 9.56ms	remaining: 1.38s
1:	learn: 1630.9402181	total: 18.3ms	remaining: 1.31s
2:	learn: 1591.3271667	total: 24.1ms	remaining: 1.14s
3:	learn: 1557.3525297	total: 27.9ms	remaining: 982ms
4:	learn: 1521.7237968	total: 31.1ms	remaining: 871ms
5:	learn: 1488.4189799	total: 34ms	remaining: 788ms
6:	learn: 1458.1445832	total: 36.8ms	remaining: 726ms
7:	learn: 1431.8753369	total: 39.3ms	remaining: 672ms
8:	learn: 1407.2673301	total: 41.4ms	remaining: 626ms
9:	learn: 1385.8630318	total: 43.5ms	remaining: 587ms
10:	learn: 1362.5996688	total: 45.5ms	remaining: 554ms
11:	learn: 1340.3480683	total: 47.3ms	remaining: 524ms
12:	learn: 1321.1301491	total: 49.1ms	remaining: 499ms
13:	learn: 1302.8729696	total: 50.9ms	remaining: 477ms
14:	learn: 1285.3787733	total: 52.7ms	remaining: 457ms
15:	learn: 1272.4974919	total: 54.5ms	remaining: 440ms
16:	learn: 1257.6113730	total: 56.4ms	remaining: 425ms
17:	learn: 1244.3843477	total: 58.2ms	remaining: 411ms
18:	learn: 1234.239898

<catboost.core.CatBoostRegressor at 0x17492a4db20>

In [81]:
print("r2 for train data: ",cat_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,cat_reg),'\n')

print("r2 for test data: ",cat_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,cat_reg))

r2 for train data:  0.6162619468375072
adjusted_r2 for train data:  0.6149061815974219 

r2 for test data:  0.617954560975548
adjusted_r2 for test data:  0.6124967689894845


In [82]:
Table = insertData(Table,'Cat Boost Regressor',adj_R2(X_train,Y_train,cat_reg),adj_R2(X_test,Y_test,cat_reg),[{'learning_rate=0.05, iterations=145, depth=5'}])
Table

Unnamed: 0,Model_Name,Adjusted_R2_Train,Adjusted_R2_Test,Parameters
0,Linear Regression,0.557034,0.572618,default
1,Lasso Regression,0.556516,0.573388,{alpha=2.4070688744491906}
2,Ridge Regression,0.557009,0.572692,{alpha = 1.9}
3,Decision Tree Regressor,0.637468,0.568996,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.637468,0.568996,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,KNN Regressor,0.656021,0.49894,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.557013,0.572929,default
7,Bag Decision Tree Regressor,0.640421,0.60073,default
8,Bag KNN Regressor,0.67556,0.517298,default
9,Gradient Boosting,0.628549,0.594408,default


In [88]:
# Table.to_csv("Data_Set/Result2.csv",index = False)

In [90]:
# import pickle
# file = open('models/cat_boost_reg.pickle','wb')

# pickle.dump(cat_reg, file)
# file.close()

In [97]:
# import pickle
# file = open('Data_Set/Gradientboost_model.pickle','wb')

# pickle.dump(gradient_boost_reg, file)
# file.close()

In [11]:
# import pickle
# file = open('models/minmax_transformation.pickle','wb')

# pickle.dump(minmax, file)
# file.close()