In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
## To see all columns of the dataset

pd.set_option("display.max_columns", None)

In [3]:
Table = pd.DataFrame(columns=['Model_Name','Train_R2','Test_R2','Parameters'])
def insertData(Table, model_name, train_R2, test_R2, parameter='default'):
    temp = pd.DataFrame()
    temp['Model_Name'] = [model_name]
    temp['Train_R2'] = train_R2
    temp['Test_R2'] = test_R2
    temp['Parameters'] = parameter
    Table = pd.concat([temp,Table],ignore_index=True)
    
    return Table

In [4]:
train_df = pd.read_csv('Data_Set/preprocessed_data.csv')
train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Item_Outlet_Sales,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,9.3,249.8092,3735.138,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,5.92,48.2692,443.4228,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0
2,17.5,141.618,2097.27,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,19.2,182.095,732.38,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,8.93,53.8614,994.7052,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0


# Model Building

<ol>
    <li>Linear Regression</li>
    <li>Ridge Regression</li>
    <li>Lasso Regression</li>
    <li>Decision Tree Regressor</li>
    <li>Random Forest Regressor</li>
    <li>KNN Regressor</li>
    <li>Bagging</li>
    <li>Boosting</li>
    <li>Stacking</li>
</ol>

In [5]:
# Standarization only the numerical freature using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()

df_standard = minmax.fit_transform(train_df[['Item_Weight','Item_MRP']])

In [6]:
df_standard

array([[0.28252456, 0.92750715],
       [0.08127419, 0.0720684 ],
       [0.77076511, 0.46828841],
       ...,
       [0.35992855, 0.22849221],
       [0.15808276, 0.30493925],
       [0.61000298, 0.18750976]])

In [7]:
train_df['Item_Weight'] = df_standard[:,0]
train_df['Item_MRP'] = df_standard[:,1]

train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Item_Outlet_Sales,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
0,0.282525,0.927507,3735.138,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0.081274,0.072068,443.4228,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0
2,0.770765,0.468288,2097.27,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
3,0.871986,0.640093,732.38,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0.260494,0.095805,994.7052,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0


## X, y Split

In [8]:
X = train_df.drop(['Item_Outlet_Sales'],axis = 1)
Y = train_df['Item_Outlet_Sales']

## Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state = 42)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(6818, 23) (1705, 23) (6818,) (1705,)


In [10]:
X_train.head()

Unnamed: 0,Item_Weight,Item_MRP,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
549,0.294433,0.594464,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
7757,0.800536,0.591057,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0
764,0.776719,0.341387,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0
6867,0.224472,0.043819,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0
2716,0.493897,0.527478,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


## Linear Regression

In [11]:
# Let's create a function to create adjusted R-Squared
def adj_R2(x,y,regression):
    r2 = regression.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [12]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X_train,Y_train)

LinearRegression()

In [13]:
print("R2 for train data: ",linear_reg.score(X_train,Y_train))
print("Adjusted R2 for train data: ",adj_R2(X_train,Y_train,linear_reg),'\n')

print("R2 for test data: ",linear_reg.score(X_test,Y_test))
print("Adjusted R2 for test data: ",adj_R2(X_test,Y_test,linear_reg))

R2 for train data:  0.5583059210354264
Adjusted R2 for train data:  0.556810636399544 

R2 for test data:  0.5783703769266026
Adjusted R2 for test data:  0.572601500465753


In [14]:
#from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

model = LinearRegression()
scores_lr = cross_val_score(model, X=X_train , y=Y_train, cv=10)

print(scores_lr)
print(scores_lr.mean())

[0.53256451 0.5832018  0.56415725 0.55384907 0.57458206 0.53026454
 0.54011719 0.57763835 0.54099286 0.54368191]
0.5541049547592622


In [15]:
Table = insertData(Table, 'Linear Regression', adj_R2(X_train,Y_train,linear_reg), adj_R2(X_test,Y_test,linear_reg))
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.556811,0.572602,default


## Lasso Regression

In [16]:
# Lasso Regularization
from sklearn.linear_model import LassoCV

lasscv = LassoCV(alphas=None,cv=10, max_iter=100000)
lasscv.fit(X_train,Y_train)

LassoCV(cv=10, max_iter=100000)

In [17]:
alpha = lasscv.alpha_
print(alpha)

2.581021055193149


In [18]:
#now that we have best parameter, let's use Lasso regression and see how well our data has fitted before
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha)
lasso_reg.fit(X_train,Y_train)

Lasso(alpha=2.581021055193149)

In [19]:
print("r2 for train data: ",lasso_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lasso_reg),'\n')

print("r2 for test data: ",lasso_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lasso_reg))

r2 for train data:  0.5577766492833689
adjusted_r2 for train data:  0.5562795728826502 

r2 for test data:  0.5797519137237708
adjusted_r2 for test data:  0.5740019399079747


In [20]:
Table = insertData(Table, 'Lasso Regression', adj_R2(X_train,Y_train,lasso_reg), adj_R2(X_test,Y_test,lasso_reg), [{'alpha':'2.581021055193149'}])

## Ridge Regression

In [21]:
# Ridge Regularization
from sklearn.linear_model import RidgeCV

#alphas = [0.01,0.1,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]
alphas = [2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3, 3.1, 3.2]
ridgecv = RidgeCV(alphas, cv =10)
ridgecv.fit(X_train,Y_train)

RidgeCV(alphas=array([2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2]),
        cv=10)

In [22]:
alpha = ridgecv.alpha_
print(alpha)

2.1


In [23]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=alpha)
ridge_reg.fit(X_train,Y_train)

Ridge(alpha=2.1)

In [24]:
print("r2 for train data: ",ridge_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,ridge_reg),'\n')

print("r2 for test data: ",ridge_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,ridge_reg))

r2 for train data:  0.5582818284712899
adjusted_r2 for train data:  0.5567864622738863 

r2 for test data:  0.5784693180915954
adjusted_r2 for test data:  0.5727017953766083


In [25]:
Table = insertData(Table, 'Ridge Regression', adj_R2(X_train,Y_train,ridge_reg), adj_R2(X_test,Y_test,ridge_reg), [{'alpha = 2.1'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
1,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
2,Linear Regression,0.556811,0.572602,default


## Decision Tree Regressor

In [26]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10]}

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# decision_tree_reg = RandomizedSearchCV(DecisionTreeRegressor(), param_distributions = HyperParameters, cv = 5,
#                             n_iter = 100, n_jobs = -1, verbose = 2,random_state = 42)
decision_tree_reg = GridSearchCV(DecisionTreeRegressor(), param_grid = HyperParameters, cv = 5,
                                 n_jobs = -1, verbose = 3)

decision_tree_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 810 candidates, totalling 4050 fits


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [8, 10, 12, 14, 16, 18],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18,
                                               20, 22, 24, 26, 28, 30]},
             verbose=3)

In [28]:
print(decision_tree_reg.best_score_)
print(decision_tree_reg.best_params_)

0.5651761920615821
{'max_depth': 8, 'min_samples_leaf': 10, 'min_samples_split': 28}


In [29]:
dt_reg = DecisionTreeRegressor(criterion='mse', max_depth=8, min_samples_leaf=10, min_samples_split=28)
dt_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, min_samples_leaf=10, min_samples_split=28)

In [30]:
print("r2 for train data: ",dt_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_reg),'\n')

print("r2 for test data: ",dt_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_reg))

r2 for train data:  0.6400354157955457
adjusted_r2 for train data:  0.6388168132879357 

r2 for test data:  0.5727352942781683
adjusted_r2 for test data:  0.5668893167459838


In [33]:
## Cross Validation
scores_lm = cross_val_score(dt_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.5296681  0.58801506 0.58647877 0.55639644 0.5841285  0.51452217
 0.54434765 0.60679078 0.54645629 0.57704048]
0.5633844233284246


In [34]:
Table = insertData(Table, 'Decision Tree Regressor', adj_R2(X_train,Y_train,dt_reg), adj_R2(X_test,Y_test,dt_reg), [{'max_depth=8, min_samples_leaf=10, min_samples_split=28'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
1,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
2,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
3,Linear Regression,0.556811,0.572602,default


## Random Forest Regressor

In [31]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
                   'max_features':["auto", "sqrt", "log2"]}

In [32]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

random_forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

random_forest_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                             8, 9, 10],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12, 14, 16, 18,
                                                              20, 22, 24, 26,
                                                              28, 30]},
                   random_state=42, verbose=3)

In [35]:
print(random_forest_reg.best_score_)
print(random_forest_reg.best_params_)

0.5896722726077005
{'min_samples_split': 30, 'min_samples_leaf': 10, 'max_features': 'auto', 'max_depth': 8}


In [36]:
rf_reg = DecisionTreeRegressor(min_samples_split=30, min_samples_leaf=10, max_features='auto', max_depth=8)
rf_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, max_features='auto', min_samples_leaf=10,
                      min_samples_split=30)

In [37]:
print("r2 for train data: ",rf_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,rf_reg),'\n')

print("r2 for test data: ",rf_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,rf_reg))

r2 for train data:  0.6393394886745156
adjusted_r2 for train data:  0.6381185302169816 

r2 for test data:  0.5728058992117855
adjusted_r2 for test data:  0.5669608877197396


In [38]:
## Cross Validation
scores_lm = cross_val_score(rf_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.52836628 0.58657114 0.58864424 0.55687762 0.58415991 0.51278981
 0.54448525 0.60722798 0.54613248 0.57704048]
0.5632295178824395


In [39]:
Table = insertData(Table,'Random Forest Regressor',adj_R2(X_train,Y_train,rf_reg),adj_R2(X_test,Y_test,rf_reg),[{'min_samples_split=30, min_samples_leaf=10, max_features=auto, max_depth=8'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
1,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
2,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
3,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
4,Linear Regression,0.556811,0.572602,default


## KNN Regressor

In [40]:
HyperParameters = {'weights':['uniform','distance'],
                   'algorithm':['auto','ball_tree','kd_tree','brute']}

# HyperParameters = {'weights':['uniform','distance'],
#                    'algorithm':['ball_tree','kd_tree'],
#                    'leaf_size':[20,25,30,35,40,45,50]}

In [41]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

knn_reg = RandomizedSearchCV(KNeighborsRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

knn_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=100, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'weights': ['uniform', 'distance']},
                   random_state=42, verbose=3)

In [42]:
print(knn_reg.best_score_)
print(knn_reg.best_params_)

0.4711895166675132
{'weights': 'uniform', 'algorithm': 'auto'}


In [43]:
knn_reg = KNeighborsRegressor(weights='uniform', algorithm='auto')
knn_reg.fit(X_train,Y_train)

KNeighborsRegressor()

In [45]:
print("r2 for train data: ",knn_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_reg),'\n')

print("r2 for test data: ",knn_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_reg))

r2 for train data:  0.6532223654491762
adjusted_r2 for train data:  0.6520484052497844 

r2 for test data:  0.5060501296588713
adjusted_r2 for test data:  0.49929174356854056


In [46]:
Table = insertData(Table,'KNN Regressor',adj_R2(X_train,Y_train,knn_reg),adj_R2(X_test,Y_test,knn_reg),[{'weights=uniform, algorithm=auto'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,KNN Regressor,0.652048,0.499292,"{weights=uniform, algorithm=auto}"
1,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
2,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
3,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
4,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
5,Linear Regression,0.556811,0.572602,default


## Bagging Regressor

In [48]:
# Linear Regression
from sklearn.ensemble import BaggingRegressor

lr_bagging = BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
lr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    3.3s finished


BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, n_jobs=-1,
                 verbose=2)

In [49]:
print("r2 for train data: ",lr_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lr_bagging),'\n')

print("r2 for test data: ",lr_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lr_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


r2 for train data:  0.5582774046420279


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished


adjusted_r2 for train data:  0.5567820234684581 

r2 for test data:  0.5782219822879399
adjusted_r2 for test data:  0.5724510754423852


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


In [50]:
Table = insertData(Table,'Bag Linear Regression',adj_R2(X_train,Y_train,lr_bagging),adj_R2(X_test,Y_test,lr_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Bag Linear Regression,0.556782,0.572451,default
1,KNN Regressor,0.652048,0.499292,"{weights=uniform, algorithm=auto}"
2,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
3,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
5,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
6,Linear Regression,0.556811,0.572602,default


In [51]:
# Decision Tree Regressor
from sklearn.ensemble import BaggingRegressor

dt_bagging = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=100, bootstrap=True,verbose=2, n_jobs=-1)
dt_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.3s finished


BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=100,
                 n_jobs=-1, verbose=2)

In [52]:
print("r2 for train data: ",dt_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_bagging),'\n')

print("r2 for test data: ",dt_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.9328200482307318


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


adjusted_r2 for train data:  0.9325926212524137 



[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for test data:  0.5262666833270343
adjusted_r2 for test data:  0.5197849068347807


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


In [53]:
Table = insertData(Table,'Bag Decision Tree Regressor',adj_R2(X_train,Y_train,dt_bagging),adj_R2(X_test,Y_test,dt_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.3s finished


Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Bag Decision Tree Regressor,0.932593,0.519785,default
1,Bag Linear Regression,0.556782,0.572451,default
2,KNN Regressor,0.652048,0.499292,"{weights=uniform, algorithm=auto}"
3,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
4,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
5,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
6,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
7,Linear Regression,0.556811,0.572602,default


In [116]:
from sklearn.svm import SVR

In [117]:
# # Decision Tree Regressor
# from sklearn.ensemble import BaggingRegressor

# svr_bagging = BaggingRegressor(base_estimator=SVR(), n_estimators=100, bootstrap=True,verbose=2)
# svr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building estimator 1 of 100 for this parallel run (total 100)...
Building estimator 2 of 100 for this parallel run (total 100)...
Building estimator 3 of 100 for this parallel run (total 100)...
Building estimator 4 of 100 for this parallel run (total 100)...
Building estimator 5 of 100 for this parallel run (total 100)...
Building estimator 6 of 100 for this parallel run (total 100)...
Building estimator 7 of 100 for this parallel run (total 100)...
Building estimator 8 of 100 for this parallel run (total 100)...
Building estimator 9 of 100 for this parallel run (total 100)...
Building estimator 10 of 100 for this parallel run (total 100)...
Building estimator 11 of 100 for this parallel run (total 100)...
Building estimator 12 of 100 for this parallel run (total 100)...
Building estimator 13 of 100 for this parallel run (total 100)...
Building estimator 14 of 100 for this parallel run (total 100)...
Building estimator 15 of 100 for this parallel run (total 100)...
Building estimator 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min finished


BaggingRegressor(base_estimator=SVR(), n_estimators=100, verbose=2)

In [118]:
# print("r2 for train data: ",svr_bagging.score(X_train,Y_train))
# print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,svr_bagging),'\n')

# print("r2 for test data: ",svr_bagging.score(X_test,Y_test))
# print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,svr_bagging))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for train data:  -0.007774727114581603


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


adjusted_r2 for train data:  -0.01118638721520493 



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for test data:  0.01819860776469384
adjusted_r2 for test data:  0.0047652752118014785


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished


In [55]:
# KNN Regressor
from sklearn.ensemble import BaggingRegressor

knn_bagging = BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
knn_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.5s finished


BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100,
                 n_jobs=-1, verbose=2)

In [56]:
print("r2 for train data: ",knn_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_bagging),'\n')

print("r2 for test data: ",knn_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   45.8s finished


r2 for train data:  0.6726079730924011


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   46.4s finished


adjusted_r2 for train data:  0.6714996397661022 



[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.6s finished


r2 for test data:  0.5209609684790683


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   12.6s finished


adjusted_r2 for test data:  0.5144065974350579


In [57]:
Table = insertData(Table,'Bag KNN Regressor',adj_R2(X_train,Y_train,knn_bagging),adj_R2(X_test,Y_test,knn_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   45.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   11.9s finished


Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Bag KNN Regressor,0.6715,0.514407,default
1,Bag Decision Tree Regressor,0.932593,0.519785,default
2,Bag Linear Regression,0.556782,0.572451,default
3,KNN Regressor,0.652048,0.499292,"{weights=uniform, algorithm=auto}"
4,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
5,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
6,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
7,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
8,Linear Regression,0.556811,0.572602,default


## Gradient Boosting

In [60]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(random_state=42)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(random_state=42)

In [61]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.6259275568763688
adjusted_r2 for train data:  0.6246611944695624 

r2 for test data:  0.6023717761780724
adjusted_r2 for test data:  0.5969312948289323


In [None]:
Table = insertData(Table,'Gradient Boosting',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg))
Table

In [66]:
HyperParameters = {'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
                   'n_estimators':[50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
                   'max_leaf_nodes':[2, 5, 10, 20, 50, 100],
                   'max_depth':[8,10,12,14,16,18]
                  }

# HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
#                    'max_depth':[8,10,12,14,16,18],
#                    }

# param_distributions = {
#     "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
#     "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
#     "learning_rate": loguniform(0.01, 1),
# }

In [67]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

gradient_boost_reg = RandomizedSearchCV(GradientBoostingRegressor(random_state=0), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

gradient_boost_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.2, 0.3, 0.4,
                                                          0.5, 0.6, 0.7, 0.8,
                                                          0.9],
                                        'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_leaf_nodes': [2, 5, 10, 20, 50,
                                                           100],
                                        'n_estimators': [50, 100, 150, 200, 250,
                                                         300, 350, 400, 450,
                                                         500]},
                   random_state=42, verbose=3)

In [68]:
print(gradient_boost_reg.best_score_)
print(gradient_boost_reg.best_params_)

0.5916709989876346
{'n_estimators': 50, 'max_leaf_nodes': 5, 'max_depth': 16, 'learning_rate': 0.2}


In [69]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(n_estimators=50, max_leaf_nodes=5, max_depth=16, learning_rate=0.2)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(learning_rate=0.2, max_depth=16, max_leaf_nodes=5,
                          n_estimators=50)

In [70]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.6183314254896173
adjusted_r2 for train data:  0.6170393475953372 

r2 for test data:  0.6081124011376817
adjusted_r2 for test data:  0.6027504649248124


In [71]:
Table = insertData(Table,'Gradient Boosting',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg),[{'n_estimators=50, max_leaf_nodes=5, max_depth=16, learning_rate=0.2'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Gradient Boosting,0.617039,0.60275,"{n_estimators=50, max_leaf_nodes=5, max_depth=..."
1,Bag KNN Regressor,0.6715,0.514407,default
2,Bag Decision Tree Regressor,0.932593,0.519785,default
3,Bag Linear Regression,0.556782,0.572451,default
4,KNN Regressor,0.652048,0.499292,"{weights=uniform, algorithm=auto}"
5,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
6,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
7,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
8,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}
9,Linear Regression,0.556811,0.572602,default


## XGBRegressor

In [72]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 10, seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=10, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [73]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.652694689158984
adjusted_r2 for train data:  0.6515189425959367 

r2 for test data:  0.5962695047356253
adjusted_r2 for test data:  0.5907455300829896


In [74]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg))
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,XGBRegressor,0.651519,0.590746,default
1,Gradient Boosting,0.617039,0.60275,"{n_estimators=50, max_leaf_nodes=5, max_depth=..."
2,Bag KNN Regressor,0.6715,0.514407,default
3,Bag Decision Tree Regressor,0.932593,0.519785,default
4,Bag Linear Regression,0.556782,0.572451,default
5,KNN Regressor,0.652048,0.499292,"{weights=uniform, algorithm=auto}"
6,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
7,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
8,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
9,Lasso Regression,0.55628,0.574002,{'alpha': '2.581021055193149'}


In [75]:
HyperParameters = {'n_estimators': [15,17,18,19,20,22,25,26,27,28,29,30],
                   'booster': ['gbtree'],
                   'max_depth': [3,4,5,6],
                   'min_child_weight': [74,75,78,79,80,81]
                  }

In [76]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

XGB_reg = RandomizedSearchCV(XGBRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

XGB_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          predictor=None, random_state=None,
                                          reg_alpha=None, reg_lambda=None,
                                       

In [77]:
print(XGB_reg.best_score_)
print(XGB_reg.best_params_)

0.5924184532812771
{'n_estimators': 15, 'min_child_weight': 79, 'max_depth': 4, 'booster': 'gbtree'}


In [78]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 15, min_child_weight=79, booster='gbtree',seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=79, missing=nan,
             monotone_constraints='()', n_estimators=15, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [79]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.6272560938720815
adjusted_r2 for train data:  0.6259942290147158 

r2 for test data:  0.6123450772312957
adjusted_r2 for test data:  0.607041053897756


In [80]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg),[{'objective=reg:linear,n_estimators = 15, min_child_weight=79, booster=gbtree, seed = 42'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,XGBRegressor,0.625994,0.607041,"{objective=reg:linear,n_estimators = 15, min_c..."
1,XGBRegressor,0.651519,0.590746,default
2,Gradient Boosting,0.617039,0.60275,"{n_estimators=50, max_leaf_nodes=5, max_depth=..."
3,Bag KNN Regressor,0.6715,0.514407,default
4,Bag Decision Tree Regressor,0.932593,0.519785,default
5,Bag Linear Regression,0.556782,0.572451,default
6,KNN Regressor,0.652048,0.499292,"{weights=uniform, algorithm=auto}"
7,Random Forest Regressor,0.638119,0.566961,"{min_samples_split=30, min_samples_leaf=10, ma..."
8,Decision Tree Regressor,0.638817,0.566889,"{max_depth=8, min_samples_leaf=10, min_samples..."
9,Ridge Regression,0.556786,0.572702,{alpha = 2.1}
