In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
## To see all columns of the dataset

pd.set_option("display.max_columns", None)

In [3]:
Table = pd.DataFrame(columns=['Model_Name','Adjusted_R2_Train','Adjusted_R2_Test','Parameters'])
def insertData(Table, model_name, train_R2, test_R2, parameter='default'):
    temp = pd.DataFrame()
    temp['Model_Name'] = [model_name]
    temp['Train_R2'] = train_R2
    temp['Test_R2'] = test_R2
    temp['Parameters'] = parameter
    Table = pd.concat([Table,temp],ignore_index=True)
    
    return Table

In [4]:
train_df = pd.read_csv('Data_Set/preprocessed_data.csv')
train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Item_Outlet_Sales,Outlet_Age,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,9.3,249.8092,11.794838,23,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,5.92,48.2692,6.223654,13,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0
2,17.5,141.618,9.919649,23,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
3,19.2,182.095,7.234707,24,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,8.93,53.8614,7.930641,35,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0


# Model Building

<ol>
    <li>Linear Regression</li>
    <li>Ridge Regression</li>
    <li>Lasso Regression</li>
    <li>Decision Tree Regressor</li>
    <li>Random Forest Regressor</li>
    <li>KNN Regressor</li>
    <li>Bagging</li>
    <li>Boosting</li>
    <li>Stacking</li>
</ol>

In [5]:
# Let's create a function to create adjusted R-Squared
def adj_R2(x,y,regression):
    r2 = regression.score(x,y)
    n = x.shape[0]
    p = x.shape[1]
    adjusted_r2 = 1-(1-r2)*(n-1)/(n-p-1)
    return adjusted_r2

In [6]:
# Standarization only the numerical freature using MinMaxScaler

from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()

df_standard = minmax.fit_transform(train_df[['Item_Weight','Item_MRP','Outlet_Age']])

In [7]:
df_standard

array([[0.28252456, 0.92750715, 0.41666667],
       [0.08127419, 0.0720684 , 0.        ],
       [0.77076511, 0.46828841, 0.41666667],
       ...,
       [0.35992855, 0.22849221, 0.20833333],
       [0.15808276, 0.30493925, 0.        ],
       [0.61000298, 0.18750976, 0.5       ]])

In [8]:
train_df['Item_Weight'] = df_standard[:,0]
train_df['Item_MRP'] = df_standard[:,1]
train_df['Outlet_Age'] = df_standard[:,2]

train_df.head()

Unnamed: 0,Item_Weight,Item_MRP,Item_Outlet_Sales,Outlet_Age,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
0,0.282525,0.927507,11.794838,0.416667,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
1,0.081274,0.072068,6.223654,0.0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0
2,0.770765,0.468288,9.919649,0.416667,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
3,0.871986,0.640093,7.234707,0.458333,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,0.260494,0.095805,7.930641,0.916667,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0


## X, y Split

In [9]:
X = train_df.drop(['Item_Outlet_Sales'],axis = 1)
Y = train_df['Item_Outlet_Sales']

## Train Test Split

In [10]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state = 42)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(6818, 35) (1705, 35) (6818,) (1705,)


In [11]:
X_train.head()

Unnamed: 0,Item_Weight,Item_MRP,Outlet_Age,Item_Fat_Content_Regular,Item_Type_Breads,Item_Type_Breakfast,Item_Type_Canned,Item_Type_Dairy,Item_Type_Frozen Foods,Item_Type_Fruits and Vegetables,Item_Type_Hard Drinks,Item_Type_Health and Hygiene,Item_Type_Household,Item_Type_Meat,Item_Type_Others,Item_Type_Seafood,Item_Type_Snack Foods,Item_Type_Soft Drinks,Item_Type_Starchy Foods,Outlet_Size_Medium,Outlet_Size_Small,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,Outlet_Identifier_OUT018,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049
549,0.294433,0.594464,0.416667,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
7757,0.800536,0.591057,0.291667,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
764,0.776719,0.341387,0.5,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
6867,0.224472,0.043819,0.291667,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
2716,0.493897,0.527478,0.5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0


## Linear Regression

In [12]:
from sklearn.linear_model import LinearRegression

linear_reg = LinearRegression()
linear_reg.fit(X_train,Y_train)

LinearRegression()

In [13]:
print("R2 for train data: ",linear_reg.score(X_train,Y_train))
print("Adjusted R2 for train data: ",adj_R2(X_train,Y_train,linear_reg),'\n')

print("R2 for test data: ",linear_reg.score(X_test,Y_test))
print("Adjusted R2 for test data: ",adj_R2(X_test,Y_test,linear_reg))

R2 for train data:  0.685830236672488
Adjusted R2 for train data:  0.6842088946323136 

R2 for test data:  0.7020902766079125
Adjusted R2 for test data:  0.6958429187177249


In [14]:
#from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

model = LinearRegression()
scores_lr = cross_val_score(model, X=X_train , y=Y_train, cv=10)

print(scores_lr)
print(scores_lr.mean())

[0.67364688 0.70576877 0.68620165 0.66246603 0.6923625  0.66155226
 0.66821501 0.71358177 0.67483221 0.68092602]
0.6819553099988251


In [15]:
Table = insertData(Table, 'Linear Regression', adj_R2(X_train,Y_train,linear_reg), adj_R2(X_test,Y_test,linear_reg))
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default


## Lasso Regression

In [16]:
# Lasso Regularization
from sklearn.linear_model import LassoCV

lasscv = LassoCV(alphas=None,cv=10, max_iter=100000)
lasscv.fit(X_train,Y_train)

LassoCV(cv=10, max_iter=100000)

In [17]:
alpha = lasscv.alpha_
print(alpha)

0.0022585545248758956


In [18]:
#now that we have best parameter, let's use Lasso regression and see how well our data has fitted before
from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha)
lasso_reg.fit(X_train,Y_train)

Lasso(alpha=0.0022585545248758956)

In [19]:
print("r2 for train data: ",lasso_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lasso_reg),'\n')

print("r2 for test data: ",lasso_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lasso_reg))

r2 for train data:  0.6855485147771947
adjusted_r2 for train data:  0.6839257188493271 

r2 for test data:  0.7019855736747191
adjusted_r2 for test data:  0.6957360200968972


In [20]:
Table = insertData(Table, 'Lasso Regression', adj_R2(X_train,Y_train,lasso_reg), adj_R2(X_test,Y_test,lasso_reg), [{'alpha=0.0022585545248758956'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}


## Ridge Regression

In [21]:
# Ridge Regularization
from sklearn.linear_model import RidgeCV

#alphas = [0.01,0.1,1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]
alphas = [2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3, 3.1, 3.2]
ridgecv = RidgeCV(alphas, cv =10)
ridgecv.fit(X_train,Y_train)

RidgeCV(alphas=array([2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3. , 3.1, 3.2]),
        cv=10)

In [22]:
alpha = ridgecv.alpha_
print(alpha)

3.2


In [23]:
from sklearn.linear_model import Ridge

ridge_reg = Ridge(alpha=alpha)
ridge_reg.fit(X_train,Y_train)

Ridge(alpha=3.2)

In [24]:
print("r2 for train data: ",ridge_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,ridge_reg),'\n')

print("r2 for test data: ",ridge_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,ridge_reg))

r2 for train data:  0.6858078372098483
adjusted_r2 for train data:  0.6841863795723291 

r2 for test data:  0.7020617227721262
adjusted_r2 for test data:  0.6958137660896963


In [25]:
Table = insertData(Table, 'Ridge Regression', adj_R2(X_train,Y_train,ridge_reg), adj_R2(X_test,Y_test,ridge_reg), [{'alpha = 3.2'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}


## Decision Tree Regressor

In [26]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10]}

In [27]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# decision_tree_reg = RandomizedSearchCV(DecisionTreeRegressor(), param_distributions = HyperParameters, cv = 5,
#                             n_iter = 100, n_jobs = -1, verbose = 2,random_state = 42)
decision_tree_reg = GridSearchCV(DecisionTreeRegressor(), param_grid = HyperParameters, cv = 5,
                                 n_jobs = -1, verbose = 3)

decision_tree_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 810 candidates, totalling 4050 fits


GridSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [8, 10, 12, 14, 16, 18],
                         'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'min_samples_split': [2, 4, 6, 8, 10, 12, 14, 16, 18,
                                               20, 22, 24, 26, 28, 30]},
             verbose=3)

In [28]:
print(decision_tree_reg.best_score_)
print(decision_tree_reg.best_params_)

0.6830342494184605
{'max_depth': 8, 'min_samples_leaf': 10, 'min_samples_split': 30}


In [29]:
dt_reg = DecisionTreeRegressor(criterion='mse', max_depth=8, min_samples_leaf=10, min_samples_split=30)
dt_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, min_samples_leaf=10, min_samples_split=30)

In [30]:
print("r2 for train data: ",dt_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_reg),'\n')

print("r2 for test data: ",dt_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_reg))

r2 for train data:  0.7210839123321244
adjusted_r2 for train data:  0.7196445046251979 

r2 for test data:  0.7010742521825799
adjusted_r2 for test data:  0.6948055876088174


In [31]:
## Cross Validation
scores_lm = cross_val_score(dt_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.66996371 0.70773565 0.68739917 0.66477235 0.69172337 0.65862226
 0.67171921 0.71214815 0.67030829 0.67817967]
0.6812571825785936


In [32]:
Table = insertData(Table, 'Decision Tree Regressor', adj_R2(X_train,Y_train,dt_reg), adj_R2(X_test,Y_test,dt_reg), [{'max_depth=8, min_samples_leaf=10, min_samples_split=28'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."


## Random Forest Regressor

In [33]:
HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
                   'max_depth':[8,10,12,14,16,18],
                   'min_samples_leaf':[2,3,4,5,6,7,8,9,10],
                   'max_features':["auto", "sqrt", "log2"]}

In [34]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

random_forest_reg = RandomizedSearchCV(RandomForestRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

random_forest_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 3, 4, 5, 6, 7,
                                                             8, 9, 10],
                                        'min_samples_split': [2, 4, 6, 8, 10,
                                                              12, 14, 16, 18,
                                                              20, 22, 24, 26,
                                                              28, 30]},
                   random_state=42, verbose=3)

In [35]:
print(random_forest_reg.best_score_)
print(random_forest_reg.best_params_)

0.6957563026537532
{'min_samples_split': 24, 'min_samples_leaf': 5, 'max_features': 'auto', 'max_depth': 8}


In [36]:
rf_reg = DecisionTreeRegressor(min_samples_split=24, min_samples_leaf=5, max_features='auto', max_depth=8)
rf_reg.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=8, max_features='auto', min_samples_leaf=5,
                      min_samples_split=24)

In [37]:
print("r2 for train data: ",rf_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,rf_reg),'\n')

print("r2 for test data: ",rf_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,rf_reg))

r2 for train data:  0.7229843930081478
adjusted_r2 for train data:  0.7215547931490037 

r2 for test data:  0.6968633654605793
adjusted_r2 for test data:  0.6905063958926465


In [38]:
## Cross Validation
scores_lm = cross_val_score(rf_reg, X = X_train, y = Y_train, cv = 10, scoring='r2')

print(scores_lm)
print(scores_lm.mean())

[0.67448713 0.70603702 0.68864626 0.65867054 0.68649376 0.6607357
 0.67426436 0.71101036 0.66710333 0.6757814 ]
0.6803229858467533


In [39]:
Table = insertData(Table,'Random Forest Regressor',adj_R2(X_train,Y_train,rf_reg),adj_R2(X_test,Y_test,rf_reg),[{'min_samples_split=24, min_samples_leaf=5, max_features=auto, max_depth=8'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."


## KNN Regressor

In [43]:
HyperParameters = {'weights':['uniform','distance'],
                   'algorithm':['auto','ball_tree','kd_tree','brute'],
                   'leaf_size':[20,25,30,35,40,45,50]}

# HyperParameters = {'weights':['uniform','distance'],
#                    'algorithm':['ball_tree','kd_tree'],
#                    'leaf_size':[20,25,30,35,40,45,50]}

In [44]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import RandomizedSearchCV

knn_reg = RandomizedSearchCV(KNeighborsRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

knn_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 56 candidates, totalling 280 fits


RandomizedSearchCV(cv=5, estimator=KNeighborsRegressor(), n_iter=100, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'leaf_size': [20, 25, 30, 35, 40, 45,
                                                      50],
                                        'weights': ['uniform', 'distance']},
                   random_state=42, verbose=3)

In [45]:
print(knn_reg.best_score_)
print(knn_reg.best_params_)

0.5771266402300765
{'weights': 'uniform', 'leaf_size': 20, 'algorithm': 'auto'}


In [46]:
knn_reg = KNeighborsRegressor(weights='uniform', algorithm='auto',leaf_size=20)
knn_reg.fit(X_train,Y_train)

KNeighborsRegressor(leaf_size=20)

In [47]:
print("r2 for train data: ",knn_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_reg),'\n')

print("r2 for test data: ",knn_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_reg))

r2 for train data:  0.7285569158294909
adjusted_r2 for train data:  0.7271560741978236 

r2 for test data:  0.6168388439635797
adjusted_r2 for test data:  0.6088037088759375


In [48]:
Table = insertData(Table,'KNN Regressor',adj_R2(X_train,Y_train,knn_reg),adj_R2(X_test,Y_test,knn_reg),[{'weights=uniform, algorithm=auto, leaf_size=20'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"


## Bagging Regressor

In [49]:
# Linear Regression
from sklearn.ensemble import BaggingRegressor

lr_bagging = BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
lr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.7s finished


BaggingRegressor(base_estimator=LinearRegression(), n_estimators=100, n_jobs=-1,
                 verbose=2)

In [50]:
print("r2 for train data: ",lr_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,lr_bagging),'\n')

print("r2 for test data: ",lr_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,lr_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.685805062601473


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


adjusted_r2 for train data:  0.684183590644978 

r2 for test data:  0.7019648710447843
adjusted_r2 for test data:  0.6957148833195401


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


In [51]:
Table = insertData(Table,'Bag Linear Regression',adj_R2(X_train,Y_train,lr_bagging),adj_R2(X_test,Y_test,lr_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.0s finished


Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.684184,0.695715,default


In [52]:
# Decision Tree Regressor
from sklearn.ensemble import BaggingRegressor

dt_bagging = BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=100, bootstrap=True,verbose=2, n_jobs=-1)
dt_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    1.6s finished


BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=100,
                 n_jobs=-1, verbose=2)

In [53]:
print("r2 for train data: ",dt_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,dt_bagging),'\n')

print("r2 for test data: ",dt_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,dt_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.9521944676710068


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


adjusted_r2 for train data:  0.9519477567256345 



[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for test data:  0.6699072089184618
adjusted_r2 for test data:  0.6629849514661827


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.2s finished


In [54]:
Table = insertData(Table,'Bag Decision Tree Regressor',adj_R2(X_train,Y_train,dt_bagging),adj_R2(X_test,Y_test,dt_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.3s finished


Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.684184,0.695715,default
7,Bag Decision Tree Regressor,0.951948,0.662985,default


In [116]:
from sklearn.svm import SVR

In [117]:
# # Decision Tree Regressor
# from sklearn.ensemble import BaggingRegressor

# svr_bagging = BaggingRegressor(base_estimator=SVR(), n_estimators=100, bootstrap=True,verbose=2)
# svr_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Building estimator 1 of 100 for this parallel run (total 100)...
Building estimator 2 of 100 for this parallel run (total 100)...
Building estimator 3 of 100 for this parallel run (total 100)...
Building estimator 4 of 100 for this parallel run (total 100)...
Building estimator 5 of 100 for this parallel run (total 100)...
Building estimator 6 of 100 for this parallel run (total 100)...
Building estimator 7 of 100 for this parallel run (total 100)...
Building estimator 8 of 100 for this parallel run (total 100)...
Building estimator 9 of 100 for this parallel run (total 100)...
Building estimator 10 of 100 for this parallel run (total 100)...
Building estimator 11 of 100 for this parallel run (total 100)...
Building estimator 12 of 100 for this parallel run (total 100)...
Building estimator 13 of 100 for this parallel run (total 100)...
Building estimator 14 of 100 for this parallel run (total 100)...
Building estimator 15 of 100 for this parallel run (total 100)...
Building estimator 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.3min finished


BaggingRegressor(base_estimator=SVR(), n_estimators=100, verbose=2)

In [118]:
# print("r2 for train data: ",svr_bagging.score(X_train,Y_train))
# print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,svr_bagging),'\n')

# print("r2 for test data: ",svr_bagging.score(X_test,Y_test))
# print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,svr_bagging))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.7min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for train data:  -0.007774727114581603


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


adjusted_r2 for train data:  -0.01118638721520493 



[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


r2 for test data:  0.01819860776469384
adjusted_r2 for test data:  0.0047652752118014785


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.5min finished


In [55]:
# KNN Regressor
from sklearn.ensemble import BaggingRegressor

knn_bagging = BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100, bootstrap=True,verbose=2,n_jobs=-1)
knn_bagging.fit(X_train,Y_train)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:    0.9s finished


BaggingRegressor(base_estimator=KNeighborsRegressor(), n_estimators=100,
                 n_jobs=-1, verbose=2)

In [56]:
print("r2 for train data: ",knn_bagging.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,knn_bagging),'\n')

print("r2 for test data: ",knn_bagging.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,knn_bagging))

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   47.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


r2 for train data:  0.7437224772745866


[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   49.8s finished


adjusted_r2 for train data:  0.7423999008523823 



[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.5s finished


r2 for test data:  0.6260578478095875


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.4s finished


adjusted_r2 for test data:  0.6182160411429222


In [57]:
Table = insertData(Table,'Bag KNN Regressor',adj_R2(X_train,Y_train,knn_bagging),adj_R2(X_test,Y_test,knn_bagging))
Table

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   48.3s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   13.0s finished


Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.684184,0.695715,default
7,Bag Decision Tree Regressor,0.951948,0.662985,default
8,Bag KNN Regressor,0.7424,0.618216,default


## Gradient Boosting

In [58]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(random_state=42)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(random_state=42)

In [59]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.720284140491275
adjusted_r2 for train data:  0.7188406053861724 

r2 for test data:  0.7125091641642904
adjusted_r2 for test data:  0.7064802970257346


In [60]:
Table = insertData(Table,'Gradient Boosting',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg))
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.684184,0.695715,default
7,Bag Decision Tree Regressor,0.951948,0.662985,default
8,Bag KNN Regressor,0.7424,0.618216,default
9,Gradient Boosting,0.718841,0.70648,default


In [61]:
HyperParameters = {'learning_rate':[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
                   'n_estimators':[50, 100, 150, 200, 250, 300, 350, 400, 450, 500],
                   'max_leaf_nodes':[2, 5, 10, 20, 50, 100],
                   'max_depth':[8,10,12,14,16,18]
                  }

# HyperParameters = {'min_samples_split':[2,4,6,8,10,12,14,16,18,20,22,24,26,28,30],
#                    'max_depth':[8,10,12,14,16,18],
#                    }

# param_distributions = {
#     "n_estimators": [1, 2, 5, 10, 20, 50, 100, 200, 500],
#     "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
#     "learning_rate": loguniform(0.01, 1),
# }

In [62]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

gradient_boost_reg = RandomizedSearchCV(GradientBoostingRegressor(random_state=0), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

gradient_boost_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=0),
                   n_iter=100, n_jobs=-1,
                   param_distributions={'learning_rate': [0.1, 0.2, 0.3, 0.4,
                                                          0.5, 0.6, 0.7, 0.8,
                                                          0.9],
                                        'max_depth': [8, 10, 12, 14, 16, 18],
                                        'max_leaf_nodes': [2, 5, 10, 20, 50,
                                                           100],
                                        'n_estimators': [50, 100, 150, 200, 250,
                                                         300, 350, 400, 450,
                                                         500]},
                   random_state=42, verbose=3)

In [63]:
print(gradient_boost_reg.best_score_)
print(gradient_boost_reg.best_params_)

0.6953966130715141
{'n_estimators': 150, 'max_leaf_nodes': 5, 'max_depth': 14, 'learning_rate': 0.1}


In [64]:
from sklearn.ensemble import GradientBoostingRegressor

gradient_boost_reg = GradientBoostingRegressor(n_estimators=150, max_leaf_nodes=5, max_depth=14, learning_rate=0.1)
gradient_boost_reg.fit(X_train,Y_train)

GradientBoostingRegressor(max_depth=14, max_leaf_nodes=5, n_estimators=150)

In [65]:
print("r2 for train data: ",gradient_boost_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,gradient_boost_reg),'\n')

print("r2 for test data: ",gradient_boost_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,gradient_boost_reg))

r2 for train data:  0.717698904705356
adjusted_r2 for train data:  0.7162420279233872 

r2 for test data:  0.7131481659604437
adjusted_r2 for test data:  0.7071326990992188


In [66]:
Table = insertData(Table,'Gradient Boosting',adj_R2(X_train,Y_train,gradient_boost_reg),adj_R2(X_test,Y_test,gradient_boost_reg),[{'n_estimators=150, max_leaf_nodes=5, max_depth=14, learning_rate=0.2'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.684184,0.695715,default
7,Bag Decision Tree Regressor,0.951948,0.662985,default
8,Bag KNN Regressor,0.7424,0.618216,default
9,Gradient Boosting,0.718841,0.70648,default


## XGBRegressor

In [67]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 10, seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=10, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [68]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.715825187327479
adjusted_r2 for train data:  0.7143586408156037 

r2 for test data:  0.705057269935518
adjusted_r2 for test data:  0.698872131797557


In [69]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg))
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.684184,0.695715,default
7,Bag Decision Tree Regressor,0.951948,0.662985,default
8,Bag KNN Regressor,0.7424,0.618216,default
9,Gradient Boosting,0.718841,0.70648,default


In [70]:
HyperParameters = {'n_estimators': [15,17,18,19,20,22,25,26,27,28,29,30],
                   'booster': ['gbtree'],
                   'max_depth': [3,4,5,6],
                   'min_child_weight': [74,75,78,79,80,81]
                  }

In [71]:
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

XGB_reg = RandomizedSearchCV(XGBRegressor(), param_distributions = HyperParameters, cv = 5,
                            n_iter = 100, n_jobs = -1, verbose = 3, return_train_score=False, random_state = 42)

XGB_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


RandomizedSearchCV(cv=5,
                   estimator=XGBRegressor(base_score=None, booster=None,
                                          colsample_bylevel=None,
                                          colsample_bynode=None,
                                          colsample_bytree=None,
                                          enable_categorical=False, gamma=None,
                                          gpu_id=None, importance_type=None,
                                          interaction_constraints=None,
                                          learning_rate=None,
                                          max_delta_step=None, max_depth=None,
                                          min_child_weight=None, missing=nan,
                                          monotone_constraints=...
                                          predictor=None, random_state=None,
                                          reg_alpha=None, reg_lambda=None,
                                       

In [72]:
print(XGB_reg.best_score_)
print(XGB_reg.best_params_)

0.6961115085701515
{'n_estimators': 20, 'min_child_weight': 75, 'max_depth': 3, 'booster': 'gbtree'}


In [73]:
from xgboost import XGBRegressor

XGB_reg=XGBRegressor(objective ='reg:linear',n_estimators = 20, min_child_weight=75, max_depth=3, booster='gbtree',seed = 42)
XGB_reg.fit(X_train,Y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=3, min_child_weight=75, missing=nan,
             monotone_constraints='()', n_estimators=20, n_jobs=4,
             num_parallel_tree=1, objective='reg:linear', predictor='auto',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=42, subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [74]:
print("r2 for train data: ",XGB_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,XGB_reg),'\n')

print("r2 for test data: ",XGB_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,XGB_reg))

r2 for train data:  0.7094794621365137
adjusted_r2 for train data:  0.7079801671165753 

r2 for test data:  0.7111791130801197
adjusted_r2 for test data:  0.7051223539176297


In [75]:
Table = insertData(Table,'XGBRegressor',adj_R2(X_train,Y_train,XGB_reg),adj_R2(X_test,Y_test,XGB_reg),[{'objective=reg:linear,n_estimators = 20, min_child_weight=75, booster=gbtree, seed = 42'}])
Table

Unnamed: 0,Model_Name,Train_R2,Test_R2,Parameters
0,Linear Regression,0.684209,0.695843,default
1,Lasso Regression,0.683926,0.695736,{alpha=0.0022585545248758956}
2,Ridge Regression,0.684186,0.695814,{alpha = 3.2}
3,Decision Tree Regressor,0.719645,0.694806,"{max_depth=8, min_samples_leaf=10, min_samples..."
4,Random Forest Regressor,0.721555,0.690506,"{min_samples_split=24, min_samples_leaf=5, max..."
5,KNN Regressor,0.727156,0.608804,"{weights=uniform, algorithm=auto, leaf_size=20}"
6,Bag Linear Regression,0.684184,0.695715,default
7,Bag Decision Tree Regressor,0.951948,0.662985,default
8,Bag KNN Regressor,0.7424,0.618216,default
9,Gradient Boosting,0.718841,0.70648,default


## Cat Boosting

In [12]:
from catboost import CatBoostRegressor

cat_reg = CatBoostRegressor()
cat_reg.fit(X_train,Y_train)

Learning rate set to 0.055449
0:	learn: 2.4435235	total: 147ms	remaining: 2m 26s
1:	learn: 2.3663057	total: 150ms	remaining: 1m 14s
2:	learn: 2.2942072	total: 152ms	remaining: 50.5s
3:	learn: 2.2208216	total: 154ms	remaining: 38.4s
4:	learn: 2.1555987	total: 157ms	remaining: 31.2s
5:	learn: 2.0926166	total: 159ms	remaining: 26.3s
6:	learn: 2.0332367	total: 161ms	remaining: 22.9s
7:	learn: 1.9798803	total: 164ms	remaining: 20.3s
8:	learn: 1.9303796	total: 166ms	remaining: 18.3s
9:	learn: 1.8842614	total: 169ms	remaining: 16.7s
10:	learn: 1.8418591	total: 172ms	remaining: 15.4s
11:	learn: 1.8034550	total: 175ms	remaining: 14.4s
12:	learn: 1.7700293	total: 178ms	remaining: 13.5s
13:	learn: 1.7425673	total: 180ms	remaining: 12.7s
14:	learn: 1.7101794	total: 184ms	remaining: 12.1s
15:	learn: 1.6843277	total: 187ms	remaining: 11.5s
16:	learn: 1.6577253	total: 189ms	remaining: 10.9s
17:	learn: 1.6378388	total: 192ms	remaining: 10.4s
18:	learn: 1.6191444	total: 195ms	remaining: 10s
19:	learn: 

217:	learn: 1.3229912	total: 686ms	remaining: 2.46s
218:	learn: 1.3226256	total: 689ms	remaining: 2.46s
219:	learn: 1.3223682	total: 692ms	remaining: 2.45s
220:	learn: 1.3219169	total: 694ms	remaining: 2.45s
221:	learn: 1.3216373	total: 697ms	remaining: 2.44s
222:	learn: 1.3213345	total: 700ms	remaining: 2.44s
223:	learn: 1.3210815	total: 703ms	remaining: 2.43s
224:	learn: 1.3208904	total: 705ms	remaining: 2.43s
225:	learn: 1.3206143	total: 708ms	remaining: 2.43s
226:	learn: 1.3204287	total: 711ms	remaining: 2.42s
227:	learn: 1.3198990	total: 715ms	remaining: 2.42s
228:	learn: 1.3194387	total: 717ms	remaining: 2.41s
229:	learn: 1.3191243	total: 719ms	remaining: 2.41s
230:	learn: 1.3189149	total: 722ms	remaining: 2.4s
231:	learn: 1.3186013	total: 724ms	remaining: 2.4s
232:	learn: 1.3183737	total: 726ms	remaining: 2.39s
233:	learn: 1.3180491	total: 729ms	remaining: 2.38s
234:	learn: 1.3176612	total: 731ms	remaining: 2.38s
235:	learn: 1.3174567	total: 734ms	remaining: 2.38s
236:	learn: 1.

424:	learn: 1.2648277	total: 1.21s	remaining: 1.63s
425:	learn: 1.2646611	total: 1.21s	remaining: 1.63s
426:	learn: 1.2643859	total: 1.21s	remaining: 1.63s
427:	learn: 1.2642562	total: 1.21s	remaining: 1.62s
428:	learn: 1.2638438	total: 1.22s	remaining: 1.62s
429:	learn: 1.2634695	total: 1.22s	remaining: 1.62s
430:	learn: 1.2632638	total: 1.22s	remaining: 1.61s
431:	learn: 1.2630134	total: 1.23s	remaining: 1.61s
432:	learn: 1.2625851	total: 1.23s	remaining: 1.61s
433:	learn: 1.2622702	total: 1.23s	remaining: 1.6s
434:	learn: 1.2619620	total: 1.23s	remaining: 1.6s
435:	learn: 1.2617290	total: 1.24s	remaining: 1.6s
436:	learn: 1.2614431	total: 1.24s	remaining: 1.6s
437:	learn: 1.2611291	total: 1.24s	remaining: 1.59s
438:	learn: 1.2608949	total: 1.24s	remaining: 1.59s
439:	learn: 1.2607107	total: 1.25s	remaining: 1.58s
440:	learn: 1.2605039	total: 1.25s	remaining: 1.58s
441:	learn: 1.2602453	total: 1.25s	remaining: 1.58s
442:	learn: 1.2599830	total: 1.25s	remaining: 1.58s
443:	learn: 1.25

635:	learn: 1.2166088	total: 1.72s	remaining: 986ms
636:	learn: 1.2164485	total: 1.73s	remaining: 984ms
637:	learn: 1.2161743	total: 1.73s	remaining: 981ms
638:	learn: 1.2159032	total: 1.73s	remaining: 978ms
639:	learn: 1.2156615	total: 1.73s	remaining: 976ms
640:	learn: 1.2154702	total: 1.74s	remaining: 973ms
641:	learn: 1.2153000	total: 1.74s	remaining: 970ms
642:	learn: 1.2150494	total: 1.74s	remaining: 968ms
643:	learn: 1.2147737	total: 1.75s	remaining: 965ms
644:	learn: 1.2145762	total: 1.75s	remaining: 962ms
645:	learn: 1.2142534	total: 1.75s	remaining: 959ms
646:	learn: 1.2141199	total: 1.75s	remaining: 957ms
647:	learn: 1.2140099	total: 1.75s	remaining: 954ms
648:	learn: 1.2137385	total: 1.76s	remaining: 951ms
649:	learn: 1.2134946	total: 1.76s	remaining: 948ms
650:	learn: 1.2133595	total: 1.76s	remaining: 945ms
651:	learn: 1.2131835	total: 1.76s	remaining: 942ms
652:	learn: 1.2130096	total: 1.77s	remaining: 939ms
653:	learn: 1.2127380	total: 1.77s	remaining: 937ms
654:	learn: 

846:	learn: 1.1762787	total: 2.24s	remaining: 405ms
847:	learn: 1.1760818	total: 2.25s	remaining: 403ms
848:	learn: 1.1759243	total: 2.25s	remaining: 400ms
849:	learn: 1.1756982	total: 2.25s	remaining: 397ms
850:	learn: 1.1756285	total: 2.25s	remaining: 395ms
851:	learn: 1.1753658	total: 2.26s	remaining: 392ms
852:	learn: 1.1752456	total: 2.26s	remaining: 389ms
853:	learn: 1.1750339	total: 2.26s	remaining: 387ms
854:	learn: 1.1747939	total: 2.27s	remaining: 384ms
855:	learn: 1.1746314	total: 2.27s	remaining: 382ms
856:	learn: 1.1745038	total: 2.27s	remaining: 379ms
857:	learn: 1.1744364	total: 2.27s	remaining: 376ms
858:	learn: 1.1742864	total: 2.27s	remaining: 374ms
859:	learn: 1.1739928	total: 2.28s	remaining: 371ms
860:	learn: 1.1738201	total: 2.28s	remaining: 368ms
861:	learn: 1.1736314	total: 2.28s	remaining: 365ms
862:	learn: 1.1734956	total: 2.29s	remaining: 363ms
863:	learn: 1.1733016	total: 2.29s	remaining: 360ms
864:	learn: 1.1731271	total: 2.29s	remaining: 357ms
865:	learn: 

<catboost.core.CatBoostRegressor at 0x13b3e36f7c0>

In [13]:
print("r2 for train data: ",cat_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,cat_reg),'\n')

print("r2 for test data: ",cat_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,cat_reg))

NameError: name 'cb_reg' is not defined

In [14]:
HyperParameters = {'depth':[2,3,4,5,6,7],
                   'iterations':[145,150,155,156,157],
                   'learning_rate':[0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6]}

In [15]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

cat_reg = RandomizedSearchCV(CatBoostRegressor(), param_distributions=HyperParameters, n_iter=100, cv=5, n_jobs=-1, verbose=2, 
                            return_train_score=False)

cat_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
0:	learn: 2.4683528	total: 1.82ms	remaining: 263ms
1:	learn: 2.4095951	total: 3.83ms	remaining: 274ms
2:	learn: 2.3559747	total: 5.6ms	remaining: 265ms
3:	learn: 2.3079886	total: 7.32ms	remaining: 258ms
4:	learn: 2.2594982	total: 9.12ms	remaining: 255ms
5:	learn: 2.2180626	total: 11ms	remaining: 256ms
6:	learn: 2.1759596	total: 13ms	remaining: 256ms
7:	learn: 2.1389365	total: 15.1ms	remaining: 258ms
8:	learn: 2.1024513	total: 17.3ms	remaining: 261ms
9:	learn: 2.0489459	total: 19.2ms	remaining: 260ms
10:	learn: 2.0174997	total: 21.1ms	remaining: 258ms
11:	learn: 1.9891718	total: 23.1ms	remaining: 256ms
12:	learn: 1.9622496	total: 25.1ms	remaining: 254ms
13:	learn: 1.9405814	total: 27ms	remaining: 253ms
14:	learn: 1.9152994	total: 29.1ms	remaining: 252ms
15:	learn: 1.8920844	total: 31ms	remaining: 250ms
16:	learn: 1.8713261	total: 33.2ms	remaining: 250ms
17:	learn: 1.8533299	total: 35.2ms	remaining: 248ms
18:	learn: 1.8179946

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostRegressor object at 0x0000013B3F3ABF70>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'depth': [2, 3, 4, 5, 6, 7],
                                        'iterations': [145, 150, 155, 156, 157],
                                        'learning_rate': [0.01, 0.05, 0.1, 0.2,
                                                          0.3, 0.4, 0.5, 0.6]},
                   verbose=2)

In [17]:
print(cat_reg.best_score_)
print(cat_reg.best_params_)

0.7003711966985162
{'learning_rate': 0.05, 'iterations': 145, 'depth': 4}


In [18]:
from catboost import CatBoostRegressor

cat_reg = CatBoostRegressor(learning_rate=0.05, iterations=145, depth=4)
cat_reg.fit(X_train,Y_train)

0:	learn: 2.4683528	total: 5.94ms	remaining: 855ms
1:	learn: 2.4095951	total: 9.67ms	remaining: 691ms
2:	learn: 2.3559747	total: 12.5ms	remaining: 593ms
3:	learn: 2.3079886	total: 15.1ms	remaining: 531ms
4:	learn: 2.2594982	total: 17.3ms	remaining: 484ms
5:	learn: 2.2180626	total: 19.3ms	remaining: 448ms
6:	learn: 2.1759596	total: 21.5ms	remaining: 423ms
7:	learn: 2.1389365	total: 23.3ms	remaining: 400ms
8:	learn: 2.1024513	total: 25.1ms	remaining: 379ms
9:	learn: 2.0489459	total: 26.6ms	remaining: 360ms
10:	learn: 2.0174997	total: 28.2ms	remaining: 344ms
11:	learn: 1.9891718	total: 29.7ms	remaining: 330ms
12:	learn: 1.9622496	total: 31.3ms	remaining: 318ms
13:	learn: 1.9405814	total: 33ms	remaining: 309ms
14:	learn: 1.9152994	total: 34.6ms	remaining: 300ms
15:	learn: 1.8920844	total: 36.3ms	remaining: 293ms
16:	learn: 1.8713261	total: 37.9ms	remaining: 285ms
17:	learn: 1.8533299	total: 39.4ms	remaining: 278ms
18:	learn: 1.8179946	total: 41ms	remaining: 272ms
19:	learn: 1.7996226	total

<catboost.core.CatBoostRegressor at 0x13b3f4dea30>

In [19]:
print("r2 for train data: ",cat_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,cat_reg),'\n')

print("r2 for test data: ",cat_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,cat_reg))

r2 for train data:  0.7080290092137664
adjusted_r2 for train data:  0.7065222288130708 

r2 for test data:  0.7171759039615052
adjusted_r2 for test data:  0.7112449013483553


# -----------------------------------------------------------------------------------------------------------------

In [13]:
df = pd.read_csv('Data_Set/preprocessed_data_cat_boost.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age
0,FDA15,9.3,Low Fat,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,11.794838,23
1,DRC01,5.92,Regular,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,6.223654,13
2,FDN15,17.5,Low Fat,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,9.919649,23
3,FDX07,19.2,Regular,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,7.234707,24
4,NCD19,8.93,Low Fat,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,7.930641,35


In [14]:
## Drop Item_Identifier, Outlet_Establishment_Year

df.drop(columns=['Outlet_Establishment_Year','Item_Identifier'],axis = 1,inplace=True)
df.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,Outlet_Age
0,9.3,Low Fat,Dairy,249.8092,OUT049,Medium,Tier 1,Supermarket Type1,11.794838,23
1,5.92,Regular,Soft Drinks,48.2692,OUT018,Medium,Tier 3,Supermarket Type2,6.223654,13
2,17.5,Low Fat,Meat,141.618,OUT049,Medium,Tier 1,Supermarket Type1,9.919649,23
3,19.2,Regular,Fruits and Vegetables,182.095,OUT010,Small,Tier 3,Grocery Store,7.234707,24
4,8.93,Low Fat,Household,53.8614,OUT013,High,Tier 3,Supermarket Type1,7.930641,35


In [16]:
X = df.drop('Item_Outlet_Sales', axis = 1)
Y = df['Item_Outlet_Sales']

In [17]:
# Label encoding of categorical variables
cat_var = ['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']

for i in cat_var:
    X[i] = X[i].astype('category').cat.codes

In [18]:
X.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Size,Outlet_Location_Type,Outlet_Type,Outlet_Age
0,9.3,1,4,249.8092,9,1,0,1,23
1,5.92,2,14,48.2692,3,1,2,2,13
2,17.5,1,10,141.618,9,1,0,1,23
3,19.2,2,6,182.095,0,2,2,0,24
4,8.93,1,9,53.8614,1,0,2,1,35


In [19]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state = 42)
print(X_train.shape,X_test.shape,Y_train.shape,Y_test.shape)

(6818, 9) (1705, 9) (6818,) (1705,)


In [23]:
HyperParameters = {'depth':[2,3,4,5,6,7],
                   'iterations':[145,150,155,156,157],
                   'learning_rate':[0.05,0.045,0.054,0.055,0.056,0.057,0.058]}

In [21]:
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

cb_reg = RandomizedSearchCV(CatBoostRegressor(), param_distributions=HyperParameters, n_iter=100, cv=5, n_jobs=-1, verbose=2, 
                            return_train_score=False)

cb_reg.fit(X_train, Y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
0:	learn: 2.4475039	total: 146ms	remaining: 22.7s
1:	learn: 2.3690989	total: 150ms	remaining: 11.5s
2:	learn: 2.2962242	total: 154ms	remaining: 7.84s
3:	learn: 2.2342050	total: 157ms	remaining: 5.97s
4:	learn: 2.1713117	total: 160ms	remaining: 4.84s
5:	learn: 2.1136857	total: 163ms	remaining: 4.09s
6:	learn: 2.0570778	total: 167ms	remaining: 3.55s
7:	learn: 2.0063955	total: 170ms	remaining: 3.14s
8:	learn: 1.9650678	total: 173ms	remaining: 2.83s
9:	learn: 1.9221352	total: 177ms	remaining: 2.58s
10:	learn: 1.8804031	total: 181ms	remaining: 2.38s
11:	learn: 1.8420562	total: 184ms	remaining: 2.21s
12:	learn: 1.8084880	total: 187ms	remaining: 2.06s
13:	learn: 1.7751836	total: 190ms	remaining: 1.93s
14:	learn: 1.7457418	total: 193ms	remaining: 1.82s
15:	learn: 1.7183254	total: 196ms	remaining: 1.71s
16:	learn: 1.6931998	total: 198ms	remaining: 1.62s
17:	learn: 1.6701537	total: 200ms	remaining: 1.53s
18:	learn: 1.6477054	total: 2

RandomizedSearchCV(cv=5,
                   estimator=<catboost.core.CatBoostRegressor object at 0x00000243AA8193D0>,
                   n_iter=100, n_jobs=-1,
                   param_distributions={'depth': [2, 3, 4, 5, 6, 7],
                                        'iterations': [145, 150, 155, 156, 157],
                                        'learning_rate': [0.05, 0.045, 0.054,
                                                          0.055, 0.056, 0.057,
                                                          0.058]},
                   verbose=2)

In [22]:
print(cb_reg.best_score_)
print(cb_reg.best_params_)

0.7016879255090724
{'learning_rate': 0.056, 'iterations': 156, 'depth': 3}


In [27]:
print("r2 for train data: ",cb_reg.score(X_train,Y_train))
print("adjusted_r2 for train data: ",adj_R2(X_train,Y_train,cb_reg),'\n')

print("r2 for test data: ",cb_reg.score(X_test,Y_test))
print("adjusted_r2 for test data: ",adj_R2(X_test,Y_test,cb_reg))

r2 for train data:  0.7078826865542035
adjusted_r2 for train data:  0.7074965150176271 

r2 for test data:  0.7180505575260054
adjusted_r2 for test data:  0.7165534808403028
