##Importing Dataset to colab folder

Upload the dataset in colab folder

In [1]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


##**Reading Dataset**

###Importing Packages

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import sklearn

import warnings
warnings.filterwarnings('ignore')

###Reading data set



1.   Loading dataset using pandas
2.   View them

In [4]:
df_train = pd.read_csv('/content/drive/MyDrive/My File/modified_train.csv')

In [5]:
df_train.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,New_Item_Type
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138,Food
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228,Drinks
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27,Food
3,FDX07,19.2,Regular,0.066132,Fruits and Vegetables,182.095,OUT010,1998,Small,Tier 3,Grocery Store,732.38,Food
4,NCD19,8.93,Non-Edible,0.066132,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052,Non-Consumable


In [None]:
df_test = pd.read_csv('/content/modified_test.csv')

In [None]:
df_test.head()

## **Model building**

### Test & Train Split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_df, test_df = train_test_split(df_train, test_size=0.2, random_state=42)

In [9]:
print('train_df.shape :', train_df.shape)
print('test_df.shape :', test_df.shape)

train_df.shape : (6818, 13)
test_df.shape : (1705, 13)


### Finding Target and Input Columns

In [10]:
train_df.head(2)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales,New_Item_Type
549,FDW44,9.5,Regular,0.035206,Fruits and Vegetables,171.3448,OUT049,1999,Medium,Tier 1,Supermarket Type1,2386.2272,Food
7757,NCF54,18.0,Non-Edible,0.047473,Household,170.5422,OUT045,2002,Small,Tier 2,Supermarket Type1,3103.9596,Non-Consumable


In [11]:
train_df.iloc[:,[1,2,3,5,6,7,8,9,10,12]]

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,New_Item_Type
549,9.500,Regular,0.035206,171.3448,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food
7757,18.000,Non-Edible,0.047473,170.5422,OUT045,2002,Small,Tier 2,Supermarket Type1,Non-Consumable
764,17.600,Regular,0.076122,111.7202,OUT046,1997,Small,Tier 1,Supermarket Type1,Food
6867,8.325,Low Fat,0.029845,41.6138,OUT045,2002,Small,Tier 2,Supermarket Type1,Food
2716,12.850,Low Fat,0.137228,155.5630,OUT046,1997,Small,Tier 1,Supermarket Type1,Food
...,...,...,...,...,...,...,...,...,...,...
5734,9.395,Regular,0.286345,139.1838,OUT010,1998,Small,Tier 3,Grocery Store,Food
5191,15.600,Low Fat,0.117575,75.6670,OUT017,2007,Small,Tier 2,Supermarket Type1,Food
5390,17.600,Non-Edible,0.018944,237.3590,OUT045,2002,Small,Tier 2,Supermarket Type1,Non-Consumable
860,20.350,Low Fat,0.054363,117.9466,OUT017,2007,Small,Tier 2,Supermarket Type1,Food


In [12]:
X_column=  list(train_df.iloc[:,[1,2,3,5,6,7,8,9,10,12]])
y_column = 'Item_Outlet_Sales'

In [13]:
X_column

['Item_Weight',
 'Item_Fat_Content',
 'Item_Visibility',
 'Item_MRP',
 'Outlet_Identifier',
 'Outlet_Establishment_Year',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type',
 'New_Item_Type']

In [14]:
y_column

'Item_Outlet_Sales'

We can now create a copy of inputs and targets for the training and test sets for further processing and model training without disturbing the original data.

In [15]:
X_train = train_df[X_column].copy()
y_train = train_df[y_column].copy()

In [16]:
X_test = test_df[X_column].copy()
y_test = test_df[y_column].copy()

In [17]:
X_train.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,New_Item_Type
549,9.5,Regular,0.035206,171.3448,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food
7757,18.0,Non-Edible,0.047473,170.5422,OUT045,2002,Small,Tier 2,Supermarket Type1,Non-Consumable
764,17.6,Regular,0.076122,111.7202,OUT046,1997,Small,Tier 1,Supermarket Type1,Food
6867,8.325,Low Fat,0.029845,41.6138,OUT045,2002,Small,Tier 2,Supermarket Type1,Food
2716,12.85,Low Fat,0.137228,155.563,OUT046,1997,Small,Tier 1,Supermarket Type1,Food


In [18]:
y_train.head()

549     2386.2272
7757    3103.9596
764     1125.2020
6867     284.2966
2716    4224.5010
Name: Item_Outlet_Sales, dtype: float64

In [19]:
X_test.head()

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,New_Item_Type
7503,14.3,Low Fat,0.0263,79.4302,OUT013,1987,High,Tier 3,Supermarket Type1,Food
2957,7.93,Non-Edible,0.071136,42.7086,OUT046,1997,Small,Tier 1,Supermarket Type1,Non-Consumable
7031,14.5,Regular,0.041313,42.0454,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food
1084,13.0,Regular,0.044767,173.7054,OUT027,1985,Medium,Tier 3,Supermarket Type3,Drinks
856,10.195,Regular,0.012456,197.511,OUT035,2004,Small,Tier 2,Supermarket Type1,Food


In [20]:
y_test.head()

7503    1743.0644
2957     356.8688
7031     377.5086
1084    5778.4782
856     2356.9320
Name: Item_Outlet_Sales, dtype: float64

### Numerical / Categorical data

In [21]:
X_train.head(2)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,New_Item_Type
549,9.5,Regular,0.035206,171.3448,OUT049,1999,Medium,Tier 1,Supermarket Type1,Food
7757,18.0,Non-Edible,0.047473,170.5422,OUT045,2002,Small,Tier 2,Supermarket Type1,Non-Consumable


In [22]:
numeric_columns = X_train.select_dtypes(include=np.number).columns.tolist()

In [23]:
numeric_columns

['Item_Weight', 'Item_Visibility', 'Item_MRP', 'Outlet_Establishment_Year']

In [24]:
categorical_columns = X_train.select_dtypes(include='object').columns.tolist()

In [25]:
categorical_columns

['Item_Fat_Content',
 'Outlet_Identifier',
 'Outlet_Size',
 'Outlet_Location_Type',
 'Outlet_Type',
 'New_Item_Type']

In [26]:
X_train[numeric_columns].describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
count,6818.0,6818.0,6818.0,6818.0
mean,12.907901,0.070239,141.905134,1997.85685
std,4.644588,0.04852,62.547789,8.395692
min,4.555,0.003575,31.29,1985.0
25%,8.895,0.033084,94.1752,1987.0
50%,12.65,0.063021,144.2628,1999.0
75%,17.0,0.094791,186.8556,2004.0
max,21.35,0.328391,266.8884,2009.0


#### Imputing numeric columns

In [27]:
from sklearn.impute import SimpleImputer

In [28]:
imputer = SimpleImputer(strategy='median')

In [29]:
imputer.fit(df_train[numeric_columns])

SimpleImputer(strategy='median')

In [30]:
X_train[numeric_columns] = imputer.transform(X_train[numeric_columns])
X_test[numeric_columns] = imputer.transform(X_test[numeric_columns])

In [31]:
X_test[numeric_columns].isna().sum()

Item_Weight                  0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
dtype: int64

In [32]:
X_train[numeric_columns].isna().sum()

Item_Weight                  0
Item_Visibility              0
Item_MRP                     0
Outlet_Establishment_Year    0
dtype: int64

### Scaling numeric values

Let's use MinMaxScaler from sklearn.preprocessing to scale values to the  (0,1)  range

In [33]:
from sklearn.preprocessing import MinMaxScaler

In [34]:
scaler = MinMaxScaler()

In [35]:
scaler.fit(df_train[numeric_columns])

MinMaxScaler()

In [36]:
X_train[numeric_columns] = scaler.transform(X_train[numeric_columns])
X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])

In [37]:
X_train[numeric_columns].describe()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year
count,6818.0,6818.0,6818.0,6818.0
mean,0.497345,0.205237,0.469507,0.535702
std,0.276546,0.149376,0.265485,0.349821
min,0.0,0.0,0.0,0.0
25%,0.25841,0.090849,0.266917,0.083333
50%,0.481989,0.183015,0.479514,0.583333
75%,0.740994,0.280823,0.6603,0.791667
max,1.0,1.0,1.0,1.0


In [38]:
X_train.head(2)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,New_Item_Type
549,0.294433,Regular,0.097382,0.594464,OUT049,0.583333,Medium,Tier 1,Supermarket Type1,Food
7757,0.800536,Non-Edible,0.135149,0.591057,OUT045,0.708333,Small,Tier 2,Supermarket Type1,Non-Consumable


We have scaled the values

### Encoding categorical values

In [39]:
X_train[categorical_columns].nunique()

Item_Fat_Content         3
Outlet_Identifier       10
Outlet_Size              3
Outlet_Location_Type     3
Outlet_Type              4
New_Item_Type            3
dtype: int64

In [40]:
from sklearn.preprocessing import OneHotEncoder

In [41]:
encoder = OneHotEncoder(sparse=False,handle_unknown='ignore')

In [42]:
df = df_train[categorical_columns]

In [43]:
encoder.fit(df)

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [44]:
encoder.categories_

[array(['Low Fat', 'Non-Edible', 'Regular'], dtype=object),
 array(['OUT010', 'OUT013', 'OUT017', 'OUT018', 'OUT019', 'OUT027',
        'OUT035', 'OUT045', 'OUT046', 'OUT049'], dtype=object),
 array(['High', 'Medium', 'Small'], dtype=object),
 array(['Tier 1', 'Tier 2', 'Tier 3'], dtype=object),
 array(['Grocery Store', 'Supermarket Type1', 'Supermarket Type2',
        'Supermarket Type3'], dtype=object),
 array(['Drinks', 'Food', 'Non-Consumable'], dtype=object)]

In [45]:
encoded_columns = list(encoder.get_feature_names(categorical_columns))
print(encoded_columns)

['Item_Fat_Content_Low Fat', 'Item_Fat_Content_Non-Edible', 'Item_Fat_Content_Regular', 'Outlet_Identifier_OUT010', 'Outlet_Identifier_OUT013', 'Outlet_Identifier_OUT017', 'Outlet_Identifier_OUT018', 'Outlet_Identifier_OUT019', 'Outlet_Identifier_OUT027', 'Outlet_Identifier_OUT035', 'Outlet_Identifier_OUT045', 'Outlet_Identifier_OUT046', 'Outlet_Identifier_OUT049', 'Outlet_Size_High', 'Outlet_Size_Medium', 'Outlet_Size_Small', 'Outlet_Location_Type_Tier 1', 'Outlet_Location_Type_Tier 2', 'Outlet_Location_Type_Tier 3', 'Outlet_Type_Grocery Store', 'Outlet_Type_Supermarket Type1', 'Outlet_Type_Supermarket Type2', 'Outlet_Type_Supermarket Type3', 'New_Item_Type_Drinks', 'New_Item_Type_Food', 'New_Item_Type_Non-Consumable']


In [46]:
len(encoded_columns)

26

In [47]:
X_train[encoded_columns] = encoder.transform(X_train[categorical_columns])
X_test[encoded_columns] = encoder.transform(X_test[categorical_columns])

In [48]:
X_train.head(2)

Unnamed: 0,Item_Weight,Item_Fat_Content,Item_Visibility,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,New_Item_Type,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,New_Item_Type_Drinks,New_Item_Type_Food,New_Item_Type_Non-Consumable
549,0.294433,Regular,0.097382,0.594464,OUT049,0.583333,Medium,Tier 1,Supermarket Type1,Food,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7757,0.800536,Non-Edible,0.135149,0.591057,OUT045,0.708333,Small,Tier 2,Supermarket Type1,Non-Consumable,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [49]:
X_train[numeric_columns + encoded_columns]

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Item_Fat_Content_Low Fat,Item_Fat_Content_Non-Edible,Item_Fat_Content_Regular,Outlet_Identifier_OUT010,Outlet_Identifier_OUT013,Outlet_Identifier_OUT017,...,Outlet_Location_Type_Tier 1,Outlet_Location_Type_Tier 2,Outlet_Location_Type_Tier 3,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3,New_Item_Type_Drinks,New_Item_Type_Food,New_Item_Type_Non-Consumable
549,0.294433,0.097382,0.594464,0.583333,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
7757,0.800536,0.135149,0.591057,0.708333,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
764,0.776719,0.223348,0.341387,0.500000,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
6867,0.224472,0.080878,0.043819,0.708333,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2716,0.493897,0.411473,0.527478,0.500000,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,0.288181,0.870554,0.457956,0.541667,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5191,0.657636,0.350967,0.188359,0.916667,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5390,0.776719,0.047316,0.874662,0.708333,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
860,0.940458,0.156359,0.367815,0.916667,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [50]:
X_train_df = X_train[numeric_columns + encoded_columns]

In [51]:
X_test_df = X_test[numeric_columns + encoded_columns]

In [52]:
X_train_df.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6818 entries, 549 to 7270
Data columns (total 30 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Item_Weight                    6818 non-null   float64
 1   Item_Visibility                6818 non-null   float64
 2   Item_MRP                       6818 non-null   float64
 3   Outlet_Establishment_Year      6818 non-null   float64
 4   Item_Fat_Content_Low Fat       6818 non-null   float64
 5   Item_Fat_Content_Non-Edible    6818 non-null   float64
 6   Item_Fat_Content_Regular       6818 non-null   float64
 7   Outlet_Identifier_OUT010       6818 non-null   float64
 8   Outlet_Identifier_OUT013       6818 non-null   float64
 9   Outlet_Identifier_OUT017       6818 non-null   float64
 10  Outlet_Identifier_OUT018       6818 non-null   float64
 11  Outlet_Identifier_OUT019       6818 non-null   float64
 12  Outlet_Identifier_OUT027       6818 non-null  

## **Model Training**

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
model_lin = LinearRegression()

In [None]:
model_lin.fit(X_train_df, y_train)

LinearRegression()

In [None]:
y_pred_lin=model_lin.predict(X_test_df)

In [None]:
y_pred_lin

array([1373.95413403,  687.12097622,  858.4071134 , ...,  828.60434808,
        581.21503287, 1720.70502732])

In [None]:
y_test

7503    1743.0644
2957     356.8688
7031     377.5086
1084    5778.4782
856     2356.9320
          ...    
7205    3004.0896
3257     890.8404
6346     629.1810
6318     253.0040
6339     976.7286
Name: Item_Outlet_Sales, Length: 1705, dtype: float64

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred_lin)

0.581089128077704

In [None]:
from sklearn import metrics
print('Accuracy:', round(model_lin.score(X_test_df,y_test)*100,2))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred_lin))
print('MSE:', metrics.mean_squared_error(y_test, y_pred_lin))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_lin)))

Accuracy: 58.11
MAE: 790.54788752432
MSE: 1138587.2104651125
RMSE: 1067.046020781256


### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error,make_scorer

In [None]:
DT = DecisionTreeRegressor()

In [None]:
param = {
    'max_depth':[6,9,12,15],
    'min_samples_leaf':[10,50,100,150]
}

In [None]:
random_search = RandomizedSearchCV(DT,param_distributions=param,n_iter=5,scoring=make_scorer(mean_squared_error),n_jobs=-1,cv=5,verbose=3)

In [None]:
random_search.fit(X_train_df, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'max_depth': [6, 9, 12, 15],
                                        'min_samples_leaf': [10, 50, 100, 150]},
                   scoring=make_scorer(mean_squared_error), verbose=3)

In [None]:
means = random_search.cv_results_['mean_test_score']
params = random_search.cv_results_['params']
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))
    if mean == min(means):
        print('Best parameters with the minimum Mean Square Error are:',param)

1249481.786662 with: {'min_samples_leaf': 150, 'max_depth': 12}
1253587.240535 with: {'min_samples_leaf': 50, 'max_depth': 12}
1449404.382734 with: {'min_samples_leaf': 10, 'max_depth': 12}
1233478.928570 with: {'min_samples_leaf': 10, 'max_depth': 6}
1216915.170973 with: {'min_samples_leaf': 50, 'max_depth': 6}
Best parameters with the minimum Mean Square Error are: {'min_samples_leaf': 50, 'max_depth': 6}


In [None]:
DT = DecisionTreeRegressor(min_samples_leaf=50, max_depth=6)
DT.fit(X_train_df,y_train)

DecisionTreeRegressor(max_depth=6, min_samples_leaf=50)

In [None]:
DT_predict = DT.predict(X_test_df)

In [None]:
DT_predict

array([1366.53523034,  704.13315912,  704.13315912, ...,  704.13315912,
        704.13315912, 1642.74411908])

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test,DT_predict)

0.6128345399904291

In [None]:
from sklearn import metrics
print('Accuracy:', round(DT.score(X_test_df,y_test)*100,2))
print('MAE:', metrics.mean_absolute_error(y_test, DT_predict))
print('MSE:', metrics.mean_squared_error(y_test, DT_predict))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, DT_predict)))

Accuracy: 61.28
MAE: 719.3955276618045
MSE: 1052304.1311339072
RMSE: 1025.8187613481766


### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,make_scorer
from sklearn.model_selection import RandomizedSearchCV

In [None]:
rf = RandomForestRegressor()

In [None]:
param = {
    'max_depth':[3,6,9,12,15],
    'n_estimators' : [10,50,100,150,200] 
}

In [None]:
rf_search = RandomizedSearchCV(rf,param_distributions=param,n_iter=5,scoring=make_scorer(mean_squared_error),n_jobs=-1,cv=5,verbose=3)

In [None]:
rf_search.fit(X_train_df, y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


RandomizedSearchCV(cv=5, estimator=RandomForestRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'max_depth': [3, 6, 9, 12, 15],
                                        'n_estimators': [10, 50, 100, 150,
                                                         200]},
                   scoring=make_scorer(mean_squared_error), verbose=3)

In [None]:
means = rf_search.cv_results_['mean_test_score']
params = rf_search.cv_results_['params']
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))
    if mean == min(means):
        print('Best parameters with the minimum Mean Square Error are:',param)

1226758.517140 with: {'n_estimators': 150, 'max_depth': 9}
1262612.383547 with: {'n_estimators': 150, 'max_depth': 12}
1211025.044481 with: {'n_estimators': 150, 'max_depth': 6}
Best parameters with the minimum Mean Square Error are: {'n_estimators': 150, 'max_depth': 6}
1375029.322269 with: {'n_estimators': 10, 'max_depth': 15}
1240052.083380 with: {'n_estimators': 50, 'max_depth': 9}


In [None]:
rf = RandomForestRegressor(n_estimators=150, max_depth=6)
rf.fit(X_train_df,y_train)

RandomForestRegressor(max_depth=6, n_estimators=150)

In [None]:
rf_pred =  rf.predict(X_test_df)

In [None]:
rf_pred

array([1258.71964532,  670.14813123,  662.63342904, ...,  662.63342904,
        734.71366926, 1670.8810444 ])

In [None]:
from sklearn.metrics import r2_score
r2_score(y_test,rf_pred)

0.6170610098589755

In [None]:
from sklearn import metrics
print('Accuracy:', round(rf.score(X_test_df,y_test)*100,2))
print('MAE:', metrics.mean_absolute_error(y_test, rf_pred))
print('MSE:', metrics.mean_squared_error(y_test, rf_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rf_pred)))

Accuracy: 61.71
MAE: 713.2252021129052
MSE: 1040816.7125437406
RMSE: 1020.2042504046632


### Save the model

In [None]:
import joblib

In [None]:
joblib.dump(DT, "Decision_tree_model.pkl",compress=2)

['Decision_tree_model.pkl']

In [None]:
joblib.dump(rf, "RForest_fitted_model.pkl",compress=2)

['RForest_fitted_model.pkl']

In [None]:
joblib.dump(imputer, "imputer.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(encoder, "encoder.pkl")
joblib.dump(X_column, "input_columns.pkl")
joblib.dump(y_column, "target_column.pkl")
joblib.dump(numeric_columns, "numeric_columns.pkl")
joblib.dump(categorical_columns, "categorical_columns.pkl")
joblib.dump(encoded_columns, "encoded_columns.pkl")

['encoded_columns.pkl']

In [None]:
import pandas as pd
pd. __version__


'1.3.5'

In [None]:
import numpy as np
np.__version__

'1.21.6'

In [None]:
import joblib as jb
jb.__version__

'1.1.0'

In [None]:
import sklearn
sklearn.__version__

'1.0.2'

In [None]:
import pickle 
print(pickle.format_version)

4.0
