# Import Libraries

In [1]:
import numpy as np #numerical computation
import pandas as pd #data wrangling
import matplotlib.pyplot as plt #plotting package

# Import Dataset 

In [2]:
df = pd.read_csv('clean_2_train.csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Item_Outlet_Sales,Outlet_Years,Item_Fat_Content_1,...,Item_Type_Combined_2,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,FDA15,9.3,0.92296,Dairy,249.8092,OUT049,1999,3735.138,14,0,...,0,0,0,0,0,0,0,0,0,1
1,DRC01,5.92,1.003057,Soft Drinks,48.2692,OUT018,2009,443.4228,4,0,...,0,0,0,1,0,0,0,0,0,0
2,FDN15,17.5,0.83199,Meat,141.618,OUT049,1999,2097.27,14,0,...,0,0,0,0,0,0,0,0,0,1
3,FDX07,19.2,0.75,Fruits and Vegetables,182.095,OUT010,1998,732.38,15,0,...,0,0,0,0,0,0,0,0,0,0
4,NCD19,8.93,0.666667,Household,53.8614,OUT013,1987,994.7052,26,1,...,1,1,0,0,0,0,0,0,0,0


#### Created a list remove_cols to remove those columns which doesn't required for Model Building 

In [3]:
remove_cols = [
    'Item_Identifier',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Establishment_Year'
]
df = df.drop(remove_cols,axis =1)
df.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales,Outlet_Years,Item_Fat_Content_1,Item_Fat_Content_2,Outlet_Location_Type_1,Outlet_Location_Type_2,Outlet_Size_1,...,Item_Type_Combined_2,Outlet_1,Outlet_2,Outlet_3,Outlet_4,Outlet_5,Outlet_6,Outlet_7,Outlet_8,Outlet_9
0,9.3,0.92296,249.8092,3735.138,14,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,5.92,1.003057,48.2692,443.4228,4,0,1,0,1,1,...,0,0,0,1,0,0,0,0,0,0
2,17.5,0.83199,141.618,2097.27,14,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,19.2,0.75,182.095,732.38,15,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,8.93,0.666667,53.8614,994.7052,26,1,0,0,1,0,...,1,1,0,0,0,0,0,0,0,0


In [4]:
df.shape

(8519, 25)

#### Data Preprocessing 

In [5]:
y = df.Item_Outlet_Sales.values
X = df.drop('Item_Outlet_Sales',axis = 1)

In [6]:
print(X.shape,y.shape)

(8519, 24) (8519,)


### Trained a Random Forest model

In [7]:
from sklearn.tree import DecisionTreeRegressor
DT = DecisionTreeRegressor()

#### List of parameters for hyperparameter tuning

In [8]:
param = {
    'max_depth':[6,9,12,15],
    'min_samples_leaf':[10,50,100,150]
}

# Hyperparameter optimization using RandomizedSearchCV 

In [11]:
from sklearn.metrics import mean_squared_error,make_scorer
from sklearn.model_selection import RandomizedSearchCV
random_search=RandomizedSearchCV(DT,param_distributions=param,n_iter=5,scoring=make_scorer(mean_squared_error),n_jobs=-1,cv=5,verbose=3)

In [12]:
random_search.fit(X,y)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  19 out of  25 | elapsed:    3.0s remaining:    0.9s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:    3.1s finished


RandomizedSearchCV(cv=5, estimator=DecisionTreeRegressor(), n_iter=5, n_jobs=-1,
                   param_distributions={'max_depth': [6, 9, 12, 15],
                                        'min_samples_leaf': [10, 50, 100, 150]},
                   scoring=make_scorer(mean_squared_error), verbose=3)

In [13]:
means = random_search.cv_results_['mean_test_score']
params = random_search.cv_results_['params']
for mean, param in zip(means, params):
    print("%f with: %r" % (mean, param))
    if mean == min(means):
        print('Best parameters with the minimum Mean Square Error are:',param)

1218784.768530 with: {'min_samples_leaf': 100, 'max_depth': 6}
Best parameters with the minimum Mean Square Error are: {'min_samples_leaf': 100, 'max_depth': 6}
1234745.474520 with: {'min_samples_leaf': 100, 'max_depth': 15}
1253119.899713 with: {'min_samples_leaf': 50, 'max_depth': 12}
1238998.459008 with: {'min_samples_leaf': 10, 'max_depth': 6}
1243248.046485 with: {'min_samples_leaf': 50, 'max_depth': 9}


# Evaluating the model for Train and Test set 

In [14]:
DT = DecisionTreeRegressor(min_samples_leaf=100, max_depth=6)

In [15]:
DT.fit(X,y)

DecisionTreeRegressor(max_depth=6, min_samples_leaf=100)

In [16]:
y_pred = DT.predict(X)

In [17]:
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

score = r2_score(y,y_pred)
print("Score:",100*score)
print("MAE : %.4g" % np.sqrt(mean_absolute_error(y,y_pred)))
print("RMSE : %.4g" % np.sqrt(mean_squared_error(y,y_pred)))

Score: 59.532325752312374
MAE : 27.6
RMSE : 1086


In [18]:
from sklearn.metrics import mean_squared_error,make_scorer,mean_absolute_error
from sklearn.model_selection import cross_val_score

#Perform cross-validation:
cv_score = cross_val_score(DT,X, y, cv=20, scoring = make_scorer(mean_squared_error))
cv_score = np.sqrt(np.abs(cv_score))
    
#Print model report:
print("\nModel Report")
print("MAE : %.4g" % np.sqrt(mean_absolute_error(y,y_pred)))
print("RMSE : %.4g" % np.sqrt(mean_squared_error(y,y_pred)))
print("CV Score : Mean - %.4g | Std - %.4g | Min - %.4g | Max - %.4g" % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score)))


Model Report
MAE : 27.6
RMSE : 1086
CV Score : Mean - 1097 | Std - 45.76 | Min - 1032 | Max - 1211
