# Model Training on the processed data

In [16]:
import numpy as np
import pandas as pd

In [17]:
# Read the preprocessed dataset back from its pickle file.
# NOTE: unpickling executes arbitrary code, so only load pickle files you trust.
df = pd.read_pickle(filepath_or_buffer='data/data.pickle')

In [6]:
# Target: the sales column the models learn to predict.
y = df['Item_Outlet_Sales']

# Inputs: every remaining column of the frame.
X = df.drop(columns=['Item_Outlet_Sales'])

In [7]:
# Preview the target Series (pandas truncates the display of long Series).
y

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

#### Random Forest Regressor

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Baseline model: a 100-tree random forest with a fixed seed so runs are repeatable.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=41,
)


In [9]:
# 5-fold cross-validation of the random forest, scored with R².
scores = cross_val_score(
    estimator=rf_model,
    X=X,
    y=y,
    cv=5,
    scoring='r2',
)

In [10]:
# Mean R² of the random forest across the 5 cross-validation folds.
scores.mean()

np.float64(0.5500348089756637)

#### XGBRFRegressor

In [11]:
from xgboost import XGBRFRegressor

# XGBoost's random-forest regressor, configured like the sklearn baseline
# (100 estimators, same seed) so the two models are compared on equal terms.
xgbrf = XGBRFRegressor(
    n_estimators=100,
    random_state=41,
)

In [12]:
# 5-fold cross-validation of the XGBoost random forest, scored with R².
scores2 = cross_val_score(
    estimator=xgbrf,
    X=X,
    y=y,
    cv=5,
    scoring='r2',
)

In [13]:
# Mean R² of the XGBoost random forest across the 5 cross-validation folds.
scores2.mean()

np.float64(0.5959515932274645)

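We can see that the XGBoost random forest (`XGBRFRegressor`) achieves the higher mean R² (≈0.596 vs ≈0.550), so it is the better of the two models for this dataset.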

In [18]:
# Fit on the full dataset; this fit is only used to read off feature importances.
xg = xgbrf.fit(X, y)

# Label each importance value with the name of the column it belongs to.
importances = xg.feature_importances_
features = pd.Series(data=importances, index=X.columns)

In [19]:
# Show the features ranked from most to least important.
ranked_features = features.sort_values(ascending=False)
print("Feature Importances:\n", ranked_features)

Feature Importances:
 Outlet_Type                0.480463
Outlet_Identifier          0.167878
Filled_Outlet_Size         0.135938
Item_MRP                   0.104605
Outlet_age                 0.091803
Outlet_Location_Type       0.012517
Item_Visibility_linear     0.002260
Item_Weight_interpolate    0.001596
Item_Type                  0.001348
Item_Identifier            0.001101
Item_Fat_Content           0.000491
dtype: float32


We will keep only the top 5 most important features.\
We will then check the model's cross-validated R² score on these 5 features.

In [20]:
# The five highest-ranked features, in descending order of importance.
# This order is also the column order the final model is trained on.
imp_featues = [
    'Outlet_Type',
    'Outlet_Identifier',
    'Filled_Outlet_Size',
    'Item_MRP',
    'Outlet_age',
]

In [21]:
# Re-run the same 5-fold R² evaluation using only the selected features.
scores3 = cross_val_score(
    estimator=xgbrf,
    X=X[imp_featues],
    y=y,
    cv=5,
    scoring='r2',
)

In [22]:
# Compare the mean cross-validated scores side by side.
# The values are R² scores (scoring='r2'), even though the labels say "accuracy".
mean_score_all = scores2.mean()
mean_score_top5 = scores3.mean()
print("accuracy with all input features: ", mean_score_all)
print("accuracy with top 5 important features: ", mean_score_top5)

accuracy with all input features:  0.5959515932274645
accuracy with top 5 important features:  0.5962926030699023


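We can see that the model's R² score is approximately the same (≈0.5960 with all features vs ≈0.5963 with the top 5), so dropping the remaining features does not hurt performance.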

#### Best Model

In [23]:
# Restrict the inputs to the selected features; all rows are kept.
final_X = X.loc[:, imp_featues]

In [24]:
from xgboost import XGBRFRegressor

# Fresh, unfitted estimator for the final model, with the same hyperparameters
# as the one evaluated by cross-validation.
xgbrf_final = XGBRFRegressor(
    n_estimators=100,
    random_state=41,
)

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error as mae

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    final_X,
    y,
    test_size=0.2,
    random_state=41,
)

In [26]:
# Train the final model on the training split only.
xgbrf_final.fit(X=X_train, y=y_train)

In [27]:
# Predict sales for the held-out rows.
y_pred = xgbrf_final.predict(X=X_test)

In [28]:
# Mean absolute error on the held-out split, in the same units as the target.
error = mae(y_true=y_test, y_pred=y_pred)
print(error)

790.7269986159374


#### Prediction on unseen data

In [29]:
# One hand-built sample. The values must follow the column order the model was
# trained on: Outlet_Type, Outlet_Identifier, Filled_Outlet_Size, Item_MRP, Outlet_age.
sample = np.array([[1, 9, 1, 249, 26]])
predicted = xgbrf_final.predict(sample)
print(predicted)

# Report a range of +/- one mean absolute error around the point prediction.
lower_bound = (predicted - error)[0]
upper_bound = (predicted + error)[0]
print(f"Sales values is in [{lower_bound}, {upper_bound}]")

[3971.5322]
Sales values is in [3180.80517578125, 4762.25927734375]


#### Save the Model Using Joblib

In [30]:
import joblib

In [31]:
# Persist the trained model to disk; the returned list of written file names is displayed.
joblib.dump(value=xgbrf_final, filename='data/model')

['data/model']

In [None]:
# Reload the persisted model to check the save/load round trip.
# The path must match the one passed to joblib.dump ('data/model');
# the previous 'load/model' path did not match the saved location.
joblib.load('data/model')