In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

from sklearn.model_selection import GridSearchCV

In [289]:
data = pd.read_csv('Final_data.csv')

In [290]:
data.head()

Unnamed: 0,Date,Customer_Name,Item_Name,Vrh_No,Quantity,Price_per_unit,Amount
0,2019-01-04,customer1,Item_1,1,200.0,20.0,4000.0
1,2019-01-04,customer1,Item_2,1,160.0,28.0,4480.0
2,2019-01-04,customer1,Item_3,1,12.0,60.0,720.0
3,2019-01-04,customer1,Item_3,1,15.0,35.0,525.0
4,2019-01-04,customer1,Item_3,1,25.0,25.0,625.0


In [291]:
data['Customer_Name']=data['Customer_Name'].apply(lambda x:x.lstrip('customer'))
data['Item_Name']=data['Item_Name'].apply(lambda x:x.lstrip('Item_'))

In [292]:
data.head()

Unnamed: 0,Date,Customer_Name,Item_Name,Vrh_No,Quantity,Price_per_unit,Amount
0,2019-01-04,1,1,1,200.0,20.0,4000.0
1,2019-01-04,1,2,1,160.0,28.0,4480.0
2,2019-01-04,1,3,1,12.0,60.0,720.0
3,2019-01-04,1,3,1,15.0,35.0,525.0
4,2019-01-04,1,3,1,25.0,25.0,625.0


In [293]:
dep_var=['Customer_Name','Item_Name','Quantity','Price_per_unit']
dep_var

['Customer_Name', 'Item_Name', 'Quantity', 'Price_per_unit']

In [294]:
y=data.Amount
x=data[dep_var]

In [295]:
x.head()

Unnamed: 0,Customer_Name,Item_Name,Quantity,Price_per_unit
0,1,1,200.0,20.0
1,1,2,160.0,28.0
2,1,3,12.0,60.0
3,1,3,15.0,35.0
4,1,3,25.0,25.0


In [296]:
data['Customer_Name'] = data['Customer_Name'].astype('int')
data['Item_Name'] = data['Item_Name'].astype('int')

In [297]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.3,random_state=123)

In [298]:
models = [LinearRegression(),Lasso(), Ridge(),DecisionTreeRegressor(random_state=123),RandomForestRegressor(random_state=123),
         SVR(),XGBRegressor(random_state=123)]
  
def model_eval(models):
    for i in range(len(models)):
        models[i].fit(X_train_s, Y_train)
        print(f'{models[i]} : ')
        
        train_preds = models[i].predict(X_train_s)
        print('Training Error(RMSE) : ', mean_squared_error(Y_train, train_preds,squared=False))
        
        val_preds = models[i].predict(X_test_s)
        print('Validation Error(RMSE) : ', mean_squared_error(Y_test, val_preds,squared=False))
        print()

In [299]:
model_eval(models)

LinearRegression() : 
Training Error(RMSE) :  5239.601181483426
Validation Error(RMSE) :  5454.589095775175

Lasso() : 
Training Error(RMSE) :  5239.601516388987
Validation Error(RMSE) :  5454.427422583814

Ridge() : 
Training Error(RMSE) :  5239.607747129009
Validation Error(RMSE) :  5454.034685363089

DecisionTreeRegressor(random_state=123) : 
Training Error(RMSE) :  0.0
Validation Error(RMSE) :  2921.065579267405

RandomForestRegressor(random_state=123) : 
Training Error(RMSE) :  645.9579064699304
Validation Error(RMSE) :  2277.7712619206213

SVR() : 
Training Error(RMSE) :  12895.971908918096
Validation Error(RMSE) :  13080.428984155109

XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
           

### Hypertuned XGBRegressor

In [300]:
xgb1 = XGBRegressor()
parameters = {'objective':['reg:squarederror'],
              'learning_rate': [.03,.07,0.1,0.3,0.6],
              'booster':['gbtree','gblinear','dart']}

xgb_grid = GridSearchCV(xgb1,parameters,cv = 2,n_jobs = 5)

xgb_grid.fit(X_train_s,Y_train)

GridSearchCV(cv=2,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    callbacks=None, colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    feature_types=None, gamma=None, gpu_id=None,
                                    grow_policy=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, m...
                                    max_cat_to_onehot=None, max_delta_step=None,
                                    max_depth=None, max_leaves=None,
                                    min_child_weight=None, missing=nan,
                                    monotone_constraints=None, n_estim

In [301]:
xgb_grid.best_params_

{'booster': 'dart', 'learning_rate': 0.6, 'objective': 'reg:squarederror'}

In [302]:
mod=XGBRegressor(random_state=123,booster='dart',learning_rate=0.6,objective='reg:squarederror')
model=mod.fit(X_train_s, Y_train)
train_preds = model.predict(X_train_s)
print('Training RMSE : ', mean_squared_error(Y_train, train_preds,squared=False))
val_preds = model.predict(X_test_s)
print('Validation RMSE : ', mean_squared_error(Y_test, val_preds,squared=False))

Training RMSE :  19.49762173017912
Validation RMSE :  2374.0195605281774


### Hypertuned RandomForestRegressor

In [23]:
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 20, num = 20)]
max_features = ['auto','sqrt','log2']
max_depth = [int(x) for x in np.linspace(10, 120, num = 12)]
min_samples_split = [2, 6, 10]
min_samples_leaf = [1, 3, 4]
bootstrap = [True, False]
oob_score = [True, False]
warm_start = [True, False]
criterion = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']

random_grid = {'n_estimators': n_estimators,'max_features': max_features,'max_depth': max_depth,'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap,
'criterion':criterion,
'oob_score':oob_score,
'warm_start':warm_start}

In [24]:
rf1 = RandomForestRegressor()
rf1_grid = GridSearchCV(rf1, random_grid, cv = 4, n_jobs = -1)

rf1_grid.fit(X_train_s,Y_train)

GridSearchCV(cv=4, estimator=RandomForestRegressor(), n_jobs=-1,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['squared_error', 'absolute_error',
                                       'friedman_mse', 'poisson'],
                         'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100,
                                       110, 120],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 3, 4],
                         'min_samples_split': [2, 6, 10],
                         'n_estimators': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                          13, 14, 15, 16, 17, 18, 19, 20],
                         'oob_score': [True, False],
                         'warm_start': [True, False]})

In [25]:
rf1_grid.best_params_

{'bootstrap': False,
 'criterion': 'absolute_error',
 'max_depth': 20,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 20,
 'oob_score': False,
 'warm_start': False}

In [303]:
m=RandomForestRegressor(random_state=123,bootstrap = False,criterion='absolute_error',max_depth=20,
                        min_samples_leaf=1,min_samples_split=2,n_estimators=20,max_features='log2')
model=m.fit(X_train_s, Y_train)
train_preds = model.predict(X_train_s)
print('Training RMSE : ', mean_squared_error(Y_train, train_preds,squared=False))
val_preds = model.predict(X_test_s)
print('Validation RMSE : ', mean_squared_error(Y_test, val_preds,squared=False))

Training RMSE :  4.431750352821918
Validation RMSE :  2558.5590589925114


### Final model

In [320]:
Actual_Amount= train_test_split(data['Amount'],test_size=0.3,random_state=123)
X_train['Actual_Sales'] = Actual_Amount[0]

In [327]:
final_model=RandomForestRegressor(random_state=123)
final_model.fit(X_train_s,Y_train)

RandomForestRegressor(random_state=123)

In [328]:
X_train["Predicted_Sales"] = final_model.predict(X_train_s)

In [329]:
X_train

Unnamed: 0,Customer_Name,Item_Name,Quantity,Price_per_unit,Actual_Sales,Predicted_Sales
1685,20,3,10.00,50.0,500.0,503.100
385,8,29,36.00,36.0,1296.0,1299.243
1224,68,7,2270.52,20.0,45410.4,44399.745
116,2,7,65.00,22.0,1430.0,1398.300
1114,2,3,5.00,29.0,145.0,148.690
...,...,...,...,...,...,...
1122,103,16,12.00,95.0,1140.0,1129.620
1346,2,27,30.00,49.0,1470.0,1493.100
1406,2,6,25.00,15.0,375.0,377.050
1389,2,3,30.00,19.0,570.0,574.960
