In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [0]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [0]:
df = pd.read_csv (r'/content/drive/My Drive/Data/Smartbox/Smartbox_Outliers_Removed.csv' )


In [0]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [0]:
df.head()

Unnamed: 0,date,daily_forecast,daily_sales,month,year,day,product_name,daily_forecast_zscore,daily_sales_zscore
0,2017-01-01,39000,40608.0,1,2017,Sunday,Product A,2.197565,1.864841
1,2017-01-02,91000,95839.2,1,2017,Monday,Product A,1.066145,0.715437
2,2017-01-03,106000,104976.0,1,2017,Tuesday,Product A,0.739774,0.525293
3,2017-01-04,113000,120771.0,1,2017,Wednesday,Product A,0.587468,0.196587
4,2017-01-05,135000,141102.0,1,2017,Thursday,Product A,0.10879,0.226517


##Modelling on ONE product "Product A" 

In [0]:
df_product_A = df[df['product_name'] == 'Product A']
df_product_A.shape
df_product_A.set_index ('date', inplace = True)

#####Splitting the data

In [0]:
#Splitting the data with the date
train_dataset = df_product_A[(df_product_A.index < last_5pct)]
test_dataset = df_product_A[(df_product_A.index >= last_5pct)]

### Shuffling the dataset

In [0]:
dataset_drop = df_product_A.drop(['daily_forecast', 'daily_forecast_zscore', 'daily_sales_zscore'], axis = 1)
                                 




In [0]:
from sklearn.utils import shuffle
dataset_random = shuffle(dataset_drop,random_state = 45)

In [0]:
dataset_label = dataset_random.pop('daily_sales')

### Sorting and spittig data

In [0]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [0]:
train_dataset_random, test_dataset_random, train_label, test_label = train_test_split(dataset_random, dataset_label, test_size=0.2,
                                                  random_state=42)

### Assigining the target variable

### Convert categorical columns with one hot encoding

In [0]:
train_dataset_random_string = pd.get_dummies (train_dataset_random.astype(str))
test_dataset_random_string = pd.get_dummies (test_dataset_random.astype(str))

In [0]:
print ("Train",train_dataset_random_string.shape)
print ("Test",test_dataset_random_string.shape)
print ("Test",train_label.shape)
print ("Test",test_label.shape)

Train (552, 22)
Test (138, 22)
Test (552,)
Test (138,)


##### All the columns in the dataframe are categorical so converting whole dataframe into string

### Applying K-FOLD splitting

In [0]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold  # u should do STANDARD SCALER HERE feature scaling
def run_kfold(reg):
    kf = KFold (n_splits = 8, shuffle = True, random_state = 45)
     
    outcomes = []
    fold = []
    for train_index, test_index in kf.split (train_dataset_random_string, train_label):
        #fold +=1
        X_train, X_test = train_dataset_random_string.iloc[train_index] , train_dataset_random_string.iloc[test_index]
        y_train, y_test = train_label.iloc[train_index], train_label.iloc[test_index]
        reg.fit (X_train,y_train)
        y_pred = reg.predict (X_test)
        accuracy = np.sqrt (mean_squared_error (y_test,y_pred))
        outcomes.append (accuracy)
        print("Fold  accuracy: {}".format(accuracy))
    mean_outcome = np.mean (outcomes)
    print ("Mean Accuracy {}" . format (mean_outcome))


Fold  accuracy: 29391.108634965873
Fold  accuracy: 31088.2792444032
Fold  accuracy: 20846.069913961437
Fold  accuracy: 28020.203824911212
Fold  accuracy: 26686.23562716775
Fold  accuracy: 29035.094554166746
Fold  accuracy: 22252.708393886514
Fold  accuracy: 27537.85042480935
Mean Accuracy 26857.19382728401


### Testing with Linear Regression

In [0]:
reg = LinearRegression(normalize=True)
run_kfold (reg)

### Testing with Random Forest Regressor

In [0]:
from sklearn.ensemble import RandomForestRegressor

reg = RandomForestRegressor(random_state = 1, n_estimators = 10)
run_kfold (reg)

###Testing with XGBoost

In [0]:
from sklearn.ensemble import GradientBoostingRegressor
reg = GradientBoostingRegressor (random_state=1)
run_kfold (reg)

#### After analysing the RMSE from K-Fold splitting  XGBoost and Random Forest gives better predictions than other regressor techniques

####Model Optimization on whole dataset

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn import metrics

rmse_scorer = make_scorer(metrics.mean_absolute_error, greater_is_better=False)

pipe = Pipeline([("regressor", None)])

gbt = GradientBoostingRegressor(random_state=42)
rf = RandomForestRegressor(random_state=42)


parameters = [{"regressor": [rf],
               "regressor__n_estimators": range(20, 150, 25),
               "regressor__max_depth": range(10, 30, 5)
               },
              {"regressor": [gbt],
               "regressor__n_estimators": range(100, 500, 25),
               "regressor__max_depth": range(3, 5)}
              ]


grid_search = GridSearchCV(pipe, parameters, cv=5, scoring=rmse_scorer)
grid_search.fit(train_dataset_random_string, train_label)

print(grid_search.cv_results_)
print(grid_search.best_estimator_)

{'mean_fit_time': array([0.01929483, 0.04108758, 0.06230974, 0.0839427 , 0.10521469,
       0.12959428, 0.02306466, 0.04972763, 0.07695765, 0.10336027,
       0.13099995, 0.16301422, 0.02481842, 0.05201621, 0.08029737,
       0.10780849, 0.13666849, 0.16174402, 0.02395129, 0.05169573,
       0.07759981, 0.1058259 , 0.13504043, 0.16426811, 0.03645859,
       0.04377012, 0.05331917, 0.06112132, 0.06882668, 0.07924523,
       0.08873158, 0.09507728, 0.10291719, 0.11670671, 0.12261052,
       0.13088794, 0.13493443, 0.14362259, 0.15513787, 0.15922008,
       0.04618359, 0.05818329, 0.06813302, 0.07966385, 0.09568176,
       0.10109816, 0.11391845, 0.1235415 , 0.13820825, 0.14707422,
       0.15979676, 0.17074161, 0.17926764, 0.19413891, 0.20199666,
       0.21177506]), 'std_fit_time': array([0.0009514 , 0.0006119 , 0.00035196, 0.00105918, 0.00082184,
       0.00425098, 0.00046515, 0.00300359, 0.00822192, 0.00241728,
       0.00520577, 0.00472418, 0.0014278 , 0.00258895, 0.00151681,
       



In [0]:
  reg = grid_search.best_estimator_

In [0]:
reg

Pipeline(memory=None,
         steps=[('regressor',
                 GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse',
                                           init=None, learning_rate=0.1,
                                           loss='ls', max_depth=4,
                                           max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                           min_samples_split=2,
                                           min_weight_fraction_leaf=0.0,
                                           n_estimators=100,
                                           n_iter_no_change=None,
                                           presort='auto', random_state=42,
                                           subsample=1.0, tol

In [0]:
reg.fit(train_dataset_random_string, train_label)
y_pred_train = reg.predict(train_dataset_random_string)


In [0]:
print("RMSE on Training data", np.sqrt(metrics.mean_squared_error(train_label, y_pred_train)))
print("MAE on Training data ", metrics.mean_absolute_error(train_label, y_pred_train))

RMSE on Training data 18015.17708641484
MAE on Training data  10424.472636586102


In [0]:
print ( "Test dataset shape"  ,test_dataset_random_string.shape)
print ( "Train dataset shape"  ,train_dataset_random_string.shape)

Test dataset shape (138, 22)
Train dataset shape (552, 22)


##Applying model on the test dataset

In [0]:
y_pred_test = reg.predict(test_dataset_random_string)



In [0]:
print("RMSE on Test data ", np.sqrt(metrics.mean_squared_error(test_label,
                                                             y_pred_test)))
print("MAE on Test data", metrics.mean_absolute_error(test_label,
                                                            y_pred_test))

RMSE on Test data  25978.771141950543
MAE on Test data 13756.755764879228
