#### Project 2

In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('seaborn')
import numpy as np
import os
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
#warnings.filterwarnings('ignore')

In [2]:
data_path = os.path.join(os.getcwd(), 'F21_proj2_data')
train_ini = pd.read_csv(f'{data_path}/train_ini.csv',parse_dates = ['Date'], dayfirst = True)
test_ini = pd.read_csv(f'{data_path}/fold_1.csv',parse_dates = ['Date'], dayfirst = True)

In [3]:
print(train_ini.head(10))
print("Shape of the training data: ", train_ini.shape)
print("Shape of the training data: ", test_ini.shape)

   Store  Dept       Date  Weekly_Sales  IsHoliday
0      1     1 2010-02-05      24924.50      False
1      1     1 2010-02-12      46039.49       True
2      1     1 2010-02-19      41595.55      False
3      1     1 2010-02-26      19403.54      False
4      1     1 2010-03-05      21827.90      False
5      1     1 2010-03-12      21043.39      False
6      1     1 2010-03-19      22136.64      False
7      1     1 2010-03-26      26229.21      False
8      1     1 2010-04-02      57258.43      False
9      1     1 2010-04-09      42960.91      False
Shape of the training data:  (164115, 5)
Shape of the training data:  (26559, 5)


In [32]:
# Data transformation functions
train = train_ini.copy()
test = test_ini.copy()
train['IsHoliday'] = train['IsHoliday'].apply(lambda x: 1 if x else 0)
test['IsHoliday'] = train['IsHoliday'].apply(lambda x: 1 if x else 0)


# There are some negative values in the Weekly_Sales column, converting them to positive as they seem like data entry mistakes
train['Weekly_Sales'] = abs(train['Weekly_Sales'])
test['Weekly_Sales'] = abs(test['Weekly_Sales'])

# extracting year and week from the date field
train['Year'] = train['Date'].dt.year
train['Week'] = train['Date'].dt.isocalendar().week

test['Year'] = test['Date'].dt.year
test['Week'] = test['Date'].dt.isocalendar().week

# Sorting the data by Store, Department and Date
train.sort_values(['Store','Dept','Date'], ignore_index=True, ascending=True, inplace=True)
test.sort_values(['Store','Dept','Date'], ignore_index=True, ascending=True, inplace=True)


# Creating a column which has the previous week sales as a separate column
train['Last_Week_Sales'] = train.groupby(['Store','Dept'])['Weekly_Sales'].shift(1)
test['Last_Week_Sales'] = test.groupby(['Store','Dept'])['Weekly_Sales'].shift(1)

# Dropping date column as that information is already captured by Year and Week Column
del train['Date']
del test['Date']

In [34]:
# Creating 4th order difference for the weekly sale column
col = 'Weekly_Sales'
for order in [2,3,4]:
    train[f'{col}_D{order}'] = train.groupby(['Store','Dept'])[col].diff(periods=order)
    test[f'{col}_D{order}'] = test.groupby(['Store','Dept'])[col].diff(periods=order)
    
# Removing rows with missing values
train = train.dropna(subset = list(train.columns))
test = test.dropna(subset = list(test.columns))

# Creating X_train, Y_train
X_train = train.loc[:,~train.columns.isin(['Weekly_Sales'])]
y_train = train.loc[:,train.columns.isin(['Weekly_Sales'])]

# Creating X_test, Y_test
X_test = test.loc[:,~test.columns.isin(['Weekly_Sales'])]
y_test = test.loc[:,test.columns.isin(['Weekly_Sales'])]

In [9]:
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train_trans = scaler.transform(X_train)
# X_test_trans = scaler.transform(X_test)

In [36]:
## Fitting GBR, RF and NN on the training data set
parameters = {'max_features':['auto'],'max_depth':[20,25,30,35],'min_samples_leaf':[15,20,25,50]}
rf_mod = RandomForestRegressor(n_estimators=500, criterion='mse', oob_score=True, n_jobs=-1, random_state=10)
grid_search = GridSearchCV(rf_mod, parameters, cv=5, n_jobs=-1, scoring='neg_mean_squared_error', verbose = 10)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5,
             estimator=RandomForestRegressor(n_estimators=500, n_jobs=-1,
                                             oob_score=True, random_state=10),
             n_jobs=-1,
             param_grid={'max_depth': [20, 25, 30, 35],
                         'max_features': ['auto'],
                         'min_samples_leaf': [15, 20, 25, 50]},
             scoring='neg_mean_squared_error', verbose=10)

In [66]:
fin_mod = grid_search.best_estimator_
pred = fin_mod.predict(X_test).reshape(len(y_test), 1)
weights = test['IsHoliday'].apply(lambda is_holiday:5 if is_holiday == 1 else 1).to_numpy().reshape(len(pred),1)
actuals = y_test

In [69]:
np.sum(weights * np.abs(actuals - pred).to_numpy()) / np.sum(weights)

1238.0560409568982

In [63]:
(np.sum(np.abs(actuals - pred).to_numpy())) / len(np.abs(actuals - pred).to_numpy())

1226.3597226409154

In [12]:
# Filtering the data for a particular Store and Department id to csv
# train.loc[((train['Store'] == 1) & (train['Dept'] == 1)) | ((train['Store'] == 2) & (train['Dept'] == 1))].to_csv('data_check.csv', index = False)