In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestRegressor

<div style="background-color: lightblue; color: black; padding: 10px; font-weight: bold; font-size: 15px;">Read data - Here: Data with 3x rolling shifted average & all dummies</div>

In [20]:
# Load the pickled DataFrame that every later cell reads from `df`.
# NOTE(review): unpickling can execute arbitrary code, so this file must come
# from a trusted source; the path is relative to the notebook's working directory.
df = pd.read_pickle('data/data_combined_clean_features.pkl')

<div style="background-color: lightblue; color: black; padding: 10px; font-weight: bold; font-size: 15px;">Train test split</div>

In [21]:
# Chronological three-way split on 'Date': rows before 2012-01-06 are used for
# training, 2012-01-06 through 2012-06-08 (both inclusive) for validation, and
# everything after 2012-06-08 is held out as the test set.
dates = df['Date']
in_train = dates < "2012-01-06"
in_validation = (dates >= "2012-01-06") & (dates <= "2012-06-08")
in_test = dates > "2012-06-08"

# 'Date' is only needed to cut the data, so it is removed from each subset;
# every subset also gets a fresh 0..n-1 index.
train = df[in_train].drop(columns='Date').reset_index(drop=True)
validation = df[in_validation].drop(columns='Date').reset_index(drop=True)
test = df[in_test].drop(columns='Date').reset_index(drop=True)

# Separate the target ('Weekly_Sales') from the remaining feature columns
y_train, X_train = train['Weekly_Sales'], train.drop(columns='Weekly_Sales')
y_validation, X_validation = validation['Weekly_Sales'], validation.drop(columns='Weekly_Sales')
y_test, X_test = test['Weekly_Sales'], test.drop(columns='Weekly_Sales')

<div style="background-color: lightblue; color: black; padding: 10px; font-weight: bold; font-size: 15px;">Linear Regression</div>

In [22]:
# Linear regression baseline, fitted on the training split only
# (fit() returns the estimator itself, so creation and fitting can be chained)
lin_reg = LinearRegression().fit(X_train, y_train)

# Predict on all three splits with the fitted model
y_pred_train, y_pred_validation, y_pred_test = (
    lin_reg.predict(features) for features in (X_train, X_validation, X_test)
)

<div style="background-color: lightblue; color: black; padding: 10px; font-weight: bold; font-size: 15px;">Calculating metrics for Linear Regression</div>

In [23]:
# Compute MSE and RMSE of the current y_pred_* predictions for each split and
# print them as a small three-section report.

# train split
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)

# validation split
mse_validation = mean_squared_error(y_validation, y_pred_validation)
rmse_validation = np.sqrt(mse_validation)

# test split
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

# np.round(value, 3) is used instead of value.round(3): it works whether the
# metric comes back as a NumPy scalar or as a plain Python float (which has no
# .round method), and prints the same text.
report_sections = (
    ('Train data:', mse_train, rmse_train),
    ('Validation data:', mse_validation, rmse_validation),
    ('Test data:', mse_test, rmse_test),
)
for position, (heading, mse_value, rmse_value) in enumerate(report_sections):
    if position:
        # dashed separator between sections, none before the first
        print(5*'- - ')
    print(heading)
    print('MSE:', np.round(mse_value, 3))
    print('RMSE:', np.round(rmse_value, 3))

Train data:
MSE: 81268040.933
RMSE: 9014.879
- - - - - - - - - - 
Validation data:
MSE: 49340255.134
RMSE: 7024.262
- - - - - - - - - - 
Test data:
MSE: 26074591.172
RMSE: 5106.329


<div style="background-color: lightblue; color: black; padding: 10px; font-weight: bold; font-size: 15px;">Random Forest</div>

In [18]:
# Random forest regressor with 100 trees; random_state is fixed so repeated
# fits on the same data use the same seed
model = RandomForestRegressor(random_state=42, n_estimators=100)

# Train on the training split only
model.fit(X_train, y_train)

# Predict on the train, validation and test features, in that order
y_pred_train, y_pred_validation, y_pred_test = map(
    model.predict, (X_train, X_validation, X_test)
)

<div style="background-color: lightblue; color: black; padding: 10px; font-weight: bold; font-size: 15px;">Calculating metrics for Random Forest</div>

In [17]:
# Compute MSE and RMSE of the current y_pred_* predictions for each split and
# print them as a small three-section report.
# NOTE(review): y_pred_train / y_pred_validation / y_pred_test are also written
# by the Linear Regression cell, so this cell reports whichever prediction cell
# ran last. Run the Random Forest fit/predict cell immediately before this one.

# train split
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)

# validation split
mse_validation = mean_squared_error(y_validation, y_pred_validation)
rmse_validation = np.sqrt(mse_validation)

# test split
mse_test = mean_squared_error(y_test, y_pred_test)
rmse_test = np.sqrt(mse_test)

# np.round(value, 3) is used instead of value.round(3): it works whether the
# metric comes back as a NumPy scalar or as a plain Python float (which has no
# .round method), and prints the same text.
report_sections = (
    ('Train data:', mse_train, rmse_train),
    ('Validation data:', mse_validation, rmse_validation),
    ('Test data:', mse_test, rmse_test),
)
for position, (heading, mse_value, rmse_value) in enumerate(report_sections):
    if position:
        # dashed separator between sections, none before the first
        print(5*'- - ')
    print(heading)
    print('MSE:', np.round(mse_value, 3))
    print('RMSE:', np.round(rmse_value, 3))

Train data:
MSE: 80260233.702
RMSE: 8958.808
- - - - - - - - - - 
Validation data:
MSE: 48913058.919
RMSE: 6993.787
- - - - - - - - - - 
Test data:
MSE: 27392750.85
RMSE: 5233.808
