![forecasting_framework](../forecasting_framework.png)

In [16]:
import pandas as pd
# import statsmodels.api as sm
import matplotlib.pyplot as plt

from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Input CSV locations, given relative to the notebook's working directory.
TRAIN_CSV_PATH = "../datasets/train.csv"    # read into df_train
PRICES_CSV_PATH = "../datasets/prices.csv"  # not read by any cell in this section
CAL_CSV_PATH = "../datasets/calendar.csv"   # read into df_cal

In [3]:
# Read the training table and preview its first five rows.
# The preview shows six identifier columns followed by one d_<n> column per day.
df_train = pd.read_csv(filepath_or_buffer=TRAIN_CSV_PATH)
df_train.head(n=5)

Unnamed: 0,id,item_id,subcat_id,category_id,store_id,region_id,d_1,d_2,d_3,d_4,...,d_1910,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919
0,Beauty_1_001_East_1,Beauty_1_001,Beauty_1,Beauty,East_1,East,0,0,0,0,...,3,0,1,1,0,0,0,2,0,3
1,Beauty_1_002_East_1,Beauty_1_002,Beauty_1,Beauty,East_1,East,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Beauty_1_003_East_1,Beauty_1_003,Beauty_1,Beauty,East_1,East,0,0,0,0,...,0,1,1,1,0,0,1,1,0,2
3,Beauty_1_004_East_1,Beauty_1_004,Beauty_1,Beauty,East_1,East,0,0,0,0,...,1,3,7,2,0,0,1,2,4,1
4,Beauty_1_005_East_1,Beauty_1_005,Beauty_1,Beauty,East_1,East,0,0,0,0,...,2,2,2,4,1,0,2,3,1,0


In [4]:
# Cast the six identifier columns to pandas' "string" dtype, one column at a
# time, assigning back into df_train in place.
id_columns = ["id", "item_id", "subcat_id", "category_id", "store_id", "region_id"]
for id_column in id_columns:
    df_train[id_column] = df_train[id_column].astype("string")

In [6]:
# Read the calendar table (one row per day label d_<n>) and preview five rows.
df_cal = pd.read_csv(filepath_or_buffer=CAL_CSV_PATH)
df_cal.head(n=5)

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d
0,2011-01-29,11101,Saturday,1,1,2011,d_1
1,2011-01-30,11101,Sunday,2,1,2011,d_2
2,2011-01-31,11101,Monday,3,1,2011,d_3
3,2011-02-01,11101,Tuesday,4,2,2011,d_4
4,2011-02-02,11101,Wednesday,5,2,2011,d_5


In [7]:
# Give the calendar columns explicit dtypes: a real datetime for "date",
# the "string" dtype for the text columns, and int for the week identifier.
df_cal["date"] = pd.to_datetime(df_cal["date"])
for text_column in ("weekday", "d"):
    df_cal[text_column] = df_cal[text_column].astype("string")
df_cal["wm_yr_wk"] = df_cal["wm_yr_wk"].astype(int)

# Lookup from month number (1-12) to its three-letter abbreviation
month_names_ls = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
month_int_ls = list(range(1, 13))
month_names_dict = dict(zip(month_int_ls, month_names_ls))

# New column holding the abbreviation that corresponds to each row's "month"
df_cal["month_name"] = df_cal["month"].replace(month_names_dict)

In [8]:
# Reshape the wide training table (one row per item, one d_<n> column per day)
# into one row per day, attach the calendar fields, and index the rows by date.

# Select the day columns by their "d_" prefix rather than by position, so the
# selection does not depend on how many identifier columns precede them.
day_ids = [col for col in df_train.columns if str(col).startswith("d_")]

# Make "id" the index *before* transposing so the sales counts stay numeric.
# Transposing with the string "id" column still inside the frame would turn
# every item column into object dtype, and the row-wise mean below would then
# run over Python objects.
df_items_temp = df_train.set_index("id")[day_ids].T

# Copy the day labels (the transposed index) into a column "d" to merge on;
# assign() appends it after the item columns.
df_items_temp = df_items_temp.assign(d=df_items_temp.index)

# Merge the calendar fields in by day label, then index the rows by date
df_items_temp = pd.merge(df_items_temp, df_cal, on="d", how="left")
df_items_temp = df_items_temp.set_index("date")

# Average units sold per day across all items
df_items_temp["avg_items_sold"] = df_items_temp[list(df_train["id"])].mean(axis=1)

In [9]:
# Preview the first five days of the reshaped, date-indexed table
df_items_temp.head(n=5)

Unnamed: 0_level_0,Beauty_1_001_East_1,Beauty_1_002_East_1,Beauty_1_003_East_1,Beauty_1_004_East_1,Beauty_1_005_East_1,Beauty_1_006_East_1,Beauty_1_007_East_1,Beauty_1_008_East_1,Beauty_1_009_East_1,Beauty_1_010_East_1,...,Food_3_826_West_3,Food_3_827_West_3,d,wm_yr_wk,weekday,wday,month,year,month_name,avg_items_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-01-29,0,0,0,0,0,0,0,12,2,0,...,0,0,d_1,11101,Saturday,1,1,2011,Jan,1.07022
2011-01-30,0,0,0,0,0,0,0,15,0,0,...,0,0,d_2,11101,Sunday,2,1,2011,Jan,1.041292
2011-01-31,0,0,0,0,0,0,0,0,7,1,...,0,0,d_3,11101,Monday,3,1,2011,Jan,0.780026
2011-02-01,0,0,0,0,0,0,0,0,3,0,...,0,0,d_4,11101,Tuesday,4,2,2011,Feb,0.833454
2011-02-02,0,0,0,0,0,0,0,0,0,0,...,0,0,d_5,11101,Wednesday,5,2,2011,Feb,0.627944


The rolling-forecast code below is adapted from [Machine Learning Mastery's ARIMA tutorial](https://machinelearningmastery.com/arima-for-time-series-forecasting-with-python/).

In [18]:
# Chronological split of the daily-average series: the first 80% of the
# observations are used for training, the remainder for testing.
x = df_items_temp["avg_items_sold"].values
size = int(0.8 * len(x))
x_train = x[:size]
x_test = x[size:]

# Starting state for the forecasting loop: the observations known so far and
# an empty list that will collect the forecasts.
history = list(x_train)
predictions = []

In [19]:
# Walk-forward validation: at each test step, fit ARIMA on everything observed
# so far, forecast one step ahead, then add the true value to the history.

# Reset the state here so that re-running this cell on its own does not keep
# appending to lists left over from a previous run; otherwise `predictions`
# ends up longer than `x_test` and the RMSE computation fails on the length
# mismatch.
history = list(x_train)
predictions = []

for t in range(len(x_test)):

    # Order taken from eda (hypothesis after looking at plots)
    # NOTE(review): the middle term d=7 requests seven rounds of differencing.
    # If the EDA finding was a weekly pattern, that may belong in a seasonal
    # term instead -- confirm before trusting these forecasts.
    print(t)  # progress indicator; the model is refit from scratch every step
    model = ARIMA(history, order=(11, 7, 8))
    model_fit = model.fit()

    # forecast() is called with its default horizon; take the first value
    output = model_fit.forecast()
    yhat = output[0]
    predictions.append(yhat)

    # Reveal the actual observation so the next fit includes it
    obs = x_test[t]
    history.append(obs)

  warn('Non-invertible starting MA parameters found.'


predicted=1.473821, expected=1.131322




predicted=1.202362, expected=1.101902




predicted=1.060877, expected=1.137750




predicted=1.093107, expected=1.030863


In [None]:
# Root-mean-squared error of the forecasts against the held-out observations
mse = mean_squared_error(x_test, predictions)
rmse = sqrt(mse)
print('Test RMSE: %.3f' % rmse)

In [None]:
# Overlay the held-out observations and the forecasts on one set of axes.
# A title, axis labels and a legend are set so the two lines can be told apart
# and the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(x_test, label="Actual")
ax.plot(predictions, color="red", label="Predicted")
ax.set_title("ARIMA forecasts vs. actual values (test split)")
ax.set_xlabel("Step in test split")
ax.set_ylabel("avg_items_sold")
ax.legend()
plt.show()