Imports

Preprocessing the datasets

Training Dataset

In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Model

In [None]:
# Load the raw training data and drop exact duplicate rows (first occurrence kept).
train_path = '../input/data-storm-30/train_data.csv'
train_dataset = pd.read_csv(train_path).drop_duplicates(keep='first')

# One independent frame per product category. The explicit .copy() detaches each
# subset from train_dataset, so later column assignments on a subset (such as
# parsing DateID) do not raise SettingWithCopyWarning.
cat_1_train_dataset = train_dataset.loc[train_dataset['CategoryCode'] == 'category_1'].copy()
cat_2_train_dataset = train_dataset.loc[train_dataset['CategoryCode'] == 'category_2'].copy()
cat_3_train_dataset = train_dataset.loc[train_dataset['CategoryCode'] == 'category_3'].copy()
cat_4_train_dataset = train_dataset.loc[train_dataset['CategoryCode'] == 'category_4'].copy()

In [None]:
# Preview the first rows of the category_1 training subset.
cat_1_train_dataset.head()

In [None]:
# Print the column dtypes before and after parsing so the DateID conversion is visible.
print(cat_1_train_dataset.dtypes)
# Build a new frame with .assign instead of writing the column into the frame in
# place: an in-place column write on a filtered subset of another DataFrame can
# raise SettingWithCopyWarning.
cat_1_train_dataset = cat_1_train_dataset.assign(
    DateID=pd.to_datetime(cat_1_train_dataset["DateID"])
)
print(cat_1_train_dataset.dtypes)

In [None]:
# Order rows chronologically. The sorted result is rebound to the same name rather
# than using inplace=True, which can emit SettingWithCopyWarning on a filtered
# subset and makes the cell harder to chain.
cat_1_train_dataset = cat_1_train_dataset.sort_values(by='DateID')

In [None]:
# Display the category_1 frame (sorted by DateID in the preceding cell).
cat_1_train_dataset

In [None]:
# Collapse all category_1 rows into a single total-sales value per DateID.
cat_1_timeseries = (
    cat_1_train_dataset
    .groupby('DateID')['DailySales']
    .sum()
)
# Show the first few aggregated values.
cat_1_timeseries.head()

In [None]:
# Summary statistics of the per-date category_1 sales totals.
cat_1_timeseries.describe()

In [None]:
# Series.plot already draws the index (DateID) against the values (DailySales).
# The x=/y= keywords select DataFrame columns and have no effect on a Series,
# so they are dropped.
ax = cat_1_timeseries.plot(grid=True, figsize=(20, 8))
# Title and y label so the figure stands alone; the trailing ';' hides the repr.
ax.set(title='Category 1 daily sales', ylabel='DailySales');

In [None]:
# Multiplicative seasonal decomposition of the category_1 daily totals.
# seasonal_decompose is imported in the notebook's import cell.
# NOTE(review): no period is passed, so statsmodels must infer it from the index
# frequency; a multiplicative model also needs strictly positive values.
# Confirm both hold for this series.
decompose_result_mult = seasonal_decompose(cat_1_timeseries, model="multiplicative")

# Keep the individual components available for later inspection.
trend = decompose_result_mult.trend
seasonal = decompose_result_mult.seasonal
residual = decompose_result_mult.resid

# Trailing ';' suppresses the duplicate figure repr.
decompose_result_mult.plot();

In [None]:
# Additive seasonal decomposition of the same series, for comparison with the
# multiplicative one. seasonal_decompose is imported in the notebook's import cell.
# NOTE(review): the result is stored under the name `decompose_result_mult` even
# though the model is additive, and it overwrites the multiplicative result as
# well as trend/seasonal/residual. The name is kept so any later cell that reads
# it keeps working; consider renaming to `decompose_result_add`.
decompose_result_mult = seasonal_decompose(cat_1_timeseries, model="additive")

# Components of the additive decomposition.
trend = decompose_result_mult.trend
seasonal = decompose_result_mult.seasonal
residual = decompose_result_mult.resid

# Trailing ';' suppresses the duplicate figure repr.
decompose_result_mult.plot();

In [None]:
# ARMA(1, 1) baseline: SARIMAX with order (p=1, d=0, q=1) and no seasonal terms.
# SARIMAX is imported in the notebook's import cell. As before, the name
# `ARMAmodel` ends up bound to the fitted results object.
ARMAmodel = SARIMAX(cat_1_timeseries, order=(1, 0, 1)).fit()

# 30-step out-of-sample forecast with its 95% confidence bounds (alpha=0.05).
y_pred = ARMAmodel.get_forecast(30)
y_pred_df = y_pred.conf_int(alpha=0.05)
# The forecast object already carries the point forecasts for this horizon, so
# reuse them instead of running a second predict() over the same index range.
y_pred_df["Predictions"] = y_pred.predicted_mean
y_pred_out = y_pred_df["Predictions"]

In [None]:
# Draw the most recent forecast series (y_pred_out) as a green line and label it.
plt.plot(
    y_pred_out,
    label='Predictions',
    color='green',
)
plt.legend()

In [None]:
# ARIMA with order (p=2, d=2, q=2) on the category_1 daily totals.
# ARIMA is imported in the notebook's import cell. As before, the name
# `ARIMAmodel` ends up bound to the fitted results object.
ARIMAmodel = ARIMA(cat_1_timeseries, order=(2, 2, 2)).fit()

# 30-step out-of-sample forecast with its 95% confidence bounds (alpha=0.05).
y_pred = ARIMAmodel.get_forecast(30)
y_pred_df = y_pred.conf_int(alpha=0.05)
# Reuse the point forecasts held by the forecast object rather than calling
# predict() again over the same index range.
y_pred_df["Predictions"] = y_pred.predicted_mean
y_pred_out = y_pred_df["Predictions"]

plt.plot(y_pred_out, color='Yellow', label='ARIMA Predictions')
plt.legend()

In [None]:
# Seasonal ARIMA on the category_1 daily totals. As before, the name
# `SARIMAXmodel` ends up bound to the fitted results object.
# NOTE(review): order=(5, 4, 2) with seasonal_order=(2, 2, 2, 12) differences the
# series four times plus twice seasonally, and uses a period of 12 on what looks
# like daily data. Confirm these orders are intentional; they are left unchanged here.
SARIMAXmodel = SARIMAX(
    cat_1_timeseries,
    order=(5, 4, 2),
    seasonal_order=(2, 2, 2, 12),
).fit()

# 30-step out-of-sample forecast with its 95% confidence bounds (alpha=0.05).
y_pred = SARIMAXmodel.get_forecast(30)
y_pred_df = y_pred.conf_int(alpha=0.05)
# Reuse the point forecasts held by the forecast object rather than calling
# predict() again over the same index range.
y_pred_df["Predictions"] = y_pred.predicted_mean
y_pred_out = y_pred_df["Predictions"]

plt.plot(y_pred_out, color='Blue', label='SARIMA Predictions')
plt.legend()

In [None]:
# Parse DateID and order chronologically in one chain that returns a new frame.
# Writing the column into the filtered subset and sorting it with inplace=True
# can raise SettingWithCopyWarning.
cat_2_train_dataset = (
    cat_2_train_dataset
    .assign(DateID=lambda frame: pd.to_datetime(frame["DateID"]))
    .sort_values(by='DateID')
)
# Total category_2 sales per date, followed by its summary statistics.
cat_2_timeseries = cat_2_train_dataset.groupby('DateID')['DailySales'].sum()
cat_2_timeseries.describe()

In [None]:
# Series.plot draws the index (DateID) against the values (DailySales); the
# x=/y= keywords have no effect on a Series, so they are dropped.
ax = cat_2_timeseries.plot()
# Title and y label so the figure stands alone; the trailing ';' hides the repr.
ax.set(title='Category 2 daily sales', ylabel='DailySales');

In [None]:
# Parse DateID and order chronologically in one chain that returns a new frame.
# Writing the column into the filtered subset and sorting it with inplace=True
# can raise SettingWithCopyWarning.
cat_3_train_dataset = (
    cat_3_train_dataset
    .assign(DateID=lambda frame: pd.to_datetime(frame["DateID"]))
    .sort_values(by='DateID')
)
# Total category_3 sales per date, followed by its summary statistics.
cat_3_timeseries = cat_3_train_dataset.groupby('DateID')['DailySales'].sum()
cat_3_timeseries.describe()

In [None]:
# Series.plot draws the index (DateID) against the values (DailySales); the
# x=/y= keywords have no effect on a Series, so they are dropped.
ax = cat_3_timeseries.plot()
# Title and y label so the figure stands alone; the trailing ';' hides the repr.
ax.set(title='Category 3 daily sales', ylabel='DailySales');

In [None]:
# Parse DateID and order chronologically in one chain that returns a new frame.
# Writing the column into the filtered subset and sorting it with inplace=True
# can raise SettingWithCopyWarning.
cat_4_train_dataset = (
    cat_4_train_dataset
    .assign(DateID=lambda frame: pd.to_datetime(frame["DateID"]))
    .sort_values(by='DateID')
)
# Total category_4 sales per date, followed by its summary statistics.
cat_4_timeseries = cat_4_train_dataset.groupby('DateID')['DailySales'].sum()
cat_4_timeseries.describe()

In [None]:
# Series.plot draws the index (DateID) against the values (DailySales); the
# x=/y= keywords have no effect on a Series, so they are dropped.
ax = cat_4_timeseries.plot()
# Title and y label so the figure stands alone; the trailing ';' hides the repr.
ax.set(title='Category 4 daily sales', ylabel='DailySales');

In [None]:
# The import stays local to this cell: pandas_profiling is only needed for this
# optional report, so a missing install does not break the notebook's import cell.
from pandas_profiling import ProfileReport

# ProfileReport is documented to take a DataFrame, while cat_1_timeseries is a
# Series; wrap it in a one-column frame (DailySales, indexed by DateID) first.
profile = ProfileReport(cat_1_timeseries.to_frame(), title='Summary report')
profile.to_notebook_iframe()

Validation Dataset

In [None]:
# Load the raw validation data and drop exact duplicate rows (first occurrence kept).
valid_path = '../input/data-storm-30/validation_data.csv'
validation_dataset = pd.read_csv(valid_path).drop_duplicates(keep='first')

# One independent frame per product category. The .copy() detaches each subset
# from validation_dataset, so column assignments on it (for example the DateID
# parsing applied to the training subsets) would not raise SettingWithCopyWarning.
cat_1_valid_dataset = validation_dataset.loc[validation_dataset['CategoryCode'] == 'category_1'].copy()
cat_2_valid_dataset = validation_dataset.loc[validation_dataset['CategoryCode'] == 'category_2'].copy()
cat_3_valid_dataset = validation_dataset.loc[validation_dataset['CategoryCode'] == 'category_3'].copy()
cat_4_valid_dataset = validation_dataset.loc[validation_dataset['CategoryCode'] == 'category_4'].copy()

Test Dataset

In [None]:
# Load the raw test data and drop exact duplicate rows (first occurrence kept).
test_path = '../input/data-storm-30/test_data.csv'
test_dataset = pd.read_csv(test_path).drop_duplicates(keep='first')

# One independent frame per product category. The .copy() detaches each subset
# from test_dataset, so column assignments on it (for example the DateID parsing
# applied to the training subsets) would not raise SettingWithCopyWarning.
cat_1_test_dataset = test_dataset.loc[test_dataset['CategoryCode'] == 'category_1'].copy()
cat_2_test_dataset = test_dataset.loc[test_dataset['CategoryCode'] == 'category_2'].copy()
cat_3_test_dataset = test_dataset.loc[test_dataset['CategoryCode'] == 'category_3'].copy()
cat_4_test_dataset = test_dataset.loc[test_dataset['CategoryCode'] == 'category_4'].copy()