# AML Project - Time Series Forecasting

## Data Preparation

### 1. Utility / Loading Data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _subfolders, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



In [2]:
# Folder that holds the competition CSV files.
path = '/kaggle/input/store-sales-time-series-forecasting/'

# Read each CSV into its own DataFrame; the names on the left line up,
# in order, with the file names listed below.
oil, holidays, stores, train, transactions, test = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'oil.csv',
        'holidays_events.csv',
        'stores.csv',
        'train.csv',
        'transactions.csv',
        'test.csv',
    )
)



## Process Data




In [3]:
# Convert the 'date' column of every table that has one to datetime,
# so the later .dt accessors, date comparisons and merges on 'date' work.
for frame in (oil, holidays, train, transactions, test):
    frame['date'] = pd.to_datetime(frame['date'])

In [4]:
from sklearn.preprocessing import LabelEncoder
# Put the oil prices on a gap-free daily index, then fill the missing prices:
# linear interpolation first, forward/backward fill for whatever is still empty.
daily_oil = oil.set_index('date').asfreq('D')
daily_oil['dcoilwtico'] = daily_oil['dcoilwtico'].interpolate('linear').ffill().bfill()
oil = daily_oil.reset_index()

# Attach the oil price to every training row (merge on the shared 'date' column).
train = train.merge(oil).rename(columns={"dcoilwtico": "oilprice"})


# Encode the product family names as integer ids.
encoder_family = LabelEncoder()
train['family_id'] = encoder_family.fit_transform(train['family'])


# Break the date up into separate calendar features.
date_parts = train['date'].dt
train['day'] = date_parts.day
train['month'] = date_parts.month
train['weekday'] = date_parts.dayofweek
train['year'] = date_parts.year

# Remove noise: drop the half year following the earthquake (2016-04-16 .. 2016-10-16, inclusive).
in_quake_window = train['date'].between('2016-04-16', '2016-10-16')
train = train.drop(index=train.index[in_quake_window])

In [5]:
# Build a per-day 'holiday' flag (True = non-working day) and attach it,
# together with the store metadata, to both train and test.

# Only national entries are used; when several entries share a date, the first one wins.
holidays_short = holidays.set_index('date').sort_index()
holidays_short = holidays_short[holidays_short.locale == 'National']
holidays_short = holidays_short.groupby(holidays_short.index).first()[['type', 'transferred']]

# One row per calendar day over the period covered by train and test.
calendar = pd.DataFrame(index=pd.date_range('2013-01-01', '2017-08-31'))
calendar['weekday'] = calendar.index.dayofweek

# Weekends (dayofweek 5 and 6) are non-working days by default.
# Fix: the original tested `calendar.holiday > 4` on an all-False column,
# so no weekend was ever flagged.
calendar['holiday'] = calendar['weekday'] > 4

calendar = calendar.merge(holidays_short, how='left', left_index=True, right_index=True)

free = ['Bridge', 'Transfer', 'Holiday']
# A holiday that was transferred to another date is not a day off on its
# official date: such a day keeps its plain weekday/weekend default.
moved_away = (calendar.type == 'Holiday') & (calendar.transferred == True)
calendar.loc[calendar.type.isin(free) & ~moved_away, 'holiday'] = True
# 'Work Day' entries are worked even though they would otherwise be days off.
calendar.loc[calendar.type == 'Work Day', 'holiday'] = False

train = train.merge(calendar['holiday'], left_on='date', right_index=True)
train = train.merge(stores[['store_nbr', 'type', 'cluster']], how='left', on='store_nbr')

test = test.merge(calendar['holiday'], left_on='date', right_index=True)
test = test.merge(stores[['store_nbr', 'type', 'cluster']], how='left', on='store_nbr')

# Encode the store type as an integer id. The encoder is fitted on train only and
# reused for test, so both frames share one mapping.
# Fix: the original re-fitted the encoder on test.
encoder_type = LabelEncoder()
train['type'] = encoder_type.fit_transform(train['type'])
test['type'] = encoder_type.transform(test['type'])

# Show a small preview instead of the whole frame.
train.head()

In [6]:
# Apply to the test set the same feature engineering that was applied to train.

# Attach the (already expanded and interpolated) oil price by date.
test = test.merge(oil)
test = test.rename(columns={"dcoilwtico": "oilprice"})


# Encode the product family with the encoder that was fitted on train, so that a
# family gets the same id in both frames.
# Fix: the original called fit_transform here, re-fitting the encoder on test.
test['family_id'] = encoder_family.transform(test['family'])


# Break the date up into separate calendar features (vectorised .dt accessors,
# the same way it is done for train).
test['day'] = test['date'].dt.day
test['month'] = test['date'].dt.month
test['weekday'] = test['date'].dt.dayofweek
test['year'] = test['date'].dt.year

## Model

In [7]:
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.layers.convolutional import Conv1D, MaxPooling1D


# Feature columns, the matching feature matrix and the flattened sales target.
cols = ['store_nbr', 'onpromotion', 'oilprice', 'holiday', 'weekday', 'day', 'month', 'type', 'cluster', 'family_id']
X = train.loc[:, cols].values
Y = np.ravel(train['sales'].values)

In [8]:
# Number of model inputs: one per entry of `cols`.
n_features = len(cols)

# Look for a TPU; a ValueError raised here is treated as "no TPU available".
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

# Without a TPU use the strategy returned by tf.distribute.get_strategy();
# otherwise connect to the TPU, initialise it and build a TPU strategy.
if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: ", strategy.num_replicas_in_sync)

# Convolution hyper-parameters read by CNN_Model (pool_size is not used by it).
filter_size, kernel_size, pool_size = 16, 4, 4
    
def CNN_Model(features, output = 1):
    """Build and compile a small 1-D convolutional regression network.

    The input has shape ``(features, 1)``. It passes through three identical
    stages (Conv1D -> BatchNormalization -> ReLU -> Dropout(0.2)), a global
    average pooling layer and a Dense layer with ``output`` units and softplus
    activation. The number of filters and the kernel size are read from the
    module-level ``filter_size`` and ``kernel_size``.

    Parameters
    ----------
    features : int
        Length of the first input axis.
    output : int, default 1
        Number of units of the final Dense layer.

    Returns
    -------
    keras Model compiled with the 'adam' optimizer, mean squared logarithmic
    error as loss and RootMeanSquaredError as metric.
    """
    input_layer = keras.layers.Input((features, 1))

    # Three identical convolution stages stacked on top of each other.
    hidden = input_layer
    for _ in range(3):
        hidden = keras.layers.Conv1D(filters=filter_size, kernel_size=kernel_size, padding="same")(hidden)
        hidden = keras.layers.BatchNormalization()(hidden)
        hidden = keras.layers.ReLU()(hidden)
        hidden = keras.layers.Dropout(0.2)(hidden)

    pooled = keras.layers.GlobalAveragePooling1D()(hidden)
    prediction = keras.layers.Dense(output, activation="softplus")(pooled)

    model = keras.models.Model(inputs=input_layer, outputs=prediction)
    model.compile(
        optimizer='adam',
        loss="mean_squared_logarithmic_error",
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return model
from sklearn import preprocessing

def scale(X,Y):
    """Standardize two arrays with a StandardScaler fitted on the first one.

    Note that ``Y`` is passed through the scaler fitted on ``X``; it is a second
    feature matrix (the notebook calls ``scale(X_1, X_2)`` with train and
    hold-out features), not a target vector.

    Parameters
    ----------
    X : array-like
        Data the scaler is fitted on; also returned transformed.
    Y : array-like
        Data transformed with the scaler fitted on ``X``.

    Returns
    -------
    tuple
        ``(X_scaled, Y_scaled)``.
    """
    scaler = preprocessing.StandardScaler()
    scaler.fit(X)
    return scaler.transform(X), scaler.transform(Y)

## Testing

In [9]:
# Time-based hold-out split: rows dated before the cutoff are used for fitting,
# rows dated on or after it for evaluation. The X_* and Y_* frames are separate
# full copies of the same rows.
cutoff = '2017-08-01'
before_cutoff = train['date'] < cutoff
from_cutoff = train['date'] >= cutoff

X_train = train.loc[before_cutoff].copy()
X_test = train.loc[from_cutoff].copy()
Y_train = train.loc[before_cutoff].copy()
Y_test = train.loc[from_cutoff].copy()

In [14]:
# add trend column

from sklearn.linear_model import LinearRegression
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Highest store number in the training data.
store_nbr = train['store_nbr'].max()

# Placeholder columns for the trend features that are filled in further down.
# Fix: they are created as floats (0.0 instead of 0) because the values written
# into them later are floating-point regression outputs; writing floats into an
# integer column through .loc triggers pandas' incompatible-dtype warning.
for frame in (X_train, X_test):
    for trend_col in ('trend', 'trend_store', 'trend_family'):
        frame[trend_col] = 0.0

def get_trend(df, predict = 17, start_date = '2017-08-15', end_date = '2017-08-31'):
    """Create the deterministic trend terms used to fit and to forecast a trend.

    A ``DeterministicProcess`` with a constant and a trend of order 5 is built
    on ``df['date']``.

    Parameters
    ----------
    df : DataFrame
        Must contain a 'date' column; it is used as the index of the process.
    predict : int, default 17
        Number of out-of-sample steps to generate.
    start_date, end_date : date-like
        Bounds of the daily index attached to the out-of-sample terms.

    Returns
    -------
    tuple
        ``(in-sample terms, out-of-sample terms)``.
    """
    process = DeterministicProcess(
        index=df['date'],   # dates of the rows the trend is fitted on
        constant=True,      # include the intercept
        order=5,
        drop=True,          # drop terms to avoid collinearity
    )
    in_sample_terms = process.in_sample()
    future_days = pd.date_range(start_date, end_date, freq = 'D')
    return in_sample_terms, process.out_of_sample(steps = predict, forecast_index = future_days)


# Trend features: fit a polynomial trend (get_trend + LinearRegression) on three
# levels -- all rows, per product family, per store -- and store the fitted values
# in X_train and the forecast values in X_test.
#
# Fixes compared with the previous version of this cell:
#   * the family loop used `family_nbr`, which is only defined in a later cell, so
#     the cell failed on a fresh kernel; groups are now taken from the data itself;
#   * the forecast dates were hard-coded to 2017-08-15..2017-08-31, which does not
#     match the dates in X_test, leaving the trend of most hold-out rows at 0;
#     they are now derived from X_test;
#   * the three copy-pasted blocks are replaced by two helpers.
#
# NOTE(review): get_trend is given one index entry per row (dates repeat), so the
# trend terms appear to be generated per row rather than per day -- confirm that
# this is intended.

# Days for which a trend forecast is needed: every day covered by the hold-out frame.
dates_to_predict = pd.date_range(X_test['date'].min(), X_test['date'].max(), freq='D')


def _fit_trend(train_part, forecast_dates):
    """Fit the trend on ``train_part`` and forecast it for ``forecast_dates``.

    Returns ``(fitted, forecast)``: a 1-D array with one fitted value per row of
    ``train_part`` and a Series of forecast values indexed by ``forecast_dates``.
    """
    trend_train, trend_test = get_trend(
        train_part[['date', 'sales']],
        predict=len(forecast_dates),
        start_date=forecast_dates[0],
        end_date=forecast_dates[-1],
    )
    # The intercept is already part of the trend terms (constant=True in get_trend).
    trend_model = LinearRegression(fit_intercept=False)
    trend_model.fit(trend_train, train_part['sales'].values)
    fitted = trend_model.predict(trend_train)
    forecast = pd.Series(trend_model.predict(trend_test), index=forecast_dates)
    return fitted, forecast


def _add_group_trend(train_frame, test_frame, column, group_col, forecast_dates):
    """Write one trend per distinct value of ``group_col`` into ``column``.

    Both frames are modified in place. Only groups present in ``train_frame``
    are processed.
    """
    # Make sure the target columns can hold the floating-point trend values.
    train_frame[column] = train_frame[column].astype(float)
    test_frame[column] = test_frame[column].astype(float)

    for key in sorted(train_frame[group_col].unique()):
        train_rows = train_frame[group_col] == key
        test_rows = test_frame[group_col] == key

        fitted, forecast = _fit_trend(train_frame.loc[train_rows], forecast_dates)
        train_frame.loc[train_rows, column] = fitted
        # Look up the forecast of each hold-out row by its date.
        test_frame.loc[test_rows, column] = (
            test_frame.loc[test_rows, 'date'].map(forecast).to_numpy()
        )


#global trend
fitted, forecast = _fit_trend(X_train, dates_to_predict)
X_train['trend'] = fitted
X_test['trend'] = X_test['date'].map(forecast).to_numpy()

#trend families
_add_group_trend(X_train, X_test, 'trend_family', 'family_id', dates_to_predict)

#trend stores
_add_group_trend(X_train, X_test, 'trend_store', 'store_nbr', dates_to_predict)

# Show a small preview instead of the whole frame.
X_test.head()

In [None]:
from sklearn.metrics import mean_squared_log_error

# Train one CNN per product family on the training rows of that family, predict
# the hold-out rows and report the RMSLE per family and over all families.

family_nbr = train['family_id'].max()
cols = ['onpromotion', 'oilprice', 'holiday', 'weekday', 'day', 'month', 'cluster', 'store_nbr', 'trend', 'trend_family', 'trend_store']
n_features = len(cols)

# X_test['sales'] is overwritten with the predictions; the true values stay in Y_test.
# Created as float because the predictions written into it are floats.
X_test['sales'] = 0.0


for fam in range(family_nbr + 1):
    idx_train = X_train['family_id'] == fam
    idx_test = X_test['family_id'] == fam

    # One explicit float matrix per split (the feature columns mix bool, int and float).
    X_1 = X_train.loc[idx_train, cols].to_numpy(dtype=float)
    X_2 = X_test.loc[idx_test, cols].to_numpy(dtype=float)
    Y = X_train.loc[idx_train, 'sales'].to_numpy()

    # Standardize with statistics of the training rows only, then add the
    # trailing channel axis expected by the Conv1D input.
    X_1, X_2 = scale(X_1, X_2)
    X_1 = X_1.reshape(-1, n_features, 1)
    X_2 = X_2.reshape(-1, n_features, 1)

    # Fix: build and compile the model inside the distribution strategy chosen
    # earlier in the notebook; previously `strategy` was computed but never used.
    with strategy.scope():
        model = CNN_Model(n_features)

    model.fit(X_1, Y, epochs=10, verbose=0)
    pred = model.predict(X_2).reshape(-1)

    X_test.loc[idx_test, 'sales'] = pred

    # Fix: take the square root explicitly, as the total below already does,
    # instead of relying on the `squared=False` argument, which recent
    # scikit-learn releases no longer accept.
    group_rmsle = np.sqrt(mean_squared_log_error(Y_test.loc[idx_test, 'sales'].values, pred))
    print('group ', fam, ': ', "{:10.4f}".format(group_rmsle))
print('total: ', np.sqrt(mean_squared_log_error(Y_test['sales'].values, X_test['sales'].values)))

#before: 1.05