In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import mean_squared_error, root_mean_squared_error, mean_absolute_error

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

## Creating Necessary Functions

In [2]:
def preprocess(df):
    """
    Clean the raw sales table and index it by date.

    Steps, in order:
      1. Drop the 'Unnamed: 0' column when it is present.
      2. Unless the index is already named 'Date', parse the 'Date' column to
         datetime, make it the index and sort the rows chronologically.
      3. Backward-fill missing values: every gap takes the next valid value
         found further down the same column.

    The frame passed in is never modified; a cleaned copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with either a 'Date' column or an index named 'Date'.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.

    Raises
    ------
    KeyError
        If the index is not named 'Date' and there is no 'Date' column.
    """
    # drop() returns a new frame, so the caller's object stays untouched;
    # errors='ignore' turns the call into a plain copy when the column is absent.
    df = df.drop(columns=['Unnamed: 0'], errors='ignore')

    if df.index.name != 'Date':
        df = (
            df.assign(Date=pd.to_datetime(df['Date']))  # string -> datetime
            .set_index('Date')
            .sort_index()
        )

    # DataFrame.bfill() replaces the deprecated fillna(method='bfill').
    return df.bfill()

In [3]:
def feature_engineering(df):
    """
    Add calendar features, min-max scale the numeric columns and build 'Size_Type'.

    New columns
    -----------
    Month, Week, Year, Quarter
        Calendar parts taken from the datetime index (seasonality signals).
        'Week' is the ISO week number.
    Size_Type
        Element-wise product of the already scaled 'Size' and 'Type'.
        Assumes 'Type' is numeric at this point -- TODO confirm upstream encoding.

    The columns listed in `features_to_scale` are rescaled to [0, 1] with
    MinMaxScaler so that large-valued columns do not dominate.

    The frame passed in is never modified; an augmented copy is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index and the columns named in `features_to_scale`
        plus 'Type'.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the scaled columns and the new feature columns.
    """
    df = df.copy()  # keep the caller's frame intact
    dates = df.index

    df['Month'] = dates.month
    # isocalendar() returns pandas' nullable UInt32 dtype. Store plain numpy
    # values instead so the column converts cleanly to a numeric array later;
    # fall back to float only when the index holds missing dates.
    iso_week = dates.isocalendar().week
    week_dtype = 'float64' if iso_week.isna().any() else 'int64'
    df['Week'] = iso_week.astype(week_dtype).to_numpy()
    df['Year'] = dates.year
    df['Quarter'] = dates.quarter

    features_to_scale = ['Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment', 'Size']
    # NOTE(review): the scaler is fitted on every row it is given. If the frame
    # is split into train/validation/test afterwards, the held-out rows have
    # influenced the scaling -- consider fitting on the training rows only.
    scaler = MinMaxScaler()
    df[features_to_scale] = scaler.fit_transform(df[features_to_scale])

    # Combine the two highly correlated features into one (scaled Size * Type).
    df['Size_Type'] = df['Size'] * df['Type']

    return df

In [4]:
def add_lags(df, group_cols=('Store', 'Dept')):
    """
    Add lagged copies of 'Weekly_Sales' as the columns 'lag1', 'lag2' and 'lag3'.

    lag1 is the value 1 week earlier, lag2 the value 5 weeks earlier (roughly
    one month) and lag3 the value 52 weeks earlier (one year).

    The lookup is done per series: a row only receives the earlier value of a
    row that has the same values in `group_cols`. A lookup keyed on the date
    alone would hand every row the sales of whichever row happened to be last
    for that date, regardless of store or department.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime index and a 'Weekly_Sales' column.
    group_cols : sequence of str, default ('Store', 'Dept')
        Columns identifying an individual series. Names that are not columns of
        `df` are ignored; when none are present the lookup is by date only.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with the three lag columns added. A lag is NaN where the
        series has no observation at the earlier date.
    """
    lag_weeks = {'lag1': 1, 'lag2': 5, 'lag3': 52}

    df = df.copy()  # keep the caller's frame intact
    dates = df.index
    key_arrays = [df[col].to_numpy() for col in group_cols if col in df.columns]

    def lookup_key(when):
        """Build the (group..., date) key for the given dates."""
        if not key_arrays:
            return when
        return pd.MultiIndex.from_arrays([*key_arrays, when])

    history = pd.Series(df['Weekly_Sales'].to_numpy(), index=lookup_key(dates))
    # reindex() needs unique keys; on duplicates the last row wins.
    history = history[~history.index.duplicated(keep='last')]

    for column, weeks in lag_weeks.items():
        earlier = dates - pd.Timedelta(weeks=weeks)
        df[column] = history.reindex(lookup_key(earlier)).to_numpy()
    return df

## Data Preprocessing
- Use the functions defined above to preprocess the data, engineer features, and add lag features
- Define Features and Target for the Model
- Use a simple train_test_split to get train, validation, and test sets for our model training

In [31]:
# Load the raw CSV and pass it through the three preparation steps in order:
# cleaning, feature engineering, then lag features.
df = (
    pd.read_csv('../data/walmart_data.csv')
    .pipe(preprocess)
    .pipe(feature_engineering)
    .pipe(add_lags)
)

  df.fillna(method='bfill', inplace=True)


In [43]:
# Average number of rows that share a single date. Only the last expression of
# a cell is displayed, so this is the one value the cell reports.
df.groupby('Date').size().mean()

2948.041958041958

In [None]:
# Model inputs: identifiers and store attributes, economic indicators,
# calendar parts, and the lagged sales values.
features = [
    'Store',
    'Size_Type',
    'Dept',
    'IsHoliday',
    'CPI',
    'Unemployment',
    'Month',
    'Week',
    'Year',
    'Quarter',
    'lag1',
    'lag2',
    'lag3',
]

# Column the model is trained to predict.
target = 'Weekly_Sales'

In [None]:
# Rows with a missing input or target (e.g. no complete lag history yet) would
# feed NaNs into the network and turn the loss into NaN, so drop them first.
model_df = df.dropna(subset=features + [target])

# Ordered 80/10/10 split. shuffle=False keeps the existing row order, so with
# date-sorted rows the training block comes first in time.
X_train, X_temp, y_train, y_temp = train_test_split(
    model_df[features], model_df[target], test_size=0.2, shuffle=False
)
# First half of the hold-out is the validation set, second half the test set,
# i.e. train -> validation -> test in row order. The final test rows therefore
# never precede the rows used for model selection.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, shuffle=False
)


def _to_lstm_input(frame):
    """Turn a 2-D feature table into a float32 array of shape (samples, 1, features)."""
    values = frame.astype('float32').to_numpy()
    return values.reshape(values.shape[0], 1, values.shape[1])


# An LSTM layer consumes 3-D input (samples, timesteps, features). Each row is
# presented as a sequence of length 1; the temporal information is carried by
# the lag columns.
X_train, X_val, X_test = (_to_lstm_input(part) for part in (X_train, X_val, X_test))
y_train, y_val, y_test = (
    part.astype('float32').to_numpy() for part in (y_train, y_val, y_test)
)

## Creating the LSTM Model

In [None]:
# Two stacked 50-unit LSTM layers, each followed by 20% dropout, feeding a
# single linear output unit. The first LSTM returns the full sequence so the
# second one has a sequence to consume.
nn_model = Sequential()

layer_stack = [
    LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    Dropout(0.2),
    LSTM(units=50, return_sequences=False),
    Dropout(0.2),
    Dense(units=1, activation='linear'),
]
for layer in layer_stack:
    nn_model.add(layer)

nn_model.compile(loss='mean_squared_error', optimizer='adam')

#### TODO:
- Sort out the input-shape issue that occurs when creating the LSTM.
- The issue arises because the dataset has multiple entries (one per store and department) at each point in time.

## Training The Model
- Train with training and validation set
- Test with Test set

In [None]:
# Fit for 50 epochs in mini-batches of 32, passing (X_val, y_val) as the
# validation data; the returned object is kept in `history`.
history = nn_model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_val, y_val),
)

In [None]:
# Evaluate the fitted network on the held-out test set and report the result.
loss = nn_model.evaluate(
    x=X_test,
    y=y_test,
)
print(f'Test Loss: {loss}')