Preprocess the dataset

In [1]:
def data_preprocessing(df, holiday_events_df, oil_df, lag_days=(1, 7, 30), rolling_days=(7, 30, 60)):
    """Build the modelling frame: calendar, lag, rolling, oil and holiday features.

    The input frames are not modified; all conversions happen on copies.

    Args:
        df: Sales records with at least ``date``, ``store_nbr``, ``family`` and
            ``sales`` columns. ``date`` is merged against a ``pd.date_range``,
            so it presumably has to be datetime-typed already -- TODO confirm.
        holiday_events_df: Holiday table with ``date`` and ``day_type``
            columns. No rename from another column is done here, so
            ``day_type`` must already exist.
        oil_df: Oil prices with ``date`` and ``dcoilwtico`` columns.
        lag_days: Offsets added on top of the fixed shift of 15 for the
            ``lag_<n>`` features.
        rolling_days: Window sizes for the ``rolling_mean_<n>`` features.

    Returns:
        A DataFrame restricted to rows with ``sales > 0`` carrying the
        label-encoded ``family``, calendar features, ``lag_*`` and
        ``rolling_mean_*`` columns, ``oil_price`` and a 0/1 ``day_type``
        holiday flag.
    """
    # Full calendar x store grid spanning the observed date range.
    start_date = pd.to_datetime(df['date'].min())
    end_date = pd.to_datetime(df['date'].max())
    date_df = pd.DataFrame({'date': pd.date_range(start=start_date, end=end_date)})
    unique_store_df = pd.DataFrame({'store_nbr': df['store_nbr'].unique()})
    date_df = cartesian(date_df, unique_store_df)

    df = date_df.merge(df, how='left', on=['date', 'store_nbr'])
    # NOTE(review): this filter also drops the NaN-sales rows introduced by the
    # left merge above, so the grid only survives where sales were recorded.
    # .copy() so the column assignments below act on an independent frame.
    df = df[df['sales'] > 0.0].copy()

    # LabelEncoder expects a 1-D array of labels, not a single-column frame.
    le = LabelEncoder()
    df['family'] = le.fit_transform(df['family'])

    # Create date features
    df['day_of_month'] = df['date'].dt.day
    df['day_of_week'] = df['date'].dt.dayofweek
    df['day_of_year'] = df['date'].dt.dayofyear
    df['month'] = df['date'].dt.month
    df['year'] = df['date'].dt.year
    # dayofweek is Monday=0 ... Sunday=6, so the weekend is 5 (Sat) and 6 (Sun).
    df['is_weekend'] = (df['day_of_week'] >= 5).astype(np.int8)

    # Handling null values.
    # NOTE(review): after the sales > 0 filter there are no NaN sales left, so
    # this forward fill currently has nothing to fill -- confirm the intent.
    df['sales'] = df.groupby(['store_nbr', 'day_of_week'])['sales'].ffill()

    # Features are shifted by at least SHIFT positions within each group.
    # NOTE(review): the groups include day_of_week, so a shift of k moves back
    # k same-weekday observations, not k calendar days -- confirm the intent.
    SHIFT = 15
    sales_by_group = df.groupby(['store_nbr', 'family', 'day_of_week'])['sales']

    # Creating lag features
    for lag in lag_days:
        df['lag_{}'.format(lag)] = sales_by_group.shift(SHIFT + lag).fillna(0.0)

    # Creating rolling features
    for window in rolling_days:
        df['rolling_mean_{}'.format(window)] = sales_by_group.transform(
            lambda s, w=window: s.shift(SHIFT).rolling(w, min_periods=1).mean()
        ).fillna(0.0)

    # Merging oil data (rename returns a new frame, so oil_df stays untouched).
    oil_prices = oil_df.rename(columns={"dcoilwtico": "oil_price"})
    oil_prices = oil_prices.assign(date=pd.to_datetime(oil_prices['date']))
    df = df.merge(oil_prices, how='left', on='date')

    # Filling in missing values: carry the last known price forward.
    df['oil_price'] = df['oil_price'].ffill()
    # Rows before the first known price are still empty; fill them with the
    # rounded mean price of 2013-01-02. If that date is not available, fall
    # back to the next known price instead of failing on a NaN mean.
    reference_price = df.loc[df['date'] == '2013-01-02', 'oil_price'].mean()
    if pd.notna(reference_price):
        df['oil_price'] = df['oil_price'].fillna(round(reference_price))
    else:
        df['oil_price'] = df['oil_price'].bfill()

    # Merging holiday data as a 0/1 flag.
    holiday_flags = holiday_events_df[['date', 'day_type']].copy()
    holiday_flags['date'] = pd.to_datetime(holiday_flags['date'])
    # Missing day_type counts as "no holiday"; any other truthy value as 1.
    holiday_flags['day_type'] = (
        holiday_flags['day_type'].notna() & holiday_flags['day_type'].astype(bool)
    ).astype(int)
    # Collapse to one row per date: several holiday records on the same date
    # would otherwise duplicate every sales row of that date in the merge.
    holiday_flags = holiday_flags.groupby('date', as_index=False)['day_type'].max()
    df = df.merge(holiday_flags, how='left', on='date')
    df['day_type'] = df['day_type'].fillna(0).astype(int)

    return df

In [None]:
# Run the preprocessing pipeline on the raw sales, holiday and oil frames,
# overriding the default lag offsets and rolling windows.
df = data_preprocessing(
    df=data,
    holiday_events_df=holidays,
    oil_df=oil,
    lag_days=[1, 7, 14],
    rolling_days=[7, 30, 60],
)

In [None]:
# Split the frame chronologically into train, validation and test windows.
# Every window is inclusive on both ends.
df['date'] = pd.to_datetime(df['date'])

# Train: 2014-01-01 .. 2016-12-31
train_startdate = df['date'].ge('2014-01-01')
train_enddate = df['date'].le('2016-12-31')
train_duration = train_startdate & train_enddate
traindata = df[train_duration]

# Validation: 2017-01-01 .. 2017-03-31
val_startdate = df['date'].ge('2017-01-01')
val_enddate = df['date'].le('2017-03-31')
val_duration = val_startdate & val_enddate
val_data = df[val_duration]

# Test: 2017-04-01 .. 2017-04-15
test_startdate = df['date'].ge('2017-04-01')
test_enddate = df['date'].le('2017-04-15')
test_duration = test_startdate & test_enddate
test_data = df[test_duration]


# Separate the target from the features; the raw date is not a model input.
y_train = traindata['sales']
X_train = traindata.drop(columns=['sales', 'date'])

y_val = val_data['sales']
X_val = val_data.drop(columns=['sales', 'date'])

y_test = test_data['sales']
X_test = test_data.drop(columns=['sales', 'date'])

In [5]:
#Generating time series from the available training data
num_feature_input = len(X_train.columns)

# Number of consecutive rows fed to the model for each prediction.
history_input = 30

# TimeseriesGenerator is written for array-likes and indexes its inputs by
# position. Indexing a pandas Series with an integer is a label lookup, and
# the training slice keeps the row labels of the full frame, so pass plain
# numpy arrays to keep samples and targets aligned.
generator = TimeseriesGenerator(
    X_train.to_numpy(),
    y_train.to_numpy(),
    length=history_input,
    batch_size=1,
)

# Show the first (window, target) pair as a sanity check.
if len(generator) > 0:
    x, y = generator[0]
    print('%s => %s' % (x, y))

Model

In [None]:
with tf.device('/GPU:0'):
    def MultiStepLSTM_model():
        """Return an uncompiled two-layer LSTM regressor.

        The input shape is ``(history_input, num_feature_input)``, read from
        the notebook globals when the function is called. Both LSTM layers
        have 50 units and are each followed by 20% dropout; the head is a
        single linear unit, so the model emits one value per input window.
        """
        layer_stack = [
            # First LSTM returns the full sequence so the second can consume it.
            LSTM(
                units=50,
                activation='relu',
                return_sequences=True,
                input_shape=(history_input, num_feature_input),
            ),
            Dropout(0.2),
            LSTM(units=50),
            Dropout(0.2),
            Dense(units=1, activation="linear"),
        ]
        return Sequential(layer_stack)

In [None]:
# Instantiate the network under the GPU device scope and print its
# layer-by-layer summary.
with tf.device('/GPU:0'):
    model = MultiStepLSTM_model()
    model.summary()

In [None]:
with tf.device('/GPU:0'):
    # This is a regression on sales, so classification accuracy carries no
    # information; report mean absolute error alongside the MSE loss.
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    # fit_generator is deprecated; fit accepts the generator directly.
    # steps_per_epoch must be a whole number: each epoch covers a quarter of
    # the available batches, and at least one.
    model.fit(
        generator,
        steps_per_epoch=max(1, len(generator) // 4),
        epochs=20,
        verbose=1,
    )