In [0]:
import pandas as pd
import numpy as np
import warnings

import tensorflow as tf
import keras
from keras.layers import LeakyReLU
from keras import optimizers
from keras.utils import plot_model
from keras.models import Sequential, Model
from keras.layers.convolutional import Conv1D, MaxPooling1D
from keras.layers import Dense, LSTM, Flatten
from keras.layers import CuDNNLSTM

from sklearn.model_selection import train_test_split


In [0]:
# Load the raw training and test tables from CSV files in the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test_QkPvNLx.csv')

In [0]:
# (rows, columns) of the raw training table.
train_df.shape

(512087, 11)

In [0]:
# Peek at the first three training rows.
train_df.head(3)

Unnamed: 0,ID,Day_No,Course_ID,Course_Domain,Course_Type,Short_Promotion,Public_Holiday,Long_Promotion,User_Traffic,Competition_Metric,Sales
0,1,1,1,Development,Course,0,1,1,11004,0.007,81
1,2,2,1,Development,Course,0,0,1,13650,0.007,79
2,3,3,1,Development,Course,0,0,1,11655,0.007,75


In [0]:
# (rows, columns) of the raw test table.
test_df.shape

(36000, 9)

In [0]:
# Treat a missing competition score as zero.
train_df['Competition_Metric'] = train_df['Competition_Metric'].fillna(0)

In [0]:
# df_course_1 = train_df.loc[train_df.Course_ID ==1]

In [0]:
def create_lag_features(df, sales_cols, columns_list, lag_days):
    """Return the carried-through columns of ``df`` plus lagged copies of ``sales_cols``.

    For every lag ``i`` from ``lag_days`` down to 1, each column in
    ``sales_cols`` is shifted down by ``i`` rows and appended under the name
    ``<column>_t_<i>``. Rows are shifted purely by position, so ``df`` is
    expected to hold a single series in chronological order (the caller is
    responsible for that).

    Args:
        df: source DataFrame.
        sales_cols: names of the columns to lag.
        columns_list: names of the columns copied through unchanged; the
            caller's list is not modified.
        lag_days: number of lags to generate.

    Returns:
        A new DataFrame indexed like ``df`` whose columns are ``columns_list``
        followed by the lag columns ordered from the oldest lag to the most
        recent one. The first ``lag_days`` rows contain NaN in the lag columns.
        When ``lag_days`` is zero or negative an unmodified copy of ``df`` is
        returned.
    """
    if lag_days <= 0:
        # No lag requested: hand back a plain copy, exactly as an empty loop would.
        return df.copy()

    lags = range(lag_days, 0, -1)

    # Collect every piece first and concatenate once; re-concatenating the
    # growing frame for each lag copies all previously built columns again.
    pieces = [df[columns_list]]
    pieces.extend(df[sales_cols].shift(lag) for lag in lags)
    lagged = pd.concat(pieces, axis=1)

    lagged.columns = list(columns_list) + [
        sales_col + '_t_' + str(lag) for lag in lags for sales_col in sales_cols
    ]
    return lagged

In [0]:
# Columns carried through unchanged, and the daily signals that get lagged.
original_column_list = ['ID', 'Day_No', 'Course_ID', 'Course_Domain', 'Course_Type', 'Public_Holiday', 'Competition_Metric']
sales_cols = ['Sales', 'User_Traffic', 'Long_Promotion','Short_Promotion']
lag_days = 60
# Create lag features for each unique course so a shift never crosses from one
# course into the next. The per-course frames are gathered in a list and
# concatenated once: DataFrame.append inside a loop re-copies everything
# accumulated so far on each pass and no longer exists in pandas 2.0.
# sort=False keeps the courses in order of first appearance, and
# create_lag_features does not modify the column list it is given.
train_lag_df = pd.concat(
    [
        create_lag_features(course_rows, sales_cols, original_column_list, lag_days)
        for _, course_rows in train_df.groupby('Course_ID', sort=False)
    ]
)
print("Created lag features for " + str(lag_days) + " days")

Created lag features for 60 days


In [0]:
# Attach the same-day traffic and sales figures next to the lag columns.
# Values are matched to rows through the shared index labels.
train_lag_df = train_lag_df.assign(
    User_Traffic=train_df['User_Traffic'],
    Sales_Today=train_df['Sales'],
)

In [0]:
# Drop every row that has at least one NaN. The shifts that build the lag
# columns leave NaN in the earliest rows of each course, so those rows
# (which lack a full lag history) are removed here.
train_lag_df = train_lag_df.dropna()

In [0]:
# Show two randomly chosen rows (no seed is passed, so the rows differ between runs).
train_lag_df.sample(2)

Unnamed: 0,ID,Day_No,Course_ID,Course_Domain,Course_Type,Public_Holiday,Competition_Metric,Sales_t_60,User_Traffic_t_60,Long_Promotion_t_60,Short_Promotion_t_60,Sales_t_59,User_Traffic_t_59,Long_Promotion_t_59,Short_Promotion_t_59,Sales_t_58,User_Traffic_t_58,Long_Promotion_t_58,Short_Promotion_t_58,Sales_t_57,User_Traffic_t_57,Long_Promotion_t_57,Short_Promotion_t_57,Sales_t_56,User_Traffic_t_56,Long_Promotion_t_56,Short_Promotion_t_56,Sales_t_55,User_Traffic_t_55,Long_Promotion_t_55,Short_Promotion_t_55,Sales_t_54,User_Traffic_t_54,Long_Promotion_t_54,Short_Promotion_t_54,Sales_t_53,User_Traffic_t_53,Long_Promotion_t_53,Short_Promotion_t_53,Sales_t_52,...,Long_Promotion_t_10,Short_Promotion_t_10,Sales_t_9,User_Traffic_t_9,Long_Promotion_t_9,Short_Promotion_t_9,Sales_t_8,User_Traffic_t_8,Long_Promotion_t_8,Short_Promotion_t_8,Sales_t_7,User_Traffic_t_7,Long_Promotion_t_7,Short_Promotion_t_7,Sales_t_6,User_Traffic_t_6,Long_Promotion_t_6,Short_Promotion_t_6,Sales_t_5,User_Traffic_t_5,Long_Promotion_t_5,Short_Promotion_t_5,Sales_t_4,User_Traffic_t_4,Long_Promotion_t_4,Short_Promotion_t_4,Sales_t_3,User_Traffic_t_3,Long_Promotion_t_3,Short_Promotion_t_3,Sales_t_2,User_Traffic_t_2,Long_Promotion_t_2,Short_Promotion_t_2,Sales_t_1,User_Traffic_t_1,Long_Promotion_t_1,Short_Promotion_t_1,User_Traffic,Sales_Today
124478,133179,821,146,Development,Course,0,0.004,102.0,15813.0,1.0,0.0,140.0,19488.0,1.0,0.0,183.0,24717.0,1.0,1.0,161.0,23583.0,1.0,1.0,133.0,18963.0,1.0,1.0,153.0,21651.0,1.0,1.0,136.0,20328.0,1.0,1.0,114.0,17745.0,1.0,0.0,129.0,...,1.0,0.0,91.0,16275.0,1.0,0.0,90.0,16338.0,1.0,0.0,95.0,16422.0,1.0,0.0,91.0,15498.0,1.0,0.0,87.0,15309.0,1.0,0.0,103.0,17682.0,1.0,0.0,85.0,15666.0,1.0,0.0,208.0,24696.0,1.0,1.0,174.0,20979.0,1.0,1.0,24654,183
39893,42654,426,47,Development,Course,0,0.295,13.0,2373.0,1.0,0.0,146.0,17514.0,1.0,0.0,160.0,18522.0,1.0,0.0,96.0,10752.0,1.0,0.0,76.0,10038.0,1.0,0.0,310.0,26733.0,1.0,1.0,224.0,22617.0,1.0,1.0,200.0,19488.0,1.0,1.0,203.0,...,1.0,1.0,156.0,18396.0,1.0,1.0,100.0,11340.0,1.0,0.0,155.0,18165.0,1.0,0.0,133.0,16968.0,1.0,0.0,132.0,16632.0,1.0,0.0,138.0,17556.0,1.0,0.0,94.0,13062.0,1.0,0.0,151.0,18438.0,1.0,0.0,119.0,13755.0,1.0,0.0,14364,118


In [0]:
# (rows, columns) after lag creation and NaN removal.
train_lag_df.shape

(476087, 249)

In [0]:
# Build the supervised target: for every lagged row, the Sales value (and the
# known covariates) 60 rows later within the same course.
# After shifting by -60, the last 60 rows of each course have no target inside
# the training table; those rows become the "derived test" rows.
# NOTE(review): this presumes the test period directly follows the training
# period for each course - confirm against the data description.
train_target_columns = ['Short_Promotion', 'Public_Holiday', 'Long_Promotion', 'Competition_Metric', 'Sales']
train_target_append_columns = [col+'_t_+60' for col in train_target_columns if 'Sales' not in col]

# Per-course pieces are collected and concatenated once after the loop;
# DataFrame.append in a loop re-copies the accumulated frame on every pass and
# is unavailable from pandas 2.0 onwards.
derived_test_parts = []
actual_training_parts = []
for course_id in train_df.Course_ID.unique():
    train_lag_course_df = train_lag_df.loc[train_lag_df.Course_ID==course_id]
    train_course_df = train_df[train_df.Course_ID==course_id]
    train_target_df = train_course_df[train_target_columns].shift(-60)
    train_target_df.columns = train_target_append_columns + ['Sales']
    # Column-wise concat aligns on the row index shared by both frames.
    temp_actual_training_df = pd.concat([train_lag_course_df, train_target_df], axis=1)
    # Rows without a target -> derived test; fully populated rows -> training.
    derived_test_parts.append(temp_actual_training_df[temp_actual_training_df['Sales'].isna()])
    actual_training_parts.append(temp_actual_training_df.dropna())

# verify_integrity=True raises if any index label occurs more than once.
derived_test_df = pd.concat(derived_test_parts, verify_integrity=True)
actual_training_df = pd.concat(actual_training_parts, verify_integrity=True)
del derived_test_parts, actual_training_parts
print("Created target for train data and derived test data")
print("Actual test data shape ", test_df.shape)
print("Derived test data from train shape ", derived_test_df.shape)

Created target for train data and derived test data
Actual test data shape  (36000, 9)
Derived test data from train shape  (36000, 254)


In [0]:
print("Checking whether course ID in derived test is matching actual test")
# Put both tables in the same (course, day) order, renumber the rows, and
# compare the course IDs position by position.
sort_keys = ['Course_ID', 'Day_No']
derived_course_ids = derived_test_df.sort_values(by=sort_keys)['Course_ID'].reset_index(drop=True)
actual_course_ids = test_df.sort_values(by=sort_keys)['Course_ID'].reset_index(drop=True)
(derived_course_ids == actual_course_ids).value_counts()

Checking whether course ID in derived test is matching actual test


True    36000
Name: Course_ID, dtype: int64

In [0]:
# derived_test_df_copy = derived_test_df.copy()
# derived_test_df = derived_test_df_copy.copy()

In [0]:

# Fill the future ("_t_+60") covariates of the derived test rows, which are
# NaN after the -60 shift, with the real values from the test table.
# Both tables are sorted by (Course_ID, Day_No) AND renumbered 0..n-1 before
# the copy: column assignment aligns on index labels, so a sorted series that
# still carries its original labels would be silently put back into file
# order whenever test_df is not already stored sorted.
derived_test_columns = ['Short_Promotion', 'Public_Holiday', 'Long_Promotion', 'Competition_Metric']
sort_keys = ['Course_ID', 'Day_No']
derived_test_df = derived_test_df.sort_values(by=sort_keys).reset_index(drop=True)
test_sorted_df = test_df.sort_values(by=sort_keys).reset_index(drop=True)
for col in derived_test_columns:
    derived_test_df[col+'_t_+60'] = test_sorted_df[col]
print("Derived test data is prepared")

Derived test data is prepared


In [0]:
# Inspect the first rows of the derived test table.
derived_test_df.head()

Unnamed: 0,ID,Day_No,Course_ID,Course_Domain,Course_Type,Public_Holiday,Competition_Metric,Sales_t_60,User_Traffic_t_60,Long_Promotion_t_60,Short_Promotion_t_60,Sales_t_59,User_Traffic_t_59,Long_Promotion_t_59,Short_Promotion_t_59,Sales_t_58,User_Traffic_t_58,Long_Promotion_t_58,Short_Promotion_t_58,Sales_t_57,User_Traffic_t_57,Long_Promotion_t_57,Short_Promotion_t_57,Sales_t_56,User_Traffic_t_56,Long_Promotion_t_56,Short_Promotion_t_56,Sales_t_55,User_Traffic_t_55,Long_Promotion_t_55,Short_Promotion_t_55,Sales_t_54,User_Traffic_t_54,Long_Promotion_t_54,Short_Promotion_t_54,Sales_t_53,User_Traffic_t_53,Long_Promotion_t_53,Short_Promotion_t_53,Sales_t_52,...,Short_Promotion_t_9,Sales_t_8,User_Traffic_t_8,Long_Promotion_t_8,Short_Promotion_t_8,Sales_t_7,User_Traffic_t_7,Long_Promotion_t_7,Short_Promotion_t_7,Sales_t_6,User_Traffic_t_6,Long_Promotion_t_6,Short_Promotion_t_6,Sales_t_5,User_Traffic_t_5,Long_Promotion_t_5,Short_Promotion_t_5,Sales_t_4,User_Traffic_t_4,Long_Promotion_t_4,Short_Promotion_t_4,Sales_t_3,User_Traffic_t_3,Long_Promotion_t_3,Short_Promotion_t_3,Sales_t_2,User_Traffic_t_2,Long_Promotion_t_2,Short_Promotion_t_2,Sales_t_1,User_Traffic_t_1,Long_Promotion_t_1,Short_Promotion_t_1,User_Traffic,Sales_Today,Short_Promotion_t_+60,Public_Holiday_t_+60,Long_Promotion_t_+60,Competition_Metric_t_+60,Sales
0,823.0,823.0,1.0,Development,Course,1.0,0.007,147.0,16968.0,1.0,1.0,123.0,14364.0,1.0,1.0,115.0,15141.0,1.0,1.0,95.0,12663.0,1.0,1.0,95.0,11550.0,1.0,1.0,41.0,6426.0,1.0,0.0,91.0,11718.0,1.0,0.0,66.0,10311.0,1.0,0.0,83.0,...,0.0,70.0,10206.0,1.0,0.0,68.0,9618.0,1.0,0.0,54.0,7455.0,1.0,0.0,63.0,9660.0,1.0,0.0,158.0,16317.0,1.0,1.0,182.0,17850.0,1.0,1.0,116.0,15876.0,1.0,1.0,111.0,12999.0,1.0,1.0,8820.0,62.0,1,0,1,0.007,
1,824.0,824.0,1.0,Development,Course,0.0,0.007,123.0,14364.0,1.0,1.0,115.0,15141.0,1.0,1.0,95.0,12663.0,1.0,1.0,95.0,11550.0,1.0,1.0,41.0,6426.0,1.0,0.0,91.0,11718.0,1.0,0.0,66.0,10311.0,1.0,0.0,83.0,11907.0,1.0,0.0,85.0,...,0.0,68.0,9618.0,1.0,0.0,54.0,7455.0,1.0,0.0,63.0,9660.0,1.0,0.0,158.0,16317.0,1.0,1.0,182.0,17850.0,1.0,1.0,116.0,15876.0,1.0,1.0,111.0,12999.0,1.0,1.0,62.0,8820.0,1.0,1.0,10626.0,80.0,1,0,1,0.007,
2,825.0,825.0,1.0,Development,Course,0.0,0.007,115.0,15141.0,1.0,1.0,95.0,12663.0,1.0,1.0,95.0,11550.0,1.0,1.0,41.0,6426.0,1.0,0.0,91.0,11718.0,1.0,0.0,66.0,10311.0,1.0,0.0,83.0,11907.0,1.0,0.0,85.0,13461.0,1.0,0.0,67.0,...,0.0,54.0,7455.0,1.0,0.0,63.0,9660.0,1.0,0.0,158.0,16317.0,1.0,1.0,182.0,17850.0,1.0,1.0,116.0,15876.0,1.0,1.0,111.0,12999.0,1.0,1.0,62.0,8820.0,1.0,1.0,80.0,10626.0,1.0,0.0,11193.0,82.0,1,0,1,0.007,
3,826.0,826.0,1.0,Development,Course,1.0,0.007,95.0,12663.0,1.0,1.0,95.0,11550.0,1.0,1.0,41.0,6426.0,1.0,0.0,91.0,11718.0,1.0,0.0,66.0,10311.0,1.0,0.0,83.0,11907.0,1.0,0.0,85.0,13461.0,1.0,0.0,67.0,10185.0,1.0,0.0,65.0,...,0.0,63.0,9660.0,1.0,0.0,158.0,16317.0,1.0,1.0,182.0,17850.0,1.0,1.0,116.0,15876.0,1.0,1.0,111.0,12999.0,1.0,1.0,62.0,8820.0,1.0,1.0,80.0,10626.0,1.0,0.0,82.0,11193.0,1.0,0.0,2562.0,15.0,1,0,1,0.007,
4,827.0,827.0,1.0,Development,Course,0.0,0.007,95.0,11550.0,1.0,1.0,41.0,6426.0,1.0,0.0,91.0,11718.0,1.0,0.0,66.0,10311.0,1.0,0.0,83.0,11907.0,1.0,0.0,85.0,13461.0,1.0,0.0,67.0,10185.0,1.0,0.0,65.0,10479.0,1.0,0.0,50.0,...,0.0,158.0,16317.0,1.0,1.0,182.0,17850.0,1.0,1.0,116.0,15876.0,1.0,1.0,111.0,12999.0,1.0,1.0,62.0,8820.0,1.0,1.0,80.0,10626.0,1.0,0.0,82.0,11193.0,1.0,0.0,15.0,2562.0,1.0,0.0,12222.0,82.0,0,0,1,0.007,


In [0]:
# actual_training_df.reset_index(drop = True).to_csv('actual_train_new.csv',index=False)
# derived_test_df.reset_index(drop= True).to_csv('derived_test_new.csv',index=False)

In [0]:
# Renumbered copies of the training and derived-test tables.
# NOTE: both names are assigned again, with the same expressions, in the
# split cell further down.
model_train_df = actual_training_df.reset_index(drop = True)
model_test_df = derived_test_df.reset_index(drop= True)

In [0]:
def overall_preprocessing(df, is_test=False):
    """Turn a lag-feature table into a purely numeric model input.

    Steps, in order:
      * one-hot encode ``Course_Type`` and ``Course_Domain`` (the dummy columns
        are appended after the existing columns);
      * replace NaN in ``Competition_Metric`` and ``Competition_Metric_t_+60``
        with 0;
      * divide every column whose name contains ``User_Traffic`` by 100;
      * reduce ``Day_No`` modulo 365;
      * drop ``ID``, ``Course_Type`` and ``Course_Domain``.

    The input frame is left untouched, so calling the function twice on the
    same frame gives the same result instead of rescaling traffic again.

    NOTE(review): the dummy columns are derived from the categories present in
    ``df`` alone; two frames with different category sets would come out with
    different columns.

    Args:
        df: DataFrame containing at least the columns named above (and
            ``Sales`` unless ``is_test`` is true).
        is_test: when true, only the feature frame is returned and a ``Sales``
            column, if present, is discarded.

    Returns:
        ``features`` when ``is_test`` is true, otherwise ``(features, target)``
        where ``target`` is a one-column DataFrame holding ``Sales``.
    """
    course_type = pd.get_dummies(df['Course_Type'])
    course_domain = pd.get_dummies(df['Course_Domain'])
    user_traffic_columns = [col for col in df.columns if 'User_Traffic' in col]

    # The concat result is a new frame; every later edit is applied to it so
    # the caller's DataFrame is never modified.
    df_processed = pd.concat([df, course_type, course_domain], axis=1)

    df_processed['Competition_Metric'] = df_processed['Competition_Metric'].fillna(0)
    df_processed['Competition_Metric_t_+60'] = df_processed['Competition_Metric_t_+60'].fillna(0)
    if user_traffic_columns:
        df_processed[user_traffic_columns] = df_processed[user_traffic_columns] / 100
    df_processed['Day_No'] = df_processed['Day_No'].mod(365)
    df_processed = df_processed.drop(columns=['ID', 'Course_Type', 'Course_Domain'])

    if is_test:
        # A test frame may or may not carry a (placeholder) Sales column.
        df_processed = df_processed.drop(columns=['Sales'], errors='ignore')
        print("Test shape: " + str(df_processed.shape))
        return df_processed

    target = df_processed[['Sales']]
    df_processed = df_processed.drop(columns=['Sales'])
    print("Train shape: "+str(df_processed.shape))
    return df_processed, target

In [0]:
# model_encoded_train_df, model_target_df = overall_preprocessing(model_train_df)
# model_encoded_test_df = overall_preprocessing(model_test_df, True)
# model_encoded_train_df.head(2)
# model_encoded_test_df.head(2)
# model_encoded_test_df.isnull().any().value_counts()

In [0]:
# Create train, test, cross validation splits.
# For every course the last `holdout_days` rows form the holdout set and all
# earlier rows go to the train / cross-validation pool. The slices rely on the
# rows of each course being in chronological order.
holdout_days = 60

# Gather the per-course slices and concatenate once; DataFrame.append inside a
# loop re-copies the accumulated frame every time and is gone in pandas 2.0.
train_cross_val_parts = []
holdout_parts = []
for _, course_rows in actual_training_df.groupby('Course_ID', sort=False):
    train_cross_val_parts.append(course_rows[:-holdout_days])
    holdout_parts.append(course_rows[-holdout_days:])
train_cross_val_df = pd.concat(train_cross_val_parts, ignore_index=True)
holdout_df = pd.concat(holdout_parts, ignore_index=True)
del train_cross_val_parts, holdout_parts

model_train_df = actual_training_df.reset_index(drop = True)
model_train_cross_val_df = train_cross_val_df.reset_index(drop=True)
model_holdout_df = holdout_df.reset_index(drop=True)
model_test_df = derived_test_df.reset_index(drop= True)

# Encode each split separately into (features, target); the test split has no target.
model_encoded_train_df, model_target_df = overall_preprocessing(model_train_df)
model_encoded_train_cross_val_df, model_target_train_cross_val_df = overall_preprocessing(model_train_cross_val_df)
model_encoded_holdout_df, model_target_holdout_df = overall_preprocessing(model_holdout_df)
model_encoded_test_df = overall_preprocessing(model_test_df, True)

# Feature frame and target array used for the train/validation split later on.
X = model_encoded_train_cross_val_df
y = model_target_train_cross_val_df.values


Train shape: (440087, 257)
Train shape: (404087, 257)
Train shape: (36000, 257)
Test shape: (36000, 257)


In [0]:
# Persist the encoded full-train and train/cross-validation tables, each with
# its target column appended on the right.
frames_to_save = [
    (model_encoded_train_df, model_target_df, 'model_encoded_train_df.csv'),
    (model_encoded_train_cross_val_df, model_target_train_cross_val_df, 'model_encoded_train_cross_val_df.csv'),
]
for features_df, target_df, csv_name in frames_to_save:
    pd.concat([features_df, target_df], axis=1).to_csv(csv_name, index=False)

In [0]:
# First few holdout targets.
model_target_holdout_df.head()

Unnamed: 0,Sales
0,62.0
1,80.0
2,82.0
3,15.0
4,82.0


In [0]:
# pd.concat([model_encoded_holdout_df, model_target_holdout_df], axis=1).to_csv('model_encoded_holdout_df.csv', index=False)
# model_encoded_test_df.to_csv('model_encoded_test_df.csv', index=False)

In [0]:
# Bind the holdout features/targets under short names, then report the sizes.
X_holdout = model_encoded_holdout_df
y_holdout = model_target_holdout_df.values
print('model_encoded_train_cross_val set shape', model_encoded_train_cross_val_df.shape)
print('model_encoded_holdout set shape', X_holdout.shape)
X_holdout.head()

model_encoded_train_cross_val set shape (404087, 257)
model_encoded_holdout set shape (36000, 257)


Unnamed: 0,Day_No,Course_ID,Public_Holiday,Competition_Metric,Sales_t_60,User_Traffic_t_60,Long_Promotion_t_60,Short_Promotion_t_60,Sales_t_59,User_Traffic_t_59,Long_Promotion_t_59,Short_Promotion_t_59,Sales_t_58,User_Traffic_t_58,Long_Promotion_t_58,Short_Promotion_t_58,Sales_t_57,User_Traffic_t_57,Long_Promotion_t_57,Short_Promotion_t_57,Sales_t_56,User_Traffic_t_56,Long_Promotion_t_56,Short_Promotion_t_56,Sales_t_55,User_Traffic_t_55,Long_Promotion_t_55,Short_Promotion_t_55,Sales_t_54,User_Traffic_t_54,Long_Promotion_t_54,Short_Promotion_t_54,Sales_t_53,User_Traffic_t_53,Long_Promotion_t_53,Short_Promotion_t_53,Sales_t_52,User_Traffic_t_52,Long_Promotion_t_52,Short_Promotion_t_52,...,User_Traffic_t_7,Long_Promotion_t_7,Short_Promotion_t_7,Sales_t_6,User_Traffic_t_6,Long_Promotion_t_6,Short_Promotion_t_6,Sales_t_5,User_Traffic_t_5,Long_Promotion_t_5,Short_Promotion_t_5,Sales_t_4,User_Traffic_t_4,Long_Promotion_t_4,Short_Promotion_t_4,Sales_t_3,User_Traffic_t_3,Long_Promotion_t_3,Short_Promotion_t_3,Sales_t_2,User_Traffic_t_2,Long_Promotion_t_2,Short_Promotion_t_2,Sales_t_1,User_Traffic_t_1,Long_Promotion_t_1,Short_Promotion_t_1,User_Traffic,Sales_Today,Short_Promotion_t_+60,Public_Holiday_t_+60,Long_Promotion_t_+60,Competition_Metric_t_+60,Course,Degree,Program,Business,Development,Finance & Accounting,Software Marketing
0,33.0,1.0,0.0,0.007,120.0,140.7,1.0,1.0,115.0,141.96,1.0,1.0,52.0,81.06,1.0,0.0,112.0,140.7,1.0,0.0,86.0,129.36,1.0,0.0,95.0,125.16,1.0,0.0,105.0,150.57,1.0,0.0,83.0,113.19,1.0,0.0,79.0,104.37,1.0,0.0,...,133.98,1.0,1.0,95.0,120.75,1.0,1.0,109.0,151.2,1.0,1.0,86.0,113.19,1.0,1.0,106.0,131.04,1.0,1.0,55.0,67.83,1.0,0.0,84.0,114.87,1.0,0.0,169.68,147.0,1.0,1.0,1.0,0.007,1,0,0,0,1,0,0
1,34.0,1.0,0.0,0.007,115.0,141.96,1.0,1.0,52.0,81.06,1.0,0.0,112.0,140.7,1.0,0.0,86.0,129.36,1.0,0.0,95.0,125.16,1.0,0.0,105.0,150.57,1.0,0.0,83.0,113.19,1.0,0.0,79.0,104.37,1.0,0.0,60.0,73.5,1.0,0.0,...,120.75,1.0,1.0,109.0,151.2,1.0,1.0,86.0,113.19,1.0,1.0,106.0,131.04,1.0,1.0,55.0,67.83,1.0,0.0,84.0,114.87,1.0,0.0,147.0,169.68,1.0,1.0,143.64,123.0,0.0,0.0,1.0,0.007,1,0,0,0,1,0,0
2,35.0,1.0,0.0,0.007,52.0,81.06,1.0,0.0,112.0,140.7,1.0,0.0,86.0,129.36,1.0,0.0,95.0,125.16,1.0,0.0,105.0,150.57,1.0,0.0,83.0,113.19,1.0,0.0,79.0,104.37,1.0,0.0,60.0,73.5,1.0,0.0,76.0,105.42,1.0,0.0,...,151.2,1.0,1.0,86.0,113.19,1.0,1.0,106.0,131.04,1.0,1.0,55.0,67.83,1.0,0.0,84.0,114.87,1.0,0.0,147.0,169.68,1.0,1.0,123.0,143.64,1.0,1.0,151.41,115.0,0.0,0.0,1.0,0.007,1,0,0,0,1,0,0
3,36.0,1.0,0.0,0.007,112.0,140.7,1.0,0.0,86.0,129.36,1.0,0.0,95.0,125.16,1.0,0.0,105.0,150.57,1.0,0.0,83.0,113.19,1.0,0.0,79.0,104.37,1.0,0.0,60.0,73.5,1.0,0.0,76.0,105.42,1.0,0.0,188.0,190.89,1.0,1.0,...,113.19,1.0,1.0,106.0,131.04,1.0,1.0,55.0,67.83,1.0,0.0,84.0,114.87,1.0,0.0,147.0,169.68,1.0,1.0,123.0,143.64,1.0,1.0,115.0,151.41,1.0,1.0,126.63,95.0,0.0,1.0,1.0,0.007,1,0,0,0,1,0,0
4,37.0,1.0,0.0,0.007,86.0,129.36,1.0,0.0,95.0,125.16,1.0,0.0,105.0,150.57,1.0,0.0,83.0,113.19,1.0,0.0,79.0,104.37,1.0,0.0,60.0,73.5,1.0,0.0,76.0,105.42,1.0,0.0,188.0,190.89,1.0,1.0,171.0,181.86,1.0,1.0,...,131.04,1.0,1.0,55.0,67.83,1.0,0.0,84.0,114.87,1.0,0.0,147.0,169.68,1.0,1.0,123.0,143.64,1.0,1.0,115.0,151.41,1.0,1.0,95.0,126.63,1.0,1.0,115.5,95.0,0.0,0.0,1.0,0.007,1,0,0,0,1,0,0


In [0]:
# Release the large intermediate tables that are no longer needed by name.
# X_holdout and y_holdout were bound before this cell, so the holdout data
# stays reachable even though model_encoded_holdout_df and
# model_target_holdout_df are deleted here. test_df is kept because it is
# still read further down.
import gc
del train_df, actual_training_df, train_cross_val_df, holdout_df, derived_test_df
del model_train_cross_val_df
del model_holdout_df
del model_train_df
del model_encoded_holdout_df, model_target_holdout_df
# Ask the garbage collector to reclaim the freed objects right away.
gc.collect()

0

In [0]:
# Count the feature columns that do / do not contain any NaN.
X.isnull().any().value_counts()

False    257
dtype: int64

### LSTM for Time Series Forecasting

* The LSTM model treats its input as an ordered sequence, so it can learn patterns from sequential data (assuming such patterns exist) better than the other models, especially patterns that span long sequences.
* Input shape **[samples, timesteps, features]**.

In [0]:
# IDs for the submission file.
# The test feature table is built in (Course_ID, Day_No) order, so predictions
# come out in that order too. The IDs are put in the same order and renumbered
# 0..n-1 so each ID sits next to its own prediction even if the test file is
# not stored sorted.
df_pred_ID = test_df.sort_values(by=['Course_ID','Day_No'])['ID'].reset_index(drop=True)
#save_submission(df_pred_ID, model_mlp.predict(model_encoded_test_df).flatten(),'MLP_Time_series_70_lag_traffic_Sales_feature_added_leaky_relu')


In [0]:
def save_submission(df_pred_ID, prediction, filename):
    """Write a submission CSV pairing each ID with its predicted ``Sales``.

    IDs and predictions are paired by position: the first ID goes with the
    first prediction, and so on, regardless of the index labels carried by
    ``df_pred_ID``.

    Args:
        df_pred_ID: pandas Series (or DataFrame) of row IDs, in the same order
            as ``prediction``.
        prediction: iterable of predicted values, one per ID.
        filename: output name without extension; the file is written to
            ``./<filename>.csv`` without the index column.

    Raises:
        ValueError: if the number of IDs differs from the number of predictions.
    """
    sales = list(prediction)
    if len(df_pred_ID) != len(sales):
        raise ValueError(
            "Number of IDs (%d) does not match number of predictions (%d)"
            % (len(df_pred_ID), len(sales))
        )
    # concat(axis=1) aligns on index labels; renumbering the IDs 0..n-1 makes
    # them line up with the freshly built prediction frame.
    ids = df_pred_ID.reset_index(drop=True)
    result = pd.concat([ids, pd.DataFrame({'Sales': sales})], axis=1)
    result.to_csv('./' + filename + '.csv', index=False)

In [0]:
from keras.layers import CuDNNLSTM

In [0]:
# Training hyper-parameters and the shared optimizer instance.
# NOTE: `epochs` and `batch` are reassigned right before each fit call further
# down, so the values set here only act as initial defaults.
epochs = 10
batch = 64
lr = 0.0002
adam = optimizers.Adam(lr)
# Negative-side slope used by every LeakyReLU layer.
leaky_relu_alpha =0.01

In [0]:
def _as_series(frame):
    """Reshape a 2-D (rows, columns) frame into a (rows, columns, 1) array."""
    return frame.values.reshape((frame.shape[0], frame.shape[1], 1))


# Split the train/cross-validation pool into training and validation parts.
# NOTE(review): the split is a random shuffle, so neighbouring days of the same
# course can end up on both sides - confirm this is intended for time series data.
X_train, X_valid, Y_train, Y_valid = train_test_split(X, y, test_size=0.3, random_state=5)
print('Train set shape', X_train.shape)
print('Validation set shape', X_valid.shape)

# Every feature column becomes one step of a single-channel sequence.
X_train_series = _as_series(X_train)
X_valid_series = _as_series(X_valid)
print('Train series shape', X_train_series.shape)
print('Validation series shape', X_valid_series.shape)

# Same layout for the whole train/cross-validation pool, the full training
# table and the test table.
X_series = _as_series(model_encoded_train_cross_val_df)
X_full_train = _as_series(model_encoded_train_df)
test_2d = _as_series(model_encoded_test_df)

Train set shape (282860, 257)
Validation set shape (121227, 257)
Train series shape (282860, 257, 1)
Validation series shape (121227, 257, 1)


In [0]:
# Build the network: one recurrent layer followed by a stack of dense layers
# of decreasing width, each followed by a LeakyReLU, and a single linear output.
n_steps, n_channels = X_train_series.shape[1], X_train_series.shape[2]

model_lstm = Sequential()
model_lstm.add(CuDNNLSTM(512, input_shape=(n_steps, n_channels)))
model_lstm.add(LeakyReLU(alpha=leaky_relu_alpha))
for units in (512, 128, 32):
    model_lstm.add(Dense(units, kernel_initializer='normal'))
    model_lstm.add(LeakyReLU(alpha=leaky_relu_alpha))
model_lstm.add(Dense(1))

# Optimise mean squared error; mean squared log error is reported as a metric.
model_lstm.compile(loss='mse', optimizer=adam, metrics=['msle'])
model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
cu_dnnlstm_1 (CuDNNLSTM)     (None, 512)               1054720   
_________________________________________________________________
leaky_re_lu_1 (LeakyReLU)    (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 512)               262656    
_________________________________________________________________
leaky_re_lu_2 (LeakyReLU)    (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               65664     
_________________________________________________________________
leaky_re_lu_3 (LeakyReLU)    (None, 128)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)               

In [0]:
# First training pass: fit on the training part, monitor the validation part.
epochs = 35
batch = 60
# Stop once val_loss has not improved for 5 epochs and restore the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=5,
    verbose=0,
    mode='auto',
    restore_best_weights=True,
)
callback = [early_stopping]
lstm_history = model_lstm.fit(
    X_train_series,
    Y_train,
    validation_data=(X_valid_series, Y_valid),
    epochs=epochs,
    verbose=1,
    batch_size=batch,
    callbacks=callback,
)

In [0]:
# Evaluate (no training happens here) on the holdout rows.
# The result is [loss (mse), msle], matching the compile() settings.
X_holdout_series = X_holdout.values.reshape((X_holdout.shape[0], X_holdout.shape[1], 1))
model_lstm.evaluate(X_holdout_series,y_holdout)



[845.2946488918728, 0.07422793656587601]

In [0]:
# Continue training on the whole train/cross-validation pool (training and
# validation parts together), using the holdout rows as validation data.
# NOTE(review): early stopping now monitors the holdout loss, so holdout
# scores measured after this step are no longer fully independent.
epochs= 10
batch = 128
lstm_history = model_lstm.fit(X_series, model_target_train_cross_val_df.values,
                              validation_data=(X_holdout_series,y_holdout),
                            epochs=epochs, verbose=1, batch_size= batch, callbacks=callback)

In [0]:
# Holdout score ([mse, msle]) after the second training pass.
#X_holdout_series = X_holdout.values.reshape((X_holdout.shape[0], X_holdout.shape[1], 1))
model_lstm.evaluate(X_holdout_series,y_holdout)



[735.5170339287652, 0.06144694611430168]

In [0]:
# Final pass: train on all available training rows.
# NOTE(review): the holdout rows were sliced from the same table that
# X_full_train is built from, so val_loss in this step is measured on data the
# model is also trained on and is not an out-of-sample score.
epochs= 2
batch = 256
lstm_history = model_lstm.fit(X_full_train, model_target_df.values,
                              validation_data=(X_holdout_series,y_holdout),
                            epochs=epochs, verbose=1, batch_size= batch, callbacks=callback)

In [0]:
# Predict on the test sequences, flatten the predictions to 1-D and write
# them next to the IDs as a submission CSV.
save_submission(df_pred_ID, model_lstm.predict(test_2d).flatten(),'LSTM_2d_Time_series_60days_lag_with_long_shhort_promotion')