# Data Mounting and Importing

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd
final_df_total = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/EC Infosolutions Challenge/New_Restaurant.csv',parse_dates=['dts'])


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from tensorflow.keras.models import load_model

import time

%matplotlib inline

  import pandas.util.testing as tm


## Encoding

In [4]:
from sklearn import linear_model
import sklearn.metrics as sklm
from sklearn.metrics import mean_squared_error as rmse
import numpy as np
import numpy.random as nr
import scipy.stats as ss
import math

In [5]:
print(final_df_total['shift'].unique())
Final_Features = final_df_total['shift']
enc = preprocessing.LabelEncoder()
enc.fit(Final_Features)
Final_Features = enc.transform(Final_Features)
print(Final_Features)

['lunch' 'dinner']
[1 1 1 ... 0 0 0]


In [6]:
ohe = preprocessing.OneHotEncoder()
encoded = ohe.fit(Final_Features.reshape(-1,1))
print(Final_Features.reshape(-1,1))
Final_Features = encoded.transform(Final_Features.reshape(-1,1)).toarray()
Final_Features[:10,:]

[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [7]:
def encode_string(cat_feature):
    ## First encode the strings to numeric categories
    enc = preprocessing.LabelEncoder()
    enc.fit(cat_feature)
    enc_cat_feature = enc.transform(cat_feature)
    ## Now, apply one hot encoding
    ohe = preprocessing.OneHotEncoder()
    encoded = ohe.fit(enc_cat_feature.reshape(-1,1))
    return encoded.transform(enc_cat_feature.reshape(-1,1)).toarray()
    

categorical_columns = ['weekday', 'party_size','menu_item']

for col in categorical_columns:
    temp = encode_string(final_df_total[col])
    Final_Features = np.concatenate([Final_Features, temp], axis = 1)
    #print(Features)

print(Final_Features.shape)
print(Final_Features[:2, :])    

(60007, 57)
[[0. 1. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [8]:
Final_Features = np.concatenate([Final_Features, np.array(final_df_total[["item_price"]])], axis = 1)
Final_Features[:2,:]

array([[ 0. ,  1. ,  0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  3.5],
       [ 0. ,  1. ,  0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  1. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  1. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,  0. ,
         0. ,  0. , 16. ]])

## Scaling

In [9]:
scaler = preprocessing.StandardScaler().fit(Final_Features[:,57:])
Final_Features[:,57:] = scaler.transform(Final_Features[:,57:])
print(Final_Features.shape)
Final_Features[:57,:]

(60007, 58)


array([[ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        , -1.19839166],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.91234308],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        ,  0.7434843 ],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        , -0.43852715],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        , -1.32503575],
       [ 0.        ,  1.        ,  0.        , ...,  0.        ,
         0.        , -0.94510349]])

## Train Test Splitting

In [10]:
from sklearn.model_selection import train_test_split
nr.seed(9922)
labels = np.array(final_df_total['item_qty'])
indx = range(Final_Features.shape[0])
indx = train_test_split(indx, test_size = 0.25)
# print(indx)
x_train = Final_Features[indx[0],:]
y_train = np.ravel(labels[indx[0]])
x_test = Final_Features[indx[1],:]
y_test = np.ravel(labels[indx[1]])

Function to print metrics

In [11]:
def print_metrics(y_true, y_predicted, n_parameters):
    ## First compute R^2 and the adjusted R^2
    r2 = sklm.r2_score(y_true, y_predicted)
    r2_adj = r2 - (n_parameters - 1)/(y_true.shape[0] - n_parameters) * (1 - r2)
    
    
    ## Print the usual metrics and the R^2 values
    print('Mean Square Error      = ' + str(sklm.mean_squared_error(y_true, y_predicted)))
    print('Root Mean Square Error = ' + str(math.sqrt(sklm.mean_squared_error(y_true, y_predicted))))
    print('Mean Absolute Error    = ' + str(sklm.mean_absolute_error(y_true, y_predicted)))
    print('Median Absolute Error  = ' + str(sklm.median_absolute_error(y_true, y_predicted)))
    print('R^2                    = ' + str(r2))
    print('Adjusted R^2           = ' + str(r2_adj))
    
    return sklm.mean_squared_error(y_true, y_predicted), r2
   


# Applying LSTM

In [12]:
x_train.shape

(45005, 58)

In [13]:
X_train = x_train.reshape((x_train.shape[0], 1, x_train.shape[1]))
X_test = x_test.reshape((x_test.shape[0], 1, x_test.shape[1]))
X_train.shape

(45005, 1, 58)

In [14]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Dense

model_results = []

def LSTM_model(X_train, y_train, X_test, y_test,batch_size_= 12, dropout_=.3,epochs_=10, samples=24):
    

    model = Sequential()                                                            
    model.add(LSTM(int(samples), input_shape=(X_train.shape[1], X_train.shape[2]))) 
                                                     
    model.add(Dense(1))                                                             
    model.compile(loss='mean_squared_error', optimizer='adam')                      
                                                                                    
    # fit network                                                                   
    history = model.fit(X_train,                                                    
                        y_train,                                                    
                        epochs=epochs_,                                             
                        batch_size=batch_size_,                                     
                        validation_data=(X_test, y_test),                           
                        verbose=1,                                                  
                        shuffle=False)                                              
                                                                                    
    # Save model for later                                                          
    #filename = str(int(time.time())) + '_model_' + str(samples) + '_lag.h5'         
    #model.save('./models/' + filename)                                              
                                                                                    
    #################################################################################

    # uncomment to load 
    #model = load_model('./models/1539911055_model_672_lag.h5')

    
    
    y_score = model.predict(X_test) 
    y_score = [x[0] if x > 0 else 0 for x in y_score]
    round_score = [round(num) for num in y_score]
    (rmse, r2) = print_metrics(y_test, y_score, 28)
    
       
    
    # Save all the results
    
    model_results_dict = {}

    model_results_dict['Algo'] = 'LSTM'
    model_results_dict['lag'] = samples
    model_results_dict['dropout'] = dropout_
    model_results_dict['epochs'] = epochs_
    model_results_dict['batch_size'] = batch_size_
    model_results_dict['loss'] = history.history
    model_results_dict['rmse'] = rmse
    #model_results_dict['power_gen_rmse'] = math.sqrt(rmse)
    model_results_dict['r2'] = r2
    #model_results_dict['model_filename'] = filename
    model_results_dict['time_ran'] = int(time.time())
    model_results.append(model_results_dict)
    
    return model

model = LSTM_model(X_train,y_train, X_test,y_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Square Error      = 0.057538602286503544
Root Mean Square Error = 0.23987205399233888
Mean Absolute Error    = 0.12056055920884667
Median Absolute Error  = 0.02922821044921875
R^2                    = 0.8448823983969891
Adjusted R^2           = 0.844602701906854


In [15]:
batch_size_ = 12
dropout_ =0.33
epochs_ = 10
samples = 168

LSTM_model(X_train,y_train, X_test,y_test, batch_size_, dropout_, epochs_, samples)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Mean Square Error      = 0.0578067430159237
Root Mean Square Error = 0.2404303288188154
Mean Absolute Error    = 0.12211708126743863
Median Absolute Error  = 0.030082285404205322
R^2                    = 0.8441595211426438
Adjusted R^2           = 0.8438785212141579


<tensorflow.python.keras.engine.sequential.Sequential at 0x7fe4e192df60>

In [16]:
model_results

[{'Algo': 'LSTM',
  'batch_size': 12,
  'dropout': 0.3,
  'epochs': 10,
  'lag': 24,
  'loss': {'loss': [0.14819280803203583,
    0.059468332678079605,
    0.05881808325648308,
    0.05861392989754677,
    0.058465514332056046,
    0.05834463611245155,
    0.05824235826730728,
    0.05815388634800911,
    0.058076027780771255,
    0.05800662934780121],
   'val_loss': [0.06017571687698364,
    0.058432240039110184,
    0.058231376111507416,
    0.058085180819034576,
    0.05796298012137413,
    0.05785628780722618,
    0.05776207149028778,
    0.05767849460244179,
    0.05760449543595314,
    0.05753860995173454]},
  'r2': 0.8448823983969891,
  'rmse': 0.057538602286503544,
  'time_ran': 1600413020},
 {'Algo': 'LSTM',
  'batch_size': 12,
  'dropout': 0.33,
  'epochs': 10,
  'lag': 168,
  'loss': {'loss': [0.11336004734039307,
    0.060712940990924835,
    0.059943463653326035,
    0.05946462228894234,
    0.05912057310342789,
    0.058862052857875824,
    0.058654457330703735,
    0.058

In [17]:
new_res_df = pd.DataFrame(model_results)
# old_res_df = pd.read_csv('result.csv', index_col=0)
# res_df = pd.concat([old_res_df, new_res_df], axis=0, sort=False).reset_index(drop=True)
new_res_df.to_csv('result.csv')

In [18]:
models_by_r2 = new_res_df.sort_values(by='r2', ascending=False)
models_by_r2.head()

Unnamed: 0,Algo,lag,dropout,epochs,batch_size,loss,rmse,r2,time_ran
0,LSTM,24,0.3,10,12,"{'loss': [0.14819280803203583, 0.0594683326780...",0.057539,0.844882,1600413020
1,LSTM,168,0.33,10,12,"{'loss': [0.11336004734039307, 0.0607129409909...",0.057807,0.84416,1600413135


In [19]:
start_date = '2019-07-01'
end_date = '2019-07-07'
mask = (final_df_total.dts >= start_date) & (final_df_total.dts <= end_date)
df_mask = final_df_total.loc[mask]
df_mask

Unnamed: 0,dts,shift,weekday,ticket_code,party_size,menu_category,menu_item,item_price,item_qty
2076,2019-07-01 11:30:00,lunch,1,YALE0019,5,Starter,GOBI MANCHURIAN,14.0,2
2077,2019-07-01 11:30:00,lunch,1,YALE0016,3,VEGETABLE SPECIALS,BAINGAN BARTHA,16.0,1
2078,2019-07-01 11:35:00,lunch,1,YALE0019,5,BREADS,GARLIC NAAN,3.5,1
2079,2019-07-01 11:35:00,lunch,1,GRHUB002,1,RICE SPECIALS,CHICKEN BIRYANI,19.0,2
2080,2019-07-01 11:35:00,lunch,1,YALE0013,3,DESSERTS,RASMALAI,6.0,1
...,...,...,...,...,...,...,...,...,...
52034,2019-07-06 22:00:00,dinner,5,YALE0006,3,VEGETABLE SPECIALS,SHAHI PANEER,16.0,1
52035,2019-07-06 22:00:00,dinner,5,YALE0014,5,DESSERTS,CARROT HALWA,7.0,1
52036,2019-07-06 22:00:00,dinner,5,GRHUB002,1,BREADS,GARLIC NAAN,3.5,2
52037,2019-07-06 22:00:00,dinner,5,YALE0019,5,BREADS,GARLIC NAAN,3.5,2


# Final Output

In [20]:
pvt = df_mask.pivot_table(values='item_qty', index='menu_item', columns='shift',aggfunc=sum)
pvt.sort_values(by=['dinner','lunch'], ascending=False)

shift,dinner,lunch
menu_item,Unnamed: 1_level_1,Unnamed: 2_level_1
GARLIC NAAN,179,150
BUTTER CHICKEN,117,46
NAAN,94,58
RICE,77,36
TANDOORI ROTI,59,28
MALAI KOFTA,58,44
FISH CURRY,53,46
LACHA PARATHA,53,41
ONION KULCHA,52,39
RASMALAI,49,35


So, this is the above final result of predicted menu_item and item_quantity with top selling menu's as Garlic naan,  Butten Chicken, naan, Rice etc with respective dinner sales as 179,117,94,77 and lunch sales 21, 17, 21, 18 etc