In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split as splt
# for handling the outliers
from sklearn.ensemble import IsolationForest
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
from tensorflow.keras.optimizers import Adadelta
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from tensorflow.keras.models import load_model
from tensorflow.keras.models import save_model

## 1 IMPORTING THE DATA

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [29]:
print(train.head())
print(train.tail())

        id  week  checkout_price  base_price  num_orders  101  102  104  106  \
0  1379560     1          136.83      152.29         177    0    0    0    0   
1  1466964     1          136.83      135.83         270    0    0    0    0   
2  1346989     1          134.86      135.86         189    0    0    0    0   
3  1338232     1          339.50      437.53          54    0    0    0    0   
4  1448490     1          243.50      242.50          40    0    0    0    0   

   108  ...  2640  2664  2704  2707  2760  2826  2867  2956  1  1  
0    0  ...     0     0     0     0     0     0     0     0  0  0  
1    0  ...     0     0     0     0     0     0     0     0  0  0  
2    0  ...     0     0     0     0     0     0     0     0  0  0  
3    0  ...     0     0     0     0     0     0     0     0  0  0  
4    0  ...     0     0     0     0     0     0     0     0  0  0  

[5 rows x 133 columns]
             id  week  checkout_price  base_price  num_orders  101  102  104  \
456543 

In [30]:
print(test.head())
print(test.tail())

        id  week  checkout_price  base_price  101  102  104  106  108  109  \
0  1028232   146          158.11      159.11    0    0    0    0    0    0   
1  1127204   146          160.11      159.11    0    0    0    0    0    0   
2  1212707   146          157.14      159.14    0    0    0    0    0    0   
3  1082698   146          162.02      162.02    0    0    0    0    0    0   
4  1400926   146          163.93      163.93    0    0    0    0    0    0   

   ...  2640  2664  2704  2707  2760  2826  2867  2956  1  1  
0  ...     0     0     0     0     0     0     0     0  0  0  
1  ...     0     0     0     0     0     0     0     0  0  0  
2  ...     0     0     0     0     0     0     0     0  0  0  
3  ...     0     0     0     0     0     0     0     0  0  0  
4  ...     0     0     0     0     0     0     0     0  0  0  

[5 rows x 132 columns]
            id  week  checkout_price  base_price  101  102  104  106  108  \
32568  1250239   155          482.09      484.09    

## 2 FEATURE ENGINEERING
- **2.1. CHANGING THE DATA TYPE**

In [5]:
X = train.drop('id,week,num_orders,checkout_price,base_price'.split(','),axis=1)
Y = test.drop('id,week,checkout_price,base_price'.split(','),axis=1)

In [6]:
X = X.astype(str)
Y = Y.astype(str)

In [26]:
print(X.describe())
print(Y.describe())

       checkout_price     base_price            101            102  \
count   423603.000000  423603.000000  423603.000000  423603.000000   
mean       340.704680     361.137211       0.013765       0.011317   
std        153.812665     161.269561       0.116515       0.105779   
min          2.970000      55.350000       0.000000       0.000000   
25%        241.530000     244.500000       0.000000       0.000000   
50%        309.430000     317.220000       0.000000       0.000000   
75%        446.230000     470.510000       0.000000       0.000000   
max        767.330000     767.330000       1.000000       1.000000   

                 104            106            108           109  \
count  423603.000000  423603.000000  423603.000000  423603.00000   
mean        0.013593       0.012944       0.014211       0.01368   
std         0.115794       0.113032       0.118362       0.11616   
min         0.000000       0.000000       0.000000       0.00000   
25%         0.000000       0.

In [8]:
train['center_id,meal_id,emailer_for_promotion,homepage_featured'.split(',')] = X
test['center_id,meal_id,emailer_for_promotion,homepage_featured'.split(',')] = Y

In [27]:
train.describe(include='all')

Unnamed: 0,id,week,checkout_price,base_price,num_orders,101,102,104,106,108,...,2640,2664,2704,2707,2760,2826,2867,2956,1,1.1
count,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,...,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0,423603.0
mean,1250083.0,74.904432,340.70468,361.137211,182.205542,0.013765,0.011317,0.013593,0.012944,0.014211,...,0.025082,0.0214,0.022986,0.019584,0.023878,0.022245,0.019027,0.00783,0.065347,0.087842
std,144348.0,41.587735,153.812665,161.269561,169.519667,0.116515,0.105779,0.115794,0.113032,0.118362,...,0.156376,0.144713,0.149859,0.138567,0.152671,0.147479,0.136621,0.088143,0.247137,0.283065
min,1000000.0,1.0,2.97,55.35,13.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1125012.0,39.0,241.53,244.5,53.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1250096.0,76.0,309.43,317.22,122.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1375126.0,111.0,446.23,470.51,270.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1499999.0,145.0,767.33,767.33,729.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [28]:
test.describe(include='all')

Unnamed: 0,id,week,checkout_price,base_price,101,102,104,106,108,109,...,2640,2664,2704,2707,2760,2826,2867,2956,1,1.1
count,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,...,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0,32573.0
mean,1248476.0,150.477819,341.85444,356.493615,0.01314,0.010438,0.014491,0.012587,0.014951,0.013447,...,0.022565,0.023639,0.022227,0.023455,0.021859,0.023639,0.016179,0.013078,0.066435,0.081356
std,144158.0,2.864072,153.893886,155.150101,0.113875,0.101634,0.119503,0.111486,0.121359,0.115179,...,0.148513,0.151925,0.147423,0.151346,0.146224,0.151925,0.126166,0.113612,0.249045,0.273385
min,1000085.0,146.0,67.9,89.24,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1123969.0,148.0,214.43,243.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1247296.0,150.0,320.13,321.13,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1372971.0,153.0,446.23,455.93,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1499996.0,155.0,1113.62,1112.62,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


- **REMOVING OUTLIERS**

In [11]:
def outlier(data):
    qmax = {}
    qmin = {}
    for i in list(data.columns):
        if(np.issubdtype(data[i].dtype, np.number) == True):
            iqr = data[i].quantile(0.75) - data[i].quantile(0.25)
            qmax[i] = data[i].quantile(0.75) + 1.5*iqr
            qmin[i] = data[i].quantile(0.25) - 1.5*iqr

    for i in list(data.columns):
        if(np.issubdtype(data[i].dtype, np.number) == True):
            data = data.drop(data[(data[i] > qmax[i]) | (data[i] < qmin[i])].index)
    return data

In [12]:
train = outlier(train)
train.shape

(423603, 9)

- **2.2 CREATING DUMMY VARIABLES**

In [13]:
dataset = train
for i in 'center_id,meal_id,emailer_for_promotion,homepage_featured'.split(','):
    data = pd.get_dummies(dataset[i],drop_first=True)
    dataset = pd.concat([dataset,data],axis=1)
train = dataset
train.drop('center_id,meal_id,emailer_for_promotion,homepage_featured'.split(','),axis=1,inplace=True)

In [14]:
dataset = test
for i in 'center_id,meal_id,emailer_for_promotion,homepage_featured'.split(','):
    data = pd.get_dummies(dataset[i],drop_first=True)
    dataset = pd.concat([dataset,data],axis=1)
test = dataset
test.drop('center_id,meal_id,emailer_for_promotion,homepage_featured'.split(','),axis=1,inplace=True)

In [24]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 423603 entries, 0 to 456547
Columns: 133 entries, id to 1
dtypes: float64(2), int64(3), uint8(128)
memory usage: 71.1 MB


In [25]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32573 entries, 0 to 32572
Columns: 132 entries, id to 1
dtypes: float64(2), int64(2), uint8(128)
memory usage: 5.0 MB


In [17]:
new_train = train.drop(['id','week'],axis=1)

## 3 SEPARATING THE INDEPENDENT AND DEPENDENT VARIABLES

In [18]:
X = new_train.drop('num_orders',axis=1)
Y = new_train['num_orders']

In [19]:
# X.head()

In [20]:
new_test = test.drop(['id','week'],axis=1)

In [21]:
# new_test.head()

## 4 FEATURE SCALING

In [22]:
scale = StandardScaler()

In [23]:
data = X
# fitting and transforming the X into feature scaling
X = scale.fit_transform(X)
# converting again into dataframe
X = pd.DataFrame(X,columns=data.columns)
data = new_test
# fitting and transforming the test into feature scaling
new_test = scale.transform(new_test)
# converting again into dataframe
new_test = pd.DataFrame(new_test,columns=data.columns)

MemoryError: Unable to allocate array with shape (130, 423603) and data type float64

In [None]:
X.shape

In [None]:
new_test.shape

In [None]:
print(X.head())
print(new_test.head())

## 5 SPLITTING THE X DATA INTO TRAIN AND TEST

In [None]:
X_train, X_test, Y_train, Y_test = splt(X, Y, test_size=0.9781, random_state=42)

In [None]:
X_train.info()

## 6 SAMPLE MODEL

In [31]:
def build_regressor():
    regressor = Sequential()
    regressor.add(Dense(units=66,kernel_initializer='normal',
                     activation='relu',input_dim=130))
    regressor.add(Dense(units=66,kernel_initializer='normal',
                     activation='relu'))
    regressor.add(Dense(units=66,kernel_initializer='normal',
                     activation='relu'))
    regressor.add(Dense(units=66,kernel_initializer='normal',
                     activation='relu'))
    regressor.add(Dense(units=66,kernel_initializer='normal',
                     activation='relu'))
    regressor.add(Dense(units=1,kernel_initializer='normal',
                     activation='relu'))
    regressor.compile(optimizer=Adadelta(),loss='mean_squared_logarithmic_error')
    return regressor
regressor = KerasRegressor(build_fn= build_regressor,batch_size=10,epochs=150)

In [32]:
regressor.fit(x=X,y=Y)

Train on 423603 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/1

<tensorflow.python.keras.callbacks.History at 0x7f5395679048>

In [33]:
regressor.model.save('model3.h5')

In [None]:
# regressor.fit(x=X_train,y=Y_train)

In [None]:
# parameters = {'batch_size': [10,100],
#              'epochs': [100,500]}

In [None]:
# grid_search = GridSearchCV(estimator=regressor,
#                           param_grid=parameters,
#                           cv=8)

In [None]:
# grid_search = grid_search.fit(X=X_train,y=Y_train)

In [None]:
# batch=100,epochs=200,genpact score=58
msle = cross_val_score(estimator=regressor, X=X_train, y=Y_train, cv=8, n_jobs= -1)

In [None]:
msle

In [None]:
# batch=10,epochs=100,genpact score=
msle2 = cross_val_score(estimator=regressor, X=X_train, y=Y_train, cv=8, n_jobs= -1)

In [None]:
msle2

### With Week And Different Tuning get score 103

In [None]:
Y_pred = regressor.predict(new_test)

In [None]:
Y_pred

In [None]:
# id,num_orders

In [None]:
submission = pd.DataFrame(Y_pred,columns=['num_orders'])

In [None]:
submission = pd.concat([test['id'],submission],axis=1)

In [None]:
submission.info()

In [None]:
submission.head()

In [None]:
test[['id']].head()

In [None]:
test.shape

In [None]:
submission.shape

In [None]:
submission.to_csv('submission.csv',index=False)

### New Predition (Y_pred2)

In [None]:
Y_pred2 = regressor.predict(new_test)

In [None]:
submission2 = pd.DataFrame(Y_pred2,columns=['num_orders'])

In [None]:
submission2 = pd.concat([test['id'],submission2],axis=1)

In [None]:
submission2.describe()

In [None]:
submission2.head()

In [None]:
submission2.to_csv('submission2.csv',index=False)

## How To Save And Reload Model

- **SAVING THE MODEL**

In [None]:
# My model name is regressor and I am using model.save function to the model
# in HDF5 format.
regressor.model.save('model.h5')

- **RELOADING THE MODEL**

In [None]:
# Reassigning the model after loading the model
model = load_model('model.h5')

In [None]:
# with batch=10, epochs=100
regressor.model.save('model2.h5')

In [None]:
Y_pred = regressor.predict(new_test)

In [None]:
Y_pred

In [None]:
model2 = load_model('model2.h5')

In [None]:
Y_pred2 = model2.predict(new_test)

In [None]:
Y_pred2