# Importing all necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense,Dropout
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping

# Reading the dataset

In [2]:
df=pd.read_csv('E-commerce_Shipping_Data.csv')
df

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,A,Ship,4,1,252,5,medium,F,1,1538,1
10995,10996,B,Ship,4,1,232,5,medium,F,6,1247,0
10996,10997,C,Ship,5,4,242,5,low,F,4,1155,0
10997,10998,F,Ship,5,2,223,6,medium,M,2,1210,0


# Checking for null values present in dataset

In [3]:
df.isnull().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
Reached.on.Time_Y.N    0
dtype: int64

# As we can see there are no null values present in the dataset

# Checking for missing values in dataset

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


# As we can see here there are no missing values in the dataset

# Checking for descriptive statisitcs

In [5]:
df.describe()

Unnamed: 0,ID,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
count,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0,10999.0
mean,5500.0,4.054459,2.990545,210.196836,3.567597,13.373216,3634.016729,0.596691
std,3175.28214,1.14149,1.413603,48.063272,1.52286,16.205527,1635.377251,0.490584
min,1.0,2.0,1.0,96.0,2.0,1.0,1001.0,0.0
25%,2750.5,3.0,2.0,169.0,3.0,4.0,1839.5,0.0
50%,5500.0,4.0,3.0,214.0,3.0,7.0,4149.0,1.0
75%,8249.5,5.0,4.0,251.0,4.0,10.0,5050.0,1.0
max,10999.0,7.0,5.0,310.0,10.0,65.0,7846.0,1.0


# Descriptive statistics shows the mean and median values of the columns in dataset. This shows that all the columns have normal skewness except 2 columns which shows left and right skewness but does not affect the output categorical columns

# Converting the categorical data into numerical values

In [6]:
from sklearn.preprocessing import OrdinalEncoder
oe=OrdinalEncoder()
df[['Warehouse_block','Mode_of_Shipment','Product_importance','Gender']]=oe.fit_transform(df[['Warehouse_block','Mode_of_Shipment','Product_importance','Gender']])

In [7]:
df

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,3.0,0.0,4,2,177,3,1.0,0.0,44,1233,1
1,2,4.0,0.0,4,5,216,2,1.0,1.0,59,3088,1
2,3,0.0,0.0,2,2,183,4,1.0,1.0,48,3374,1
3,4,1.0,0.0,3,3,176,4,2.0,1.0,10,1177,1
4,5,2.0,0.0,2,2,184,3,2.0,0.0,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...,...
10994,10995,0.0,2.0,4,1,252,5,2.0,0.0,1,1538,1
10995,10996,1.0,2.0,4,1,232,5,2.0,0.0,6,1247,0
10996,10997,2.0,2.0,5,4,242,5,1.0,0.0,4,1155,0
10997,10998,4.0,2.0,5,2,223,6,2.0,1.0,2,1210,0


# As we can see all the categorical columns have been converted into their numerical form

# Checking the correlation between columns

In [8]:
df.corr().style.background_gradient()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
ID,1.0,7e-05,-0.002459,0.188998,-0.005722,0.196791,0.145369,0.029081,-0.001695,-0.598278,0.278312,-0.411822
Warehouse_block,7e-05,1.0,0.000617,0.014496,0.010169,-0.006679,-0.005262,0.00426,-0.0037,0.009569,0.004086,0.005214
Mode_of_Shipment,-0.002459,0.000617,1.0,-0.020164,0.001679,0.006681,-0.00164,0.004911,-0.011288,0.009364,-0.000797,-0.000535
Customer_care_calls,0.188998,0.014496,-0.020164,1.0,0.012209,0.323182,0.180771,0.006273,0.002545,-0.13075,-0.276615,-0.067126
Customer_rating,-0.005722,0.010169,0.001679,0.012209,1.0,0.00927,0.013179,0.003157,0.002775,-0.003124,-0.001897,0.013119
Cost_of_the_Product,0.196791,-0.006679,0.006681,0.323182,0.00927,1.0,0.123676,0.006366,0.019759,-0.138312,-0.132604,-0.073587
Prior_purchases,0.145369,-0.005262,-0.00164,0.180771,0.013179,0.123676,1.0,0.003662,-0.009395,-0.082769,-0.168213,-0.055515
Product_importance,0.029081,0.00426,0.004911,0.006273,0.003157,0.006366,0.003662,1.0,-0.009865,-0.006251,0.001652,-0.023483
Gender,-0.001695,-0.0037,-0.011288,0.002545,0.002775,0.019759,-0.009395,-0.009865,1.0,-0.011777,0.003573,0.004689
Discount_offered,-0.598278,0.009569,0.009364,-0.13075,-0.003124,-0.138312,-0.082769,-0.006251,-0.011777,1.0,-0.376067,0.397108


# As we can see, only the column Discount_offered is affecting the output target column strongly

# Dropping Unecessary columns

In [9]:
df.drop(['ID'],axis=1,inplace=True)

In [10]:
df

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,3.0,0.0,4,2,177,3,1.0,0.0,44,1233,1
1,4.0,0.0,4,5,216,2,1.0,1.0,59,3088,1
2,0.0,0.0,2,2,183,4,1.0,1.0,48,3374,1
3,1.0,0.0,3,3,176,4,2.0,1.0,10,1177,1
4,2.0,0.0,2,2,184,3,2.0,0.0,46,2484,1
...,...,...,...,...,...,...,...,...,...,...,...
10994,0.0,2.0,4,1,252,5,2.0,0.0,1,1538,1
10995,1.0,2.0,4,1,232,5,2.0,0.0,6,1247,0
10996,2.0,2.0,5,4,242,5,1.0,0.0,4,1155,0
10997,4.0,2.0,5,2,223,6,2.0,1.0,2,1210,0


# Splitting the dataset

In [11]:
x=df.iloc[:,0:-1].values
x

array([[3.000e+00, 0.000e+00, 4.000e+00, ..., 0.000e+00, 4.400e+01,
        1.233e+03],
       [4.000e+00, 0.000e+00, 4.000e+00, ..., 1.000e+00, 5.900e+01,
        3.088e+03],
       [0.000e+00, 0.000e+00, 2.000e+00, ..., 1.000e+00, 4.800e+01,
        3.374e+03],
       ...,
       [2.000e+00, 2.000e+00, 5.000e+00, ..., 0.000e+00, 4.000e+00,
        1.155e+03],
       [4.000e+00, 2.000e+00, 5.000e+00, ..., 1.000e+00, 2.000e+00,
        1.210e+03],
       [3.000e+00, 2.000e+00, 2.000e+00, ..., 0.000e+00, 6.000e+00,
        1.639e+03]])

In [12]:
y=df.iloc[:,-1].values
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

# Data Scaling

In [13]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()
x=sc.fit_transform(x)

# Splitting the dataset into training and testing dataset

In [14]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.3,random_state=0)

In [15]:
xtrain

array([[ 1.11803399,  0.63834175, -0.04771132, ..., -0.99176046,
        -0.57842252, -0.17306685],
       [ 1.11803399,  0.63834175, -0.92379938, ..., -0.99176046,
        -0.64013267,  0.96189107],
       [-1.56534517,  0.63834175, -0.92379938, ...,  1.00830799,
         2.38366454, -1.25176609],
       ...,
       [ 1.11803399, -0.68290796, -0.04771132, ...,  1.00830799,
        -0.64013267,  0.58459094],
       [ 1.11803399, -0.68290796, -0.04771132, ...,  1.00830799,
        -0.70184282, -1.16615504],
       [-1.56534517, -0.68290796, -0.04771132, ..., -0.99176046,
         0.22380939,  0.04952188]])

In [16]:
ytrain

array([1, 1, 1, ..., 0, 1, 1], dtype=int64)

In [17]:
xtest

array([[-1.56534517, -2.00415767,  0.82837675, ...,  1.00830799,
        -0.57842252,  0.49714537],
       [ 1.11803399, -2.00415767,  1.70446482, ..., -0.99176046,
        -0.70184282, -1.41136955],
       [ 1.11803399,  0.63834175, -1.79988745, ...,  1.00830799,
        -0.20816164, -0.13759942],
       ...,
       [ 1.11803399,  0.63834175, -0.92379938, ..., -0.99176046,
        -0.57842252,  0.49408783],
       [ 1.11803399, -0.68290796,  0.82837675, ..., -0.99176046,
        -0.51671238,  1.07868901],
       [-1.56534517,  0.63834175,  0.82837675, ...,  1.00830799,
        -0.57842252, -1.00532857]])

In [18]:
ytest

array([1, 0, 1, ..., 1, 1, 0], dtype=int64)

# Building the model

In [19]:
ann=Sequential()

# Adding the hidden and output Layers

In [30]:
ann.add(Dense(500,activation='relu'))
ann.add(Dropout(rate=0.3))
ann.add(Dense(300,activation='relu'))
ann.add(Dropout(rate=0.2))
ann.add(Dense(100,activation='relu'))
ann.add(Dropout(rate=0.1))
ann.add(Dense(1,activation='sigmoid'))

In [31]:
es=EarlyStopping(monitor='val_loss',patience=25,verbose=1,mode='min')

In [32]:
ann.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

# Training the model

In [33]:
history=ann.fit(xtrain,ytrain,epochs=300,batch_size=100,validation_data=(xtest,ytest),callbacks=[es],verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 29: early stopping


# Making predictions

In [34]:
ypred=ann.predict(xtest)



In [35]:
ypred=np.where(ypred>0.5,1,0)
ypred

array([[1],
       [1],
       [1],
       ...,
       [0],
       [0],
       [1]])

# Checking Accuracy

In [36]:
from sklearn.metrics import classification_report
print(classification_report(ytest,ypred))

              precision    recall  f1-score   support

           0       0.58      0.66      0.62      1379
           1       0.73      0.65      0.69      1921

    accuracy                           0.65      3300
   macro avg       0.65      0.66      0.65      3300
weighted avg       0.66      0.65      0.66      3300



# Accuracy of our model is 65%