### Importing basic libraries

In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

### Let's import the dataset and do some preprocessing

In [70]:
# Read the raw sales records from the CSV file into `df`, then preview the first rows.
df = pd.read_csv(filepath_or_buffer='sales_data.csv')
df.head(n=5)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,253,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,252,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,251,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,250,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


In [71]:
# Count the missing values in each column.
df.isna().sum()

ID               0
Store_id         0
Store_Type       0
Location_Type    0
Region_Code      0
Date             0
Holiday          0
Discount         0
#Order           0
Sales            0
dtype: int64

In [72]:
# (number of rows, number of columns)
df.shape

(188340, 10)

In [73]:
# How many rows does each Store_id contribute?
df['Store_id'].value_counts()

Store_id
364    516
1      516
253    516
252    516
251    516
      ... 
238    516
239    516
240    516
241    516
242    516
Name: count, Length: 365, dtype: int64

In [74]:
# How many times does each ID occur?
df['ID'].value_counts()

ID
T1188340    1
T1000001    1
T1000002    1
T1000003    1
T1188324    1
           ..
T1000009    1
T1000008    1
T1000007    1
T1000006    1
T1000005    1
Name: count, Length: 188340, dtype: int64

### `ID` is unique for every row, but `Store_id` is not (each store appears 516 times), so let's drop `Store_id`

In [75]:
# Remove the Store_id column by rebinding `df` instead of mutating it in place.
# errors='ignore' makes the cell safe to re-run: once the column is gone,
# a second execution is a no-op rather than a KeyError.
df = df.drop(columns='Store_id', errors='ignore')
df.head()

Unnamed: 0,ID,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,S1,L3,R1,2018-01-01,1,Yes,9,7011.84
1,T1000002,S4,L2,R1,2018-01-01,1,Yes,60,51789.12
2,T1000003,S3,L2,R1,2018-01-01,1,Yes,42,36868.2
3,T1000004,S2,L3,R1,2018-01-01,1,Yes,23,19715.16
4,T1000005,S2,L3,R4,2018-01-01,1,Yes,62,45614.52


## Univariate analysis

In [76]:
# Number of rows for each store type
df['Store_Type'].value_counts()

Store_Type
S1    88752
S4    45924
S2    28896
S3    24768
Name: count, dtype: int64

### There are 4 store types. Let's see what percentage of the rows each one accounts for

In [77]:
import plotly.express as px

# Share of rows per store type, drawn as a pie chart.
pie1 = df['Store_Type'].value_counts()
store, number = pie1.index, pie1.values

fig = px.pie(data_frame=pie1, names=store, values=number)
fig.show()

#### High percentage of S1 type stores

In [78]:
# Number of rows for each location type

df['Location_Type'].value_counts()

Location_Type
L1    85140
L2    48504
L3    29928
L5    13932
L4    10836
Name: count, dtype: int64

In [79]:
# Share of rows per location type, drawn as a pie chart.
pie2 = df['Location_Type'].value_counts()
location, number = pie2.index, pie2.values

fig = px.pie(data_frame=pie2, names=location, values=number)
fig.show()

#### A large share of the records (about 45%) are at location type L1

In [80]:
# Number of rows for each region code

df['Region_Code'].value_counts()

Region_Code
R1    63984
R2    54180
R3    44376
R4    25800
Name: count, dtype: int64

In [81]:
# Share of rows per region code, drawn as a pie chart.
pie3 = df['Region_Code'].value_counts()
region, number = pie3.index, pie3.values

fig = px.pie(data_frame=pie3, names=region, values=number)
fig.show()

In [82]:
# Data type of each column
df.dtypes

ID                object
Store_Type        object
Location_Type     object
Region_Code       object
Date              object
Holiday            int64
Discount          object
#Order             int64
Sales            float64
dtype: object

### `Date` is stored as an object (string) column. Let's convert it to datetime

In [83]:
# Parse the Date column into pandas datetime values.
df['Date'] = df['Date'].pipe(pd.to_datetime)

### Let's now check the distribution of discounts

In [84]:
# Share of rows with and without a discount, drawn as a pie chart.
# The labels and sizes must come from the Discount counts (`pie4`); reading
# them from the Region_Code counts would plot region shares instead.
pie4 = df['Discount'].value_counts()
discount = pie4.index
number = pie4.values

fig = px.pie(pie4, values=number, names=discount)
fig.show()

### Checking the holidays section

In [85]:
# Share of rows per Holiday flag value, drawn as a pie chart.
pie5 = df['Holiday'].value_counts()
holiday, number = pie5.index, pie5.values

fig = px.pie(data_frame=pie5, names=holiday, values=number)
fig.show()

### Modifying some columns

In [86]:
# Integer code assigned to each label of the categorical columns.
category_codes = {
    'Discount': {'No': 0, 'Yes': 1},
    'Store_Type': {'S1': 1, 'S2': 2, 'S3': 3, 'S4': 4},
    'Location_Type': {'L1': 1, 'L2': 2, 'L3': 3, 'L4': 4, 'L5': 5},
    'Region_Code': {'R1': 1, 'R2': 2, 'R3': 3, 'R4': 4},
}

for column, codes in category_codes.items():
    # Series.map turns every value that is not a key of the dict into NaN, so
    # running this cell a second time (when the column already holds integers)
    # would wipe the column. Skip columns that are already numeric.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    encoded = df[column].map(codes)
    # Fail loudly on a label with no code instead of silently producing NaN.
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), column].astype(str).unique())
        raise ValueError(f"Unmapped values in {column!r}: {unknown}")
    df[column] = encoded.astype('int64')

df.head()

Unnamed: 0,ID,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,T1000001,1,3,1,2018-01-01,1,1,9,7011.84
1,T1000002,4,2,1,2018-01-01,1,1,60,51789.12
2,T1000003,3,2,1,2018-01-01,1,1,42,36868.2
3,T1000004,2,3,1,2018-01-01,1,1,23,19715.16
4,T1000005,2,3,4,2018-01-01,1,1,62,45614.52


## Now we start creating the x and y variables along with the train test split

In [87]:
# Feature matrix: the five encoded columns, in this order. Target: order count.
feature_columns = ['Store_Type', 'Location_Type', 'Region_Code', 'Holiday', 'Discount']
x = np.array(df.loc[:, feature_columns])
y = np.array(df.loc[:, '#Order'])

In [88]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x, y, random_state=42, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts
y_test

array([ 54, 111,  59, ...,  40,  69,  68])

In [89]:
# Fit a LightGBM regressor (default hyperparameters) on the training split,
# then predict the number of orders for the held-out rows.

import lightgbm as lgbm

# fit() returns the fitted estimator, so construction and training can be chained.
model = lgbm.LGBMRegressor().fit(x_train, y_train)
y_pred = model.predict(x_test)
y_pred

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003872 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 20
[LightGBM] [Info] Number of data points in the train set: 150672, number of used features: 5
[LightGBM] [Info] Start training from score 68.163401



X does not have valid feature names, but LGBMRegressor was fitted with feature names



array([ 50.19023435, 104.21578729,  68.08376523, ...,  45.33995367,
        62.53550356,  84.79858358])

## Let's now evaluate the model using Median Absolute Error

In [90]:
from sklearn.metrics import median_absolute_error

# Median of |actual - predicted| over the test set.
error = median_absolute_error(y_true=y_test, y_pred=y_pred)
error

np.float64(9.241701185473417)

#### So for half of the test rows, the predicted number of orders is within about 9 of the real value (this is the median absolute error, not an average)