# **SALES PREDICTION**

Saurabh Kailas (210905033)

Varun Reddy (210905131)

Suhas Reddy (210905075)

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm

from sklearn.preprocessing import MinMaxScaler

import pickle
from os import path

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

from keras.models import Sequential
from keras.layers import Dense

ModuleNotFoundError: No module named 'xgboost'

# **DATA PREPROCESSING**

**1. Importing Datasets**

In [None]:
# Load the three raw CSV inputs, in order: the sales records, the dataset with
# stores information, and the dataset with additional data.
data, stores, features = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'stores.csv', 'features.csv')
)

In [None]:
data.shape

In [None]:
data.head()

In [None]:
stores.shape

In [None]:
stores.head()

In [None]:
features.shape

In [None]:
features.tail()

**2. Handling Missing Values**

In [None]:
# CPI (Consumer Price Index) tracks the average change over time in the prices
# consumers pay for a basket of goods and services. Missing CPI entries are
# replaced with the median of the column.
# The filled column is assigned back instead of calling fillna(inplace=True)
# on the column selection: the in-place form is a chained assignment that may
# operate on a temporary copy and leave `features` unchanged.
features["CPI"] = features["CPI"].fillna(features["CPI"].median())

In [None]:
# Missing Unemployment (rate) entries are replaced with the median of the column.
# The result is assigned back rather than filled with inplace=True on the
# column selection, which is a chained assignment and may act on a temporary copy.
features["Unemployment"] = features["Unemployment"].fillna(features["Unemployment"].median())

In [None]:
# Clean MarkDown1..MarkDown5: negative values are raised to 0 and missing
# values are replaced with 0.
# clip() does the clamping in one vectorised call (NaN passes through it
# untouched and is then filled), and the cleaned column is assigned back
# instead of using fillna(inplace=True) on a column selection, which is a
# chained assignment that may act on a temporary copy.
for i in range(1,6):
  markdown_col = "MarkDown"+str(i)
  features[markdown_col] = features[markdown_col].clip(lower=0).fillna(0)

**3. Merge the Datasets**

In [None]:
# Left-join the store information onto the sales rows (key: Store), then the
# additional features (keys: Store and Date). Every sales row is kept.
data = (
    data
    .merge(stores, how='left', on='Store')
    .merge(features, how='left', on=['Store', 'Date'])
)

In [None]:
# Convert the Date column into datetime objects, then order the entries by date.
data = data.assign(Date=pd.to_datetime(data['Date'])).sort_values(by=['Date'])

In [None]:
# The Date attribute is used as index to identify the entries (the Date column
# itself is kept as well).
data.set_index(data['Date'], inplace=True)

# The merge with `features` produced two holiday flags: IsHoliday_x (left side)
# and IsHoliday_y (from `features`). Check row by row that they agree before
# discarding one of them. Rows with no value on the features side are skipped
# by the check.
holiday_mismatch = data['IsHoliday_x'].ne(data['IsHoliday_y']) & data['IsHoliday_y'].notna()
if holiday_mismatch.any():
  raise ValueError(
      f"IsHoliday_x and IsHoliday_y disagree in {int(holiday_mismatch.sum())} rows"
  )
data.drop(columns='IsHoliday_x',inplace=True)
data.rename(columns={"IsHoliday_y" : "IsHoliday"}, inplace=True)

In [None]:
data.head()

**4. Split the Date Attribute**

In [None]:
# Derive calendar features from the Date column: year, month number and ISO week number.
date_parts = data['Date'].dt
data['Year'] = date_parts.year
data['Month'] = date_parts.month
data['Week'] = date_parts.isocalendar().week

In [None]:
data.head()

**5. Outlier Detection**

In [None]:
# We group on the columns Store and Dept and then take the Weekly_Sales column in each group.
# We apply an aggregate function on this column in order to find the maximum, minimum, mean, median,
# and standard deviation values. We then count the missing values in each aggregated column.

In [None]:
# Per-(Store, Dept) summary statistics of Weekly_Sales, followed by a count of
# missing values in each resulting column.
sales_by_store_dept = data.groupby(['Store', 'Dept'])['Weekly_Sales']
agg_data = sales_by_store_dept.agg(['max', 'min', 'mean', 'median', 'std']).reset_index()
agg_data.isna().sum()

In [None]:
# We now add the previously calculated aggregated data to the original dataframe.
# Then, we remove all rows that still have missing values.

In [None]:
# Attach the per-(Store, Dept) statistics to every sales row, then discard any
# row that still contains a missing value.
merged = data.merge(agg_data, how='left', on=['Store', 'Dept'])
data = merged.dropna().copy()
del merged

# Re-establish chronological order and the Date index on the merged frame.
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values(by=['Date'])
data = data.set_index(data['Date'])
data.head()

In [None]:
# Now, we create a new column which contains the sum of the corresponding MarkDowns.
# Markdowns represent a reduction in the price of the products. This column contains the total reductions
# for the corresponding dates. We then drop the individual MarkDown columns.

In [None]:
# Add MarkDown1..MarkDown5 together (left to right) into Total_MarkDown, then
# drop the individual MarkDown columns.
markdown_cols = ['MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
total_markdown = data[markdown_cols[0]]
for markdown_name in markdown_cols[1:]:
  total_markdown = total_markdown + data[markdown_name]
data['Total_MarkDown'] = total_markdown
data.drop(columns=markdown_cols, inplace=True)

In [3]:
# Now, we remove outliers using the Z-score: a row is dropped if any of its numeric values lies 2.5 or more standard deviations from that column's mean.

In [4]:
numeric_col = [
    'Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price',
    'CPI', 'Unemployment', 'Total_MarkDown',
]
data_numeric = data[numeric_col].copy()

# Keep a row only when every numeric column has an absolute Z-score below 2.5.
abs_z_scores = np.abs(stats.zscore(data_numeric))
inlier_rows = (abs_z_scores < 2.5).all(axis=1)
data = data[inlier_rows]
data.shape

NameError: name 'data' is not defined

In [None]:
# Keep only the rows whose weekly sales value is zero or positive.
non_negative_sales = data['Weekly_Sales'] >= 0
data = data[non_negative_sales]

In [None]:
# We convert the holiday column into numeric values.
# assign() returns a new frame, which avoids the SettingWithCopyWarning that a
# direct column assignment raises on a frame produced by boolean filtering.
data = data.assign(IsHoliday=data['IsHoliday'].astype('int'))

In [None]:
data.head()

In [None]:
# Persist the cleaned, preprocessed frame (index included) as CSV.
data.to_csv(path_or_buf='./preprocessed_dataset.csv')

# **Data Visualisation**

**1. Average Monthly Sales**

In [None]:
# Bar plot of Weekly_Sales per month (seaborn's barplot shows the mean per category by default).
plt.figure(figsize=(14,8))
sns.barplot(x='Month',y='Weekly_Sales',data=data)
plt.ylabel('Sales',fontsize=14)
plt.xlabel('Months',fontsize=14)
plt.title('Average Monthly Sales',fontsize=16)
# Draw the grid before saving so that it also appears in the saved image.
plt.grid()
plt.savefig('avg_monthly_sales.png')

**2. Monthly Sales per Year**

In [None]:
# Total Weekly_Sales for every (Year, Month) pair: rows are years, columns are months.
data_monthly = pd.crosstab(data["Year"], data["Month"], values=data["Weekly_Sales"],aggfunc='sum')

# One subplot per month (1..12), each showing that month's total sales across the years.
fig, axes = plt.subplots(3,4,figsize=(16,8))
fig.suptitle('Monthly Sales for each Year', fontsize=18)
fig.subplots_adjust(wspace=0.4,hspace=0.32)   # spacing only needs to be set once
for month, ax in enumerate(axes.flat, start=1):
    sns.lineplot(ax=ax,data=data_monthly[month])
    # Label the subplot that was just drawn; plt.xlabel/plt.ylabel would only
    # affect the figure's current axes, not this one.
    ax.set_ylabel(str(month),fontsize=12)
    ax.set_xlabel('Years',fontsize=12)

plt.savefig('monthly_sales_every_year.png')
plt.show()

# **Time Series Decompose**

In [None]:
# We break the dataset into 3 components.
# The Trend Component shows how the data increases or decreases over time.
# The Seasonal Component represents the repeating patterns that appear consistently over the same time intervals.
# The Residual Component represents the fluctuations in the data not belonging to the trend or seasonal patterns.

In [None]:
# Resample Weekly_Sales to month-start means, decompose the series additively,
# then plot and save the decomposition.
monthly_mean_sales = data['Weekly_Sales'].resample('MS').mean()
decomposition = sm.tsa.seasonal_decompose(monthly_mean_sales, model='additive')
decomposition.plot()
plt.savefig('seasonal_decompose.png')
plt.show()

# **One-Hot Encoding**

One-Hot Encoding is used to convert categorical values into numerical values. Each category is represented using a binary variable. If True or 1, then the corresponding category is present, else it is not present.

In [None]:
# Pull the categorical columns out into their own frame (an independent copy).
cat_col = ['Store','Dept','Type']
data_cat = data.loc[:, cat_col].copy()

In [None]:
data_cat.head()

In [None]:
# One-hot encode each categorical column into one indicator column per category.
data_cat = pd.get_dummies(data=data_cat, columns=cat_col)

In [None]:
# Cast the indicator columns to integers.
data_cat = data_cat.astype(dtype=int)

In [None]:
data_cat.head()

In [None]:
# Append the one-hot columns, then remove the original categorical columns and
# the Date column.
data = pd.concat([data, data_cat], axis=1)
data = data.drop(columns=cat_col).drop(columns=['Date'])

In [None]:
data.head()

# **Data Normalisation**

In [None]:
# Columns to be min-max normalised: the numeric measurements plus the
# per-(Store, Dept) sales statistics.
num_col = [
    'Weekly_Sales', 'Size', 'Temperature', 'Fuel_Price', 'CPI',
    'Unemployment', 'Total_MarkDown',
    'max', 'min', 'mean', 'median', 'std',
]

In [None]:
# We normalise the data using MinMax Scaling.
# We fit all the values in the range of 0 to 1.

In [None]:
minmax_scale = MinMaxScaler(feature_range=(0, 1))
def normalization(df,col):
  """Min-max scale the given columns of ``df`` into the range [0, 1].

  Args:
    df: DataFrame to scale. It is modified in place and also returned.
    col: Iterable of column names to scale.

  Returns:
    The same DataFrame, with each listed column rescaled independently.
  """
  columns = list(col)
  if not columns:
    # Nothing to scale; fitting a scaler on zero features would raise.
    return df
  # MinMaxScaler scales each feature independently, so one fit over all the
  # columns gives the same values as fitting column by column. It also leaves
  # `minmax_scale` holding the parameters of every column (not just the last
  # one), which is needed to inverse-transform values later.
  df[columns] = minmax_scale.fit_transform(df[columns].to_numpy())
  return df

In [None]:
# Normalise a copy of the data so the un-normalised frame is not mutated.
# NOTE(review): the scaler is fitted on the whole dataset here, before the
# train/test split, so the rows that end up in the test set influence the
# scaling. num_col also contains the target Weekly_Sales, so the error metrics
# reported later are in scaled units.
data = normalization(data.copy(),num_col)

In [None]:
data.head()

# **Split the Data into Training and Testing Sets**

In [None]:
# Separate the target (Weekly_Sales) from the feature columns.
Y = data['Weekly_Sales']
X = data.drop(columns=['Weekly_Sales'])

In [None]:
# Hold out 20% of the rows for testing. Fixing random_state makes the split
# reproducible: every run yields the same training and testing sets.
split_parts = train_test_split(X, Y, random_state=50, test_size=0.20)
X_train, X_test, y_train, y_test = split_parts

X : Independent variables (all feature columns).

Y : Dependent variable (the target, Weekly_Sales).

X_train : Subset of X for training the model.

X_test : Subset of X for testing the model.

y_train : Subset of Y for training the model.

y_test : Subset of Y for testing the model.

# **Linear Regression Model**

In [None]:
# The fit method finds the regression coefficients that best fit the training
# data: they are chosen to minimise the sum of squared differences between the
# observed target values and the predicted values.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# score() on a scikit-learn regressor returns the coefficient of determination
# (R^2); it is expressed here as a percentage.
lr_acc = 100 * lr.score(X_test, y_test)
print("Linear Regressor Accuracy - ", lr_acc)

In [None]:
# Make predictions for the X_test data with the fitted linear model.
# y_pred is overwritten by each model section in turn.
y_pred = lr.predict(X_test)

In [None]:
# Error metrics on the test set. MSE is computed once and reused for RMSE.
# "R2" is the coefficient of determination (r2_score); explained_variance_score
# is a different metric that ignores a constant bias in the predictions.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
# Compare predictions with the real values on the first 200 test rows.
sample_features = X_test[:200]
sample_targets = y_test[:200].values

plt.figure(figsize=(20, 8))
plt.title("Linear Regression Model Predictions")
plt.plot(lr.predict(sample_features), color='blue', linewidth=2.0, label="prediction")
plt.plot(sample_targets, color='lightcoral', linewidth=2.0, label="real_values")
plt.legend(loc="best")
plt.savefig('lr_real_pred.png')
plt.show()

# **XGBoost Model**

In [None]:
# Gradient-boosted tree regressor with default hyper-parameters, fitted on the training split.
xgbr = XGBRegressor()
xgbr.fit(X=X_train, y=y_train)

In [None]:
# Test-set score of the XGBoost model, expressed as a percentage.
xgb_acc = 100 * xgbr.score(X_test, y_test)
print("XGBoost Regressor Accuracy - ", xgb_acc)

In [None]:
# Make predictions for the X_test data with the fitted XGBoost model
# (replaces the y_pred left by the previous model).
y_pred = xgbr.predict(X_test)

In [None]:
# Error metrics on the test set. MSE is computed once and reused for RMSE.
# "R2" is the coefficient of determination (r2_score); explained_variance_score
# is a different metric that ignores a constant bias in the predictions.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
# Compare predictions with the real values on the first 200 test rows.
sample_features = X_test[:200]
sample_targets = y_test[:200].values

plt.figure(figsize=(20, 8))
plt.title("XGBoost Predictions")
plt.plot(xgbr.predict(sample_features), color='blue', linewidth=2.0, label="prediction")
plt.plot(sample_targets, color='lightcoral', linewidth=2.0, label="real_values")
plt.legend(loc="best")
plt.savefig('xgb_real_pred.png')
plt.show()

# **K-Nearest Neighbours Model**

In [None]:
# Nearest-neighbour regressor: each prediction uses the single closest training
# point, with uniform weighting.
knn = KNeighborsRegressor(weights='uniform', n_neighbors=1)
knn.fit(X=X_train, y=y_train)

In [None]:
# Test-set score of the KNN model, expressed as a percentage.
knn_acc = 100 * knn.score(X_test, y_test)
print("KNN Regressor Accuracy - ", knn_acc)

In [None]:
# Make predictions for the X_test data with the fitted KNN model
# (replaces the y_pred left by the previous model).
y_pred = knn.predict(X_test)

In [None]:
# Error metrics on the test set. MSE is computed once and reused for RMSE.
# "R2" is the coefficient of determination (r2_score); explained_variance_score
# is a different metric that ignores a constant bias in the predictions.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
# Compare predictions with the real values on the first 200 test rows.
plt.figure(figsize=(20,8))
# Title added so this figure matches the other model prediction plots.
plt.title("K-Nearest Neighbours Predictions")
plt.plot(knn.predict(X_test[:200]), label="prediction", linewidth=2.0,color='blue')
plt.plot(y_test[:200].values, label="real_values", linewidth=2.0,color='lightcoral')
plt.legend(loc="best")
plt.savefig('knn_real_pred.png')
plt.show()

# **Random Forest**

In [None]:
# Random forest regressor with default hyper-parameters, fitted on the training split.
rf = RandomForestRegressor()
rf.fit(X=X_train, y=y_train)

In [None]:
# Test-set score of the random forest, expressed as a percentage.
rf_acc = 100 * rf.score(X_test, y_test)
print("Random Forest Regressor Accuracy - ", rf_acc)

In [None]:
# Make predictions for the X_test data with the fitted random forest
# (replaces the y_pred left by the previous model).
y_pred = rf.predict(X_test)

In [None]:
# Error metrics on the test set. MSE is computed once and reused for RMSE.
# "R2" is the coefficient of determination (r2_score); explained_variance_score
# is a different metric that ignores a constant bias in the predictions.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
# Compare predictions with the real values on the first 200 test rows.
plt.figure(figsize=(20,8))
# Title added so this figure matches the other model prediction plots.
plt.title("Random Forest Predictions")
plt.plot(rf.predict(X_test[:200]), label="prediction", linewidth=2.0,color='blue')
plt.plot(y_test[:200].values, label="real_values", linewidth=2.0,color='lightcoral')
plt.legend(loc="best")
plt.savefig('rf_real_pred.png')
plt.show()

# **Deep Neural Network**

We create a simple neural network for regression. The first layer has 64 neurons and uses ReLU as activation function. This layer takes the input from the X_train data. The second layer has 32 neurons. The third layer has only one neuron, which is the output layer for the regression.

ReLU : Stands for Rectified Linear Unit. It is an activation function that returns the input value X if X > 0, else it returns 0. It is a piecewise-linear (and therefore non-linear) function: it is zero for negative inputs and increases linearly for positive inputs. Activation functions are applied to the outputs of each neuron in order to add non-linearity and allow the network to recognise complex patterns and relationships.

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


input_dim = X_train.shape[1] # Number of Input Features

def create_model():
  """Build and compile the feed-forward regression network.

  Layers: Dense(64, ReLU) -> Dense(32, no activation) -> Dense(1, no
  activation), all with the 'normal' kernel initializer. The input width is
  the module-level ``input_dim``.

  Returns:
    A compiled Sequential model using mean absolute error as the loss and the
    Adam optimizer.
  """
  model = Sequential()
  # Declare the input shape with an explicit Input object instead of the
  # deprecated `input_dim=` layer argument.
  model.add(tf.keras.Input(shape=(input_dim,)))
  model.add(Dense(64, kernel_initializer='normal',activation='relu'))
  # NOTE(review): this layer has no activation, so together with the output
  # layer it forms a purely linear mapping - confirm this is intended.
  model.add(Dense(32, kernel_initializer='normal'))
  model.add(Dense(1, kernel_initializer='normal'))
  model.compile(loss='mean_absolute_error', optimizer='adam')
  return model

In [None]:
# Instantiate the compiled network returned by create_model().
dnn_model = create_model()

In [None]:
# Recreate the 80/20 split with the same random_state as the earlier split.
split_parts = train_test_split(X, Y, random_state=50, test_size=0.20)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Cast every split to float before handing the data to the neural network.
X_train, y_train, X_test, y_test = (
    split.astype(float) for split in (X_train, y_train, X_test, y_test)
)

Now, we train the deep model using the training data. The epochs parameter indicates the number of times the training dataset is passed forward and backward through the neural network during training. In each epoch, the forward pass computes the predictions and the backward pass updates the weights.

In [None]:
# Train for 100 epochs in batches of 5000 rows, holding out 20% of the training
# data for validation. The returned object is kept in `history`.
history = dnn_model.fit(
    x=X_train,
    y=y_train,
    batch_size=5000,
    epochs=100,
    validation_split=0.2,
)

In [None]:
# Score the network on the test set. The predictions are computed here so the
# score reflects the network itself rather than the y_pred left behind by the
# previous model. r2_score expects its arguments as (y_true, y_pred).
y_pred = dnn_model.predict(X_test)
dnn_acc = metrics.r2_score(y_test, y_pred)*100
print("Deep Neural Network accuracy - ",dnn_acc)

In [None]:
# Make predictions for the X_test data with the trained network
# (replaces the y_pred left by the previous model).
y_pred = dnn_model.predict(X_test)

In [None]:
# Error metrics on the test set. MSE is computed once and reused for RMSE.
# "R2" is the coefficient of determination (r2_score); explained_variance_score
# is a different metric that ignores a constant bias in the predictions.
mse = metrics.mean_squared_error(y_test, y_pred)
print("MAE" , metrics.mean_absolute_error(y_test, y_pred))
print("MSE" , mse)
print("RMSE" , np.sqrt(mse))
print("R2" , metrics.r2_score(y_test, y_pred))

In [None]:
# Compare predictions with the real values on test rows 200-299.
plt.figure(figsize=(20,8))
plt.title("Deep Neural Network Predictions")
plt.plot(dnn_model.predict(X_test[200:300]), label="prediction", linewidth=2.0,color='blue')
plt.plot(y_test[200:300].values, label="real_values", linewidth=2.0,color='lightcoral')
# Add the legend before saving so that it also appears in the saved image.
plt.legend(loc="best")
plt.savefig('dnn_real_pred.png')
plt.show()

# **Comparing the Models**

In [None]:
# Collect each model's score under its label, then turn the pairs into a
# two-column frame (model, accuracy) for plotting.
model_scores = {
    'lr_acc': lr_acc,
    'xgb_acc': xgb_acc,
    'dnn_acc': dnn_acc,
    'knn_acc': knn_acc,
    'rf_acc': rf_acc,
}
acc = {'model': list(model_scores), 'accuracy': list(model_scores.values())}
acc_df = pd.DataFrame(acc)

In [None]:
# Bar chart with one bar per model, showing its score; saved and then displayed.
plt.figure(figsize=(4, 6))
sns.barplot(data=acc_df, x='model', y='accuracy')
plt.savefig('compared_models.png')
plt.show()