# Loading Data

In [1]:
import numpy as np
import pandas as pd

# Read the Walmart weekly-sales CSV (expected next to the notebook) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="walmart_sales_data.csv")

# Preview the first five rows as a quick sanity check of the load.
data.head(n=5)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


# EDA (Exploratory Data Analysis)

In [2]:
# Structural overview of the raw frame, emitted with a single print call.
# Each section already ends in " \n"; sep="\n" plus print's own trailing
# newline keeps the blank line between sections.
print(
    # column names
    f"Column names in dataset are: {list(data.columns)} \n",
    # (rows, columns)
    f"Shape of dataset is: {data.shape} \n",
    # dtype of every column
    f"Description of columns: \n{data.dtypes} \n",
    # number of missing values per column
    f"Missing Values: \n{data.isnull().sum()} \n",
    sep="\n",
)

Column names in dataset are: ['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment'] 

Shape of dataset is: (6435, 8) 

Description of columns: 
Store             int64
Date             object
Weekly_Sales    float64
Holiday_Flag      int64
Temperature     float64
Fuel_Price      float64
CPI             float64
Unemployment    float64
dtype: object 

Missing Values: 
Store           0
Date            0
Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64 



# Splitting (Target and Features, Train and Test)

In [3]:
# Target: the weekly sales figure.
y = data['Weekly_Sales']

# Features: every remaining column except the target itself and the raw
# 'Date' string column (`columns=` already implies the column axis).
x = data.drop(columns=['Date', 'Weekly_Sales'])

print(
    f"Shape of features: {x.shape}",
    f"Shape of target: {y.shape}",
    sep="\n",
)

Shape of features: (6435, 6)
Shape of target: (6435,)


In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
split_parts = train_test_split(x, y, test_size=0.2, random_state=2)
x_train, x_test, y_train, y_test = split_parts

print(
    f"Shape of x_train: {x_train.shape}",
    f"Shape of y_train: {y_train.shape}",
    f"Shape of x_test: {x_test.shape}",
    f"Shape of y_test: {y_test.shape}",
    sep="\n",
)

Shape of x_train: (5148, 6)
Shape of y_train: (5148,)
Shape of x_test: (1287, 6)
Shape of y_test: (1287,)


# Model Training and Validation

In [5]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_log_error

# Fit three regressors on the same training split and keep each model's
# predictions for the held-out test rows so their R² scores can be compared.

# Gradient-boosted trees (seeded).
model_xgboost = XGBRegressor(random_state = 42)
model_xgboost.fit(x_train, y_train)
prediction_x_test_XGBRegressor = model_xgboost.predict(x_test)

# Ordinary least squares baseline (deterministic, no seed needed).
model_LinearRegression = LinearRegression()
model_LinearRegression.fit(x_train, y_train)
prediction_x_test_LinearRegression = model_LinearRegression.predict(x_test)

# Random forest. Seeded like the split and the XGBoost model: an unseeded
# forest is rebuilt differently on every run, so its score and the exported
# model would not be reproducible after a kernel restart.
model_RandomForestRegressor = RandomForestRegressor(random_state=42)
model_RandomForestRegressor.fit(x_train, y_train)
prediction_x_test_RandomForestRegressor = model_RandomForestRegressor.predict(x_test)

# R² on the test set for each model (higher is better, 1.0 is a perfect fit).
print(f"r2_score of LinearRegression on test set is: {r2_score(y_test, prediction_x_test_LinearRegression)}")
print(f"r2_score of RandomForestRegressor on test set is: {r2_score(y_test, prediction_x_test_RandomForestRegressor)}")
print(f"r2_score of XGBRegressor on test set is: {r2_score(y_test, prediction_x_test_XGBRegressor)}")

r2_score of LinearRegression on test set is: 0.11588957331942318
r2_score of RandomForestRegressor on test set is: 0.9378112081877883
r2_score of XGBRegressor on test set is: 0.9588203554873219


# Building a Prediction System

In [6]:
# Show the first five training rows; the first one (index 99) supplies the
# feature values used for the single-sample prediction check.
x_train.head(n=5)

Unnamed: 0,Store,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
99,1,1,44.55,3.129,219.53599,7.866
5641,40,0,52.68,3.988,134.435733,4.781
1970,14,0,59.45,3.9,190.536321,8.424
179,2,0,65.8,2.72,211.471329,8.163
3300,24,0,51.07,3.021,132.022667,8.211


In [7]:
# Display the training target (Weekly_Sales); its index labels match the rows of x_train.
y_train

99      1497462.72
5641     847246.50
1970    1893447.71
179     1794355.49
3300    1264117.01
           ...    
1099    1462254.05
2514     951549.61
3606    1042226.30
5704     979848.71
2575    1536549.95
Name: Weekly_Sales, Length: 5148, dtype: float64

In [8]:
# Single sample to sanity-check the fitted models (values of training row 99).
# Built as a one-row DataFrame with the training column names and order:
# the models were fitted on a named DataFrame, so an unlabeled ndarray would
# rely purely on positional order and makes scikit-learn warn that
# "X does not have valid feature names".
sample_features = {
    "Store": 1,
    "Holiday_Flag": 1,
    "Temperature": 44.55,
    "Fuel_Price": 3.129,
    "CPI": 219.535990,
    "Unemployment": 7.866,
}
input_data = pd.DataFrame([sample_features], columns=x_train.columns)

# Random-forest prediction for the sample; [0] unwraps the one-element result.
model_RandomForestRegressor.predict(input_data)[0]



1680926.0978999992

In [9]:
# Linear-regression prediction for the same single-row sample; [0] unwraps the one-element result.
model_LinearRegression.predict(input_data)[0]



1386900.72924603

In [10]:
# XGBoost prediction for the same single-row sample; [0] unwraps the one-element result.
model_xgboost.predict(input_data)[0]

1543521.5

# Exporting Models

In [11]:
import pickle

# Serialize the fitted XGBoost model. The context manager closes the file
# handle (flushing the data to disk) even if pickling raises; the bare
# open(...) call it replaces was never closed.
with open("model_xgboost_predictor.pkl", "wb") as model_file:
    pickle.dump(model_xgboost, model_file)

In [12]:
# Serialize the fitted linear-regression model. The context manager closes
# the file handle (flushing the data to disk) even if pickling raises.
with open("model_LinearRegression_predictor.pkl", "wb") as model_file:
    pickle.dump(model_LinearRegression, model_file)

In [13]:
# Serialize the fitted random-forest model. The context manager closes the
# file handle (flushing the data to disk) even if pickling raises.
with open("model_RandomForestRegressor_predictor.pkl", "wb") as model_file:
    pickle.dump(model_RandomForestRegressor, model_file)