## Import libraries

In [19]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [20]:
# import data
data = pd.read_csv(r'Y:\AFRICDSA\MACHINE LEARNING\DATA SETS\Walmart.csv')
data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


In [21]:
## check our data info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         6435 non-null   int64  
 1   Date          6435 non-null   object 
 2   Weekly_Sales  6435 non-null   float64
 3   Holiday_Flag  6435 non-null   int64  
 4   Temperature   6435 non-null   float64
 5   Fuel_Price    6435 non-null   float64
 6   CPI           6435 non-null   float64
 7   Unemployment  6435 non-null   float64
dtypes: float64(5), int64(2), object(1)
memory usage: 402.3+ KB


In [22]:
# check for duplicates
data.duplicated().sum()

0

## pandas profiling

In [23]:
from ydata_profiling import ProfileReport

In [24]:
profile = ProfileReport(data, title="Walmart Profiling Report", explorative=True)

In [25]:
profile

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [26]:
data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1,05-02-2010,1643690.9,0,42.31,2.572,211.096358,8.106
1,1,12-02-2010,1641957.44,1,38.51,2.548,211.24217,8.106
2,1,19-02-2010,1611968.17,0,39.93,2.514,211.289143,8.106
3,1,26-02-2010,1409727.59,0,46.63,2.561,211.319643,8.106
4,1,05-03-2010,1554806.68,0,46.5,2.625,211.350143,8.106


## Feature Engineer the data

In [27]:
# create Day, Month and Year columns
data['Date'] = pd.to_datetime(data['Date'], format='%d-%m-%Y')
data['Day'] = data['Date'].dt.day
data['Month'] = data['Date'].dt.month
data['Year'] = data['Date'].dt.year

# drop Date column
data.drop('Date', axis=1, inplace=True)
data.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Day,Month,Year
0,1,1643690.9,0,42.31,2.572,211.096358,8.106,5,2,2010
1,1,1641957.44,1,38.51,2.548,211.24217,8.106,12,2,2010
2,1,1611968.17,0,39.93,2.514,211.289143,8.106,19,2,2010
3,1,1409727.59,0,46.63,2.561,211.319643,8.106,26,2,2010
4,1,1554806.68,0,46.5,2.625,211.350143,8.106,5,3,2010


## Scale the data
## add your code here
# def feature_engineer(data):

In [28]:
# createing the dependent variable
X = data.drop('Unemployment', axis=1)
y = data['Unemployment']

# split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

## Machine learning 

#### Problem Statement
- predict Unemployment Rate in using walmart data

In [29]:
# base model - Linear Regression

from sklearn.linear_model import LinearRegression

# instantiate the model
base_model = LinearRegression()

# fit the model
base_model.fit(X_train, y_train)

# score the model
base_model_score = base_model.score(X_test, y_test)
# score the model
print('Base Model Score', round(base_model_score*100), '%')

Base Model Score 22 %


In [30]:
# score of training set
base_model_train_score = base_model.score(X_train, y_train)

print('Base Model Train Score', round(base_model_train_score*100), '%')

Base Model Train Score 23 %


In [31]:
## fit ridge and lasso regression models and compare the scores with the base model
from sklearn.linear_model import Ridge, Lasso

# instantiate the models
ridge_model = Ridge(alpha=0.5)
lasso_model = Lasso(alpha=0.01)

# fit the models
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# score the models
ridge_model_score = ridge_model.score(X_test, y_test)
lasso_model_score = lasso_model.score(X_test, y_test)

# score the models
print('Ridge Model Score', round(ridge_model_score*100), '%')
print('Lasso Model Score', round(lasso_model_score*100), '%')

Ridge Model Score 22 %
Lasso Model Score 22 %


In [32]:
## fit a decision tree model and a random forest model and compare the scores with the base model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# instantiate the models
dt_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()

# fit the models
dt_model.fit(X_train, y_train)
rf_model.fit(X_train, y_train)

# score the models
dt_model_score = dt_model.score(X_test, y_test)
rf_model_score = rf_model.score(X_test, y_test)

# score the models
print('Decision Tree Model Score', round(dt_model_score*100), '%')
print('Random Forest Model Score', round(rf_model_score*100), '%')

Decision Tree Model Score 90 %
Random Forest Model Score 98 %


In [33]:
# check the accuracy on the training set
dt_model_train_score = dt_model.score(X_train, y_train)
rf_model_train_score = rf_model.score(X_train, y_train)

print('Decision Tree Model Train Score', round(dt_model_train_score*100), '%')
print('Random Forest Model Train Score', round(rf_model_train_score*100), '%')

Decision Tree Model Train Score 100 %
Random Forest Model Train Score 100 %


## making predictions

In [34]:
X.sample(5, random_state=0)

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Day,Month,Year
3949,28,1220984.94,0,67.31,3.805,129.770645,14,10,2011
4555,32,1246322.44,0,71.14,3.741,197.655186,8,6,2012
2235,16,505918.21,0,41.97,3.55,195.331898,28,10,2011
5204,37,527572.25,0,64.55,3.288,212.576205,4,3,2011
3524,25,719235.07,0,44.81,3.53,210.98102,11,11,2011


In [35]:
# make predictions using the random forest model and compare with the actual values
sample_data = X.sample(5, random_state=0)
sample_data['Predicted_Unemployment'] = rf_model.predict(sample_data)
sample_data['Actual_Unemployment'] = y.sample(5, random_state=0)
sample_data

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Day,Month,Year,Predicted_Unemployment,Actual_Unemployment
3949,28,1220984.94,0,67.31,3.805,129.770645,14,10,2011,12.781,12.89
4555,32,1246322.44,0,71.14,3.741,197.655186,8,6,2012,8.08346,8.09
2235,16,505918.21,0,41.97,3.55,195.331898,28,10,2011,6.24578,6.232
5204,37,527572.25,0,64.55,3.288,212.576205,4,3,2011,8.395,8.395
3524,25,719235.07,0,44.81,3.53,210.98102,11,11,2011,7.082,7.082


In [38]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Create the CatBoost model
catboost_model = CatBoostRegressor()

# Fit the model
catboost_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = catboost_model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Learning rate set to 0.052503
0:	learn: 1.8185581	total: 148ms	remaining: 2m 28s
1:	learn: 1.7801462	total: 152ms	remaining: 1m 15s
2:	learn: 1.7260473	total: 157ms	remaining: 52.1s
3:	learn: 1.6849137	total: 160ms	remaining: 39.8s
4:	learn: 1.6464423	total: 162ms	remaining: 32.2s
5:	learn: 1.6077540	total: 165ms	remaining: 27.4s
6:	learn: 1.5773112	total: 172ms	remaining: 24.5s
7:	learn: 1.5509013	total: 176ms	remaining: 21.8s
8:	learn: 1.5136883	total: 178ms	remaining: 19.6s
9:	learn: 1.4814532	total: 186ms	remaining: 18.4s
10:	learn: 1.4513135	total: 200ms	remaining: 18s
11:	learn: 1.4217219	total: 205ms	remaining: 16.8s
12:	learn: 1.3913305	total: 211ms	remaining: 16s
13:	learn: 1.3674010	total: 215ms	remaining: 15.2s
14:	learn: 1.3456410	total: 218ms	remaining: 14.3s
15:	learn: 1.3264952	total: 222ms	remaining: 13.7s
16:	learn: 1.3003735	total: 226ms	remaining: 13.1s
17:	learn: 1.2830034	total: 229ms	remaining: 12.5s
18:	learn: 1.2639663	total: 232ms	remaining: 12s
19:	learn: 1.24

In [41]:
corr_matrix = data.corr()
plt.figure(figsize=(10, 6))
sns.heatmap(corr_matrix)
plt.show()

In [43]:
# heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='viridis')
plt.show()
