In [None]:
# Import necessary libraries
import pandas as pd  # For data manipulation
import numpy as np   # For numerical computations
import matplotlib.pyplot as plt  # For visualization
import seaborn as sns  # For advanced plotting
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.linear_model import LinearRegression  # For regression modeling
from sklearn.metrics import mean_absolute_error, mean_squared_error  # For model evaluation


### **1. Loading the Dataset**


In [None]:
import pandas as pd
from google.colab import files

uploaded = files.upload()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
walmart_data = pd.read_csv("Walmart.csv")

In [None]:
# I then selected the variables i needed from the dataset
selected_data = ['Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
selected_data = walmart_data[selected_data]
selected_data.head()

Unnamed: 0,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,1643690.9,0,42.31,2.572,211.096358,8.106
1,1641957.44,1,38.51,2.548,211.24217,8.106
2,1611968.17,0,39.93,2.514,211.289143,8.106
3,1409727.59,0,46.63,2.561,211.319643,8.106
4,1554806.68,0,46.5,2.625,211.350143,8.106


### **2. Data Cleaning**

In [None]:
# Checking for missing values in each variable
missing_values = selected_data.isnull().sum()
print("Missing values per variable:\n", missing_values)

Missing values per variable:
 Weekly_Sales    0
Holiday_Flag    0
Temperature     0
Fuel_Price      0
CPI             0
Unemployment    0
dtype: int64


In [None]:
# Convert 'Holiday_Flag' into categorical (one-hot encoding not needed since it's binary)
selected_data['Holiday_Flag'] = selected_data['Holiday_Flag'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_data['Holiday_Flag'] = selected_data['Holiday_Flag'].astype(int)


### **3. Descriptive Analysis**

In [None]:
# Since Holiday_Flag is a categorical variable, i removed it from the dataframe before running the descriptive analysis

desc_stats = selected_data.drop(columns=['Holiday_Flag']).describe()
print("\nDescriptive Statistics:\n")

desc_stats


Descriptive Statistics:



Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435.0,6435.0,6435.0,6435.0
mean,1046965.0,60.663782,3.358607,171.578394,7.999151
std,564366.6,18.444933,0.45902,39.356712,1.875885
min,209986.2,-2.06,2.472,126.064,3.879
25%,553350.1,47.46,2.933,131.735,6.891
50%,960746.0,62.67,3.445,182.616521,7.874
75%,1420159.0,74.94,3.735,212.743293,8.622
max,3818686.0,100.14,4.468,227.232807,14.313


In [None]:
corr_matrix = selected_data.drop(columns=['Holiday_Flag']).corr()
print("\nCorrelation Matrix:\n")

corr_matrix


Correlation Matrix:



Unnamed: 0,Weekly_Sales,Temperature,Fuel_Price,CPI,Unemployment
Weekly_Sales,1.0,-0.06381,0.009464,-0.072634,-0.106176
Temperature,-0.06381,1.0,0.144982,0.176888,0.101158
Fuel_Price,0.009464,0.144982,1.0,-0.170642,-0.034684
CPI,-0.072634,0.176888,-0.170642,1.0,-0.30202
Unemployment,-0.106176,0.101158,-0.034684,-0.30202,1.0


### =========================
### **4. Define Features and Target Variable**

In [None]:
X = selected_data[['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Holiday_Flag']]  # Independent variables
y = selected_data['Weekly_Sales']  # Target variable

**Training the Linear Regression Model**

In [None]:
# Split data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### **5. Fit Linear Regression Model**

In [None]:
import statsmodels.api as sm

# Fit the regression model without adding a constant
model = sm.OLS(y_train, X_train).fit()

# Print the model summary
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:           Weekly_Sales   R-squared (uncentered):                   0.764
Model:                            OLS   Adj. R-squared (uncentered):              0.763
Method:                 Least Squares   F-statistic:                              3324.
Date:                Sun, 16 Mar 2025   Prob (F-statistic):                        0.00
Time:                        01:13:12   Log-Likelihood:                         -75612.
No. Observations:                5148   AIC:                                  1.512e+05
Df Residuals:                    5143   BIC:                                  1.513e+05
Df Model:                           5                                                  
Covariance Type:            nonrobust                                                  
                   coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------

### =========================
### **6. Model Evaluation**


In [None]:
# Predict on test data
y_pred = model.predict(X_test)

# Calculate MAE
mae = mean_absolute_error(y_test, y_pred)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print results
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Square Error (RMSE): {rmse:.2f}")

Mean Absolute Error (MAE): 469043.07
Root Mean Square Error (RMSE): 569992.55
