In [4]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [5]:
# Read the processed sales CSV (path is relative to the notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer='../data/processed/walmart_sales_cleaned.csv',
)

# Preview the first five rows as a quick sanity check of the columns.
df.head(n=5)

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Week
0,1,2010-02-05,1643690.9,0,42.31,2.572,211.096358,8.106,2010,2,5
1,10,2010-02-05,2193048.75,0,54.34,2.962,126.442065,9.765,2010,2,5
2,37,2010-02-05,536006.73,0,45.97,2.572,209.852966,8.554,2010,2,5
3,17,2010-02-05,789036.02,0,23.11,2.666,126.442065,6.548,2010,2,5
4,30,2010-02-05,465108.52,0,39.05,2.572,210.752605,8.324,2010,2,5


In [6]:
# Column the models are trained to predict.
target = 'Weekly_Sales'

# Predictor columns; 'Date' itself is left out in favour of its
# Year / Month / Week parts.
features = [
    'Store',
    'Holiday_Flag',
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Year',
    'Month',
    'Week',
]

# Target vector and feature matrix.
y = df[target]
X = df[features]

In [7]:
# Store ids are categorical labels, not quantities, so they are one-hot
# encoded before being fed to the regression models. drop_first=True omits
# the first store's indicator column to avoid a redundant (collinear) dummy.
#
# The encoding is built from df[features] rather than from X itself so this
# cell can be re-run safely: re-encoding an already-encoded X would fail
# because the 'Store' column no longer exists in it.
X = pd.get_dummies(df[features], columns=['Store'], drop_first=True)

X.head()

Unnamed: 0,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Week,Store_2,Store_3,...,Store_36,Store_37,Store_38,Store_39,Store_40,Store_41,Store_42,Store_43,Store_44,Store_45
0,0,42.31,2.572,211.096358,8.106,2010,2,5,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,54.34,2.962,126.442065,9.765,2010,2,5,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,45.97,2.572,209.852966,8.554,2010,2,5,False,False,...,False,True,False,False,False,False,False,False,False,False
3,0,23.11,2.666,126.442065,6.548,2010,2,5,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,39.05,2.572,210.752605,8.324,2010,2,5,False,False,...,False,False,False,False,False,False,False,False,False,False


In [22]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): train_test_split shuffles by default, so held-out weeks are
# interleaved with training weeks -- confirm this is intended rather than a
# chronological hold-out.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows landed on each side of the split.
print(f"Training samples: {X_train.shape[0]}")
print(f"Testing samples: {X_test.shape[0]}")

Training samples: 5148
Testing samples: 1287


In [23]:
# Fit a linear regression on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows.
y_pred = model.predict(X_test)

In [29]:
# Score the linear model on the held-out rows.
# RMSE is the square root of the MSE, so it is expressed in the target's units.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"R² Score: {r2:.3f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")

R² Score: 0.938
Root Mean Squared Error (RMSE): 139,035.77


In [30]:
# Train a random forest on the same training split used for the linear model.
# n_estimators=100 builds 100 trees; random_state fixes the forest's random
# sampling so that re-running the cell reproduces the same model.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the target for the held-out rows.
rf_pred = rf_model.predict(X_test)

In [31]:
# Score the random forest on the held-out rows using R² and RMSE.
rf_mse = mean_squared_error(y_test, rf_pred)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, rf_pred)

print(f"Random Forest R² Score: {rf_r2:.3f}")
print(f"Random Forest RMSE: {rf_rmse:,.2f}")

Random Forest R² Score: 0.951
Random Forest RMSE: 124,048.14


In [32]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear model over the full feature matrix.
# The scorer yields negated RMSE (scikit-learn scorers are "higher is better"),
# so the mean is negated to report a positive RMSE.
# NOTE(review): an integer cv uses consecutive, unshuffled folds, so this
# figure is not directly comparable with an RMSE from a shuffled hold-out
# split -- confirm which evaluation scheme is intended.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    cv=5,
    scoring='neg_root_mean_squared_error',
)
avg_rmse = -scores.mean()
print("Average RMSE:", avg_rmse)

Average RMSE: 155788.40761048015
