In [14]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Ensure plots display in notebook
%matplotlib inline

# Ensure figure folder exists
os.makedirs("../reports/figures", exist_ok=True)


In [15]:
# Load data
df = pd.read_csv("../data/retail_features.csv")

# Convert date column
df['date'] = pd.to_datetime(df['date'])

# Quick check
print(df.head())
print(df.columns.tolist())


        date sku_id category  price  promotion  holiday  demand  day_of_week  \
0 2024-01-31  SKU_1   Sports  32.44          1        0      94            2   
1 2024-02-01  SKU_1   Sports  16.93          0        0      64            3   
2 2024-02-02  SKU_1   Sports  24.51          0        0      69            4   
3 2024-02-03  SKU_1   Sports  26.79          0        0      75            5   
4 2024-02-04  SKU_1   Sports  58.54          0        0      61            6   

   month  quarter  day_of_year  week_of_year  lag_1  lag_7  lag_14  lag_30  \
0      1        1           31             5   54.0   72.0    52.0    91.0   
1      2        1           32             5   94.0   64.0    55.0    43.0   
2      2        1           33             5   64.0   63.0    55.0    47.0   
3      2        1           34             5   69.0   61.0    50.0    66.0   
4      2        1           35             5   75.0   73.0    78.0    45.0   

   rolling_mean_7  rolling_mean_30  
0       63.57

In [16]:
# Sort by SKU + date
df = df.sort_values(['sku_id', 'date'])

# Drop any rows with NaNs (from lag/rolling features)
df = df.dropna().copy()

# Define target and features
target = 'demand'
feature_cols = [
    'price',
    'promotion',
    'lag_7',
    'lag_14',
    'rolling_mean_7',
    'rolling_mean_30',
    'day_of_week',
    'month'
]


In [17]:
# Time-based split: last 90 days as test
split_date = df['date'].max() - pd.Timedelta(days=90)
train = df[df['date'] <= split_date]
test = df[df['date'] > split_date]

# Features and target
X_train = train[feature_cols]
y_train = train[target]

X_test = test[feature_cols]
y_test = test[target]

# Baseline naive forecast: lag 7
test['naive_forecast'] = test['lag_7']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['naive_forecast'] = test['lag_7']


In [18]:
def evaluate(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    # Safe MAPE
    mape = np.mean(np.abs((y_true - y_pred) / np.where(y_true == 0, 1, y_true))) * 100
    return mae, rmse, mape


In [19]:
baseline_metrics = evaluate(y_test, test['naive_forecast'])
print("Baseline Metrics (Naive):", baseline_metrics)


Baseline Metrics (Naive): (14.167777777777777, np.float64(18.251410904365724), np.float64(25.92964018309532))


In [20]:
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
lr_metrics = evaluate(y_test, lr_pred)

# Random Forest
rf = RandomForestRegressor(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
rf_metrics = evaluate(y_test, rf_pred)

# XGBoost
xgb = XGBRegressor(n_estimators=300, learning_rate=0.05, max_depth=6, random_state=42)
xgb.fit(X_train, y_train)
xgb_pred = xgb.predict(X_test)
xgb_metrics = evaluate(y_test, xgb_pred)


In [21]:
results = pd.DataFrame({
    'Model': ['Baseline', 'Linear Regression', 'Random Forest', 'XGBoost'],
    'MAE': [baseline_metrics[0], lr_metrics[0], rf_metrics[0], xgb_metrics[0]],
    'RMSE': [baseline_metrics[1], lr_metrics[1], rf_metrics[1], xgb_metrics[1]],
    'MAPE': [baseline_metrics[2], lr_metrics[2], rf_metrics[2], xgb_metrics[2]]
})

# Sort by MAPE
results = results.sort_values('MAPE')
print(results)

# Improvement vs baseline
improvement = ((baseline_metrics[2] - results.iloc[0]['MAPE']) / baseline_metrics[2]) * 100
print(f"Improvement vs Baseline: {improvement:.2f}%")


               Model        MAE       RMSE       MAPE
3            XGBoost   5.908121   8.205925  10.568572
2      Random Forest   6.313839   8.641599  11.315891
1  Linear Regression   6.547825   8.819418  12.796019
0           Baseline  14.167778  18.251411  25.929640
Improvement vs Baseline: 59.24%


In [22]:
# Store best predictions in test
test['prediction'] = xgb_pred  # using XGBoost as best model
test['error'] = abs(test['demand'] - test['prediction'])

plt.figure(figsize=(12,6))
plt.plot(test['date'], test['demand'], label='Actual')
plt.plot(test['date'], test['prediction'], label='Forecast')
plt.legend()
plt.title("Forecast vs Actual Demand")
plt.xlabel("Date")
plt.ylabel("Demand")
plt.savefig("../reports/figures/forecast_vs_actual.png")
plt.close()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['prediction'] = xgb_pred  # using XGBoost as best model
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['error'] = abs(test['demand'] - test['prediction'])


In [23]:
# Average error per SKU
sku_error = test.groupby('sku_id')['error'].mean().sort_values(ascending=False)
print(sku_error.head())

# Error during promotions
promo_error = test.groupby('promotion')['error'].mean()
print("Error by Promotion Flag:")
print(promo_error)

# Low-volume SKU stability
sku_volume = test.groupby('sku_id')['demand'].mean()
analysis = pd.concat([sku_volume, sku_error], axis=1)
analysis.columns = ['avg_demand', 'avg_error']
print(analysis.sort_values('avg_demand').head())


sku_id
SKU_38    10.263937
SKU_2      8.715967
SKU_1      7.168891
SKU_13     6.836419
SKU_15     6.644244
Name: error, dtype: float64
Error by Promotion Flag:
promotion
0    5.884956
1    6.118155
Name: error, dtype: float64
        avg_demand  avg_error
sku_id                       
SKU_18   29.466667   5.120433
SKU_47   31.077778   5.156294
SKU_27   36.433333   6.301154
SKU_39   41.355556   6.144476
SKU_23   41.788889   5.205267


In [1]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print("MAE:", mae)
print("RMSE:", rmse)
print("MAPE:", mape)

NameError: name 'y_test' is not defined

In [2]:
target = "demand"

X = df.drop(columns=[target])
y = df[target]

NameError: name 'df' is not defined

In [3]:
import pandas as pd
import numpy as np


In [4]:
df = pd.read_csv("../data/retail_demand_data.csv")

In [5]:
df = pd.read_csv("../data/processed_features.csv")

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed_features.csv'