In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from src.config import DATA_PROCESSED
from src.feature_engineering import (
    add_sku_rolling_stats,
    add_time_aware_category_te,
)

def mape(y_true, y_pred):
    """Mean absolute percentage error, in percent, over strictly positive actuals.

    Both inputs are converted to float NumPy arrays, so lists, tuples, arrays
    and pandas Series are all accepted and are compared position by position.

    Parameters
    ----------
    y_true : array-like
        Actual values. Entries that are not greater than zero are excluded,
        which also avoids dividing by zero.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / y_true) * 100`` over the kept entries, or
        ``0.0`` when no actual value is positive. NaN terms are skipped, which
        is what ``np.mean`` already did for pandas Series inputs.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    mask = actual > 0
    if not mask.any():
        return 0.0

    pct_errors = np.abs((actual[mask] - forecast[mask]) / actual[mask]) * 100
    # nanmean keeps the NaN-skipping result Series inputs used to produce.
    return np.nanmean(pct_errors)

def wape(y_true, y_pred):
    """Weighted absolute percentage error, in percent.

    Both inputs are converted to float NumPy arrays, so lists, tuples, arrays
    and pandas Series are all accepted and are compared position by position.

    Parameters
    ----------
    y_true : array-like
        Actual values.
    y_pred : array-like
        Predicted values, same length as ``y_true``.

    Returns
    -------
    float
        ``sum(|y_true - y_pred|) / sum(|y_true|) * 100``, or ``0.0`` when the
        sum of absolute actuals is not positive. NaN terms are skipped in both
        sums, which is what ``.sum()`` already did for pandas Series inputs.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    denom = np.nansum(np.abs(actual))
    if denom > 0:
        return np.nansum(np.abs(actual - forecast)) / denom * 100
    return 0.0

# Load the processed train/test splits.
train_df = pd.read_csv(DATA_PROCESSED / "train_data.csv")
test_df = pd.read_csv(DATA_PROCESSED / "test_data.csv")

# Parse "date" in BOTH frames. read_csv leaves it as text, and the sort and
# the time-based helpers below should see the same chronological dtype on the
# train side as on the test side (string order is only chronological for
# ISO-formatted dates).
train_df["date"] = pd.to_datetime(train_df["date"])
test_df["date"] = pd.to_datetime(test_df["date"])

# Feature columns read directly from the processed CSVs.
FEATURES = [
    "day_of_week",
    "is_weekend",
    "day_of_year",
    "qty_lag_1",
    "qty_lag_7",
    "qty_lag_14",
    "qty_roll_7",
    "qty_roll_14",
    "is_holiday",
    "days_to_holiday",
    "days_since_holiday",
    "is_fasting"
]

TARGET = "target_qty"

# Order rows per SKU by date before calling the feature helpers.
train_df = train_df.sort_values(["sku_id", "date"])
test_df = test_df.sort_values(["sku_id", "date"])

# NOTE(review): each split is passed to the helper on its own, so test rows
# presumably get no history from the end of the training period - confirm
# this is intended.
train_df = add_sku_rolling_stats(train_df, 28)
test_df = add_sku_rolling_stats(test_df, 28)

# NOTE(review): the test-side call is given the test frame together with
# TARGET. Confirm the helper only uses strictly earlier rows; otherwise the
# reported test metrics are optimistic.
train_df = add_time_aware_category_te(train_df, TARGET)
test_df = add_time_aware_category_te(test_df, TARGET)

# Base features plus three columns presumably added by the two helpers above
# (verify the names against src.feature_engineering).
FINAL_FEATURES = FEATURES + [
    "sku_avg_28d",
    "sku_std_28d",
    "category_te",
]

X_train = train_df[FINAL_FEATURES]
y_train = train_df[TARGET]

X_test = test_df[FINAL_FEATURES]
y_test = test_df[TARGET]

# Fill gaps in the feature matrix with per-column medians. The medians are
# learned from the training rows only and then reused for the test rows.
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train)
X_train_imp = pd.DataFrame(
    imputer.transform(X_train),
    index=X_train.index,
    columns=FINAL_FEATURES,
)
X_test_imp = pd.DataFrame(
    imputer.transform(X_test),
    index=X_test.index,
    columns=FINAL_FEATURES,
)

# Fit the linear model on the imputed training features.
model = LinearRegression().fit(X_train_imp, y_train)

# Score the test rows and keep the forecasts next to the actuals. Working on
# a copy leaves the frame produced by the earlier steps untouched.
y_pred = model.predict(X_test_imp)
test_df = test_df.copy()
test_df["prediction"] = y_pred

# --- Score the forecasts on daily and monthly totals ---
def _score_totals(totals):
    """Return (MAE, MAPE, WAPE) for a frame with 'actual' and 'pred' columns."""
    actual, pred = totals["actual"], totals["pred"]
    return mean_absolute_error(actual, pred), mape(actual, pred), wape(actual, pred)


# Both aggregations sum the target and the prediction within each group.
sum_spec = {"actual": ("target_qty", "sum"), "pred": ("prediction", "sum")}

# Daily totals: one row per calendar date.
daily_agg = test_df.groupby("date").agg(**sum_spec).reset_index()
daily_mae, daily_mape, daily_wape = _score_totals(daily_agg)

# Monthly totals: one row per calendar month (monthly period of "date").
month_key = test_df["date"].dt.to_period("M")
monthly_agg = test_df.groupby(month_key).agg(**sum_spec).reset_index()
monthly_mae, monthly_mape, monthly_wape = _score_totals(monthly_agg)

# Print only the requested totals
print("Daily totals — MAE: {:.3f}, MAPE: {:.2f}%, WAPE: {:.2f}%".format(daily_mae, daily_mape, daily_wape))
print("Monthly totals — MAE: {:.3f}, MAPE: {:.2f}%, WAPE: {:.2f}%".format(monthly_mae, monthly_mape, monthly_wape))


Daily totals — MAE: 990.589, MAPE: 36.44%, WAPE: 33.83%
Monthly totals — MAE: 10569.832, MAPE: 13.64%, WAPE: 12.82%


In [7]:
# Save the trained model (joblib dump of the fitted estimator only).
# Only `model` is written: the SimpleImputer fitted next to it in the training
# cell is not saved by this cell, so anything that loads the file has to
# supply already-imputed features.
try:
    # Probe for the estimator on its own, so that only a genuinely missing
    # `model` produces the "run the training cell" hint. A NameError raised
    # while importing or saving is reported as a save failure instead.
    fitted_model = model
except NameError:
    print("No trained 'model' found - run the training cell first.")
else:
    try:
        from joblib import dump
        from src.config import MODELS_DIR

        # Create the output directory on first use.
        MODELS_DIR.mkdir(parents=True, exist_ok=True)
        model_path = MODELS_DIR / "linear_regression.joblib"
        dump(fitted_model, model_path)
        print(f"Saved model to {model_path}")
    except Exception as e:
        # Best effort: report the failure without stopping the notebook.
        print(f"Model save failed: {e}")

Saved model to C:\Users\P A V I L I O N\Desktop\IML Demand forcasting\Demand_Forcasting\models\linear_regression.joblib
