import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
# Name of the column the models learn to predict.
target = 'sales'

# Columns handed to every model as inputs; each must exist in the CSV below.
feature_cols = [
    'price', 'promotion_flag',
    'lag_7', 'lag_14',
    'rolling_mean_7', 'rolling_mean_30',
    'day_of_week', 'month',
]

# Load the feature table, parse `date` into datetimes and order the rows by
# SKU, then chronologically within each SKU.
df = (
    pd.read_csv("../data/processed/retail_features.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values(['sku', 'date'])
)
# Chronological hold-out: the 80th percentile of the row dates is the cutoff.
# Rows dated on or before it are used for training, later rows for testing,
# so every model is scored only on dates after the ones it was fitted on.
split_date = df['date'].quantile(0.8)

train = df[df['date'] <= split_date]
# Explicit copy: columns are added to `test` later, and assigning into a
# boolean-filtered slice of `df` raises SettingWithCopyWarning.
test = df[df['date'] > split_date].copy()

X_train = train[feature_cols]
y_train = train[target]

X_test = test[feature_cols]
y_test = test[target]

# Naive baseline forecast: reuse the `lag_7` column as the prediction.
test['naive_forecast'] = test['lag_7']
def evaluate(y_true, y_pred):
    """Compute MAE, RMSE and MAPE for a set of forecasts.

    Args:
        y_true: Observed values (1-D array-like, e.g. a pandas Series).
        y_pred: Forecast values, same length and order as ``y_true``.

    Returns:
        A ``(mae, rmse, mape)`` tuple. ``mape`` is expressed in percent and is
        averaged only over observations whose actual value is non-zero; it is
        ``nan`` when every actual value is zero.
    """
    # Compare strictly by position. Subtracting two pandas Series aligns on
    # index labels, which would disagree with the positional sklearn metrics
    # if the two inputs ever carried different indexes.
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    mae = mean_absolute_error(actual, forecast)
    rmse = np.sqrt(mean_squared_error(actual, forecast))

    # A zero actual makes the percentage error undefined (the division yields
    # inf/nan and poisons the mean), so those observations are left out.
    nonzero = actual != 0
    if nonzero.any():
        pct_error = np.abs(
            (actual[nonzero] - forecast[nonzero]) / actual[nonzero]
        )
        mape = pct_error.mean() * 100
    else:
        mape = float('nan')
    return mae, rmse, mape

# Score the naive forecast first; it is the reference the models must beat.
baseline_metrics = evaluate(y_test, test['naive_forecast'])
baseline_metrics

# --- Build the three candidate regressors -------------------------------
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42, n_estimators=200)
xgb = XGBRegressor(
    random_state=42,
    max_depth=6,
    learning_rate=0.05,
    n_estimators=300,
)

# --- Fit each one on the training window --------------------------------
for estimator in (lr, rf, xgb):
    estimator.fit(X_train, y_train)

# --- Predict on the hold-out window -------------------------------------
lr_pred = lr.predict(X_test)
rf_pred = rf.predict(X_test)
xgb_pred = xgb.predict(X_test)

# --- Score every model with the same metric function --------------------
lr_metrics, rf_metrics, xgb_metrics = (
    evaluate(y_test, predicted) for predicted in (lr_pred, rf_pred, xgb_pred)
)
# One row per model, in the same order as the labels below. Each metrics
# tuple is laid out as (MAE, RMSE, MAPE), as returned by `evaluate`.
model_names = ['Baseline', 'Linear Regression', 'Random Forest', 'XGBoost']
metric_rows = [baseline_metrics, lr_metrics, rf_metrics, xgb_metrics]

results = pd.DataFrame({
    'Model': model_names,
    'MAE': [row[0] for row in metric_rows],
    'RMSE': [row[1] for row in metric_rows],
    'MAPE': [row[2] for row in metric_rows],
})

results
# `test` is ordered by SKU and then by date, so plotting its rows directly
# draws a single line that jumps back in time at every SKU boundary. Summing
# per calendar date gives one chronological series for actuals and forecasts
# (for a single-SKU dataset this leaves the values unchanged).
daily = (
    pd.DataFrame({
        'date': test['date'].to_numpy(),
        'actual': y_test.to_numpy(),
        'forecast': xgb_pred,
    })
    .groupby('date')[['actual', 'forecast']]
    .sum()
)

plt.figure(figsize=(12,6))
plt.plot(daily.index, daily['actual'], label='Actual')
plt.plot(daily.index, daily['forecast'], label='Forecast')
plt.legend()
plt.title("Forecast vs Actual Demand")
# Save before show(): once show() returns, the figure may already have been
# released, and a later savefig() would write an empty image.
plt.savefig("../reports/figures/forecast_vs_actual.png")
plt.show()
# Relative MAPE reduction of the best row in `results` versus the naive
# baseline, in percent (the baseline row itself is included, so the result
# is never negative).
baseline_mape = baseline_metrics[2]
# Series.min() skips NaN; the builtin min() compares element by element and
# returns an order-dependent result when a NaN is present.
best_mape = results['MAPE'].min()

improvement = ((baseline_mape - best_mape) / baseline_mape) * 100
print(f"Improvement vs Baseline: {improvement:.2f}%")
# Attach the XGBoost predictions and their absolute errors to the hold-out
# rows. `assign` returns a new frame, which avoids the SettingWithCopyWarning
# raised by writing columns into a filtered slice of `df`.
test = test.assign(prediction=xgb_pred)
test = test.assign(error=(test[target] - test['prediction']).abs())

# Mean absolute error per SKU, worst-forecast SKUs first.
sku_error = test.groupby('sku')['error'].mean().sort_values(ascending=False)

sku_error.head()
