In [6]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Load data
df = pd.read_csv("grocery_chain_data.csv")
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Aggregate daily sales - Include Aisle
daily = df.groupby(['transaction_date', 'product_name', 'store_name', 'aisle']).agg({
    'quantity': 'sum',
    'unit_price': 'mean'
}).reset_index().sort_values('transaction_date')

# Encode categorical variables
prod_enc = LabelEncoder()
store_enc = LabelEncoder()
aisle_enc = LabelEncoder()

daily['product_enc'] = prod_enc.fit_transform(daily['product_name'])
daily['store_enc'] = store_enc.fit_transform(daily['store_name'])
daily['aisle_enc'] = aisle_enc.fit_transform(daily['aisle'])

# Lag and Rolling Features
for lag in [1, 7, 30]:
    daily[f'lag_{lag}'] = daily.groupby(['product_name', 'store_name'])['quantity'].shift(lag)

for window in [7, 30]:
    daily[f'roll_mean_{window}'] = daily.groupby(['product_name', 'store_name'])['quantity'].shift(1).rolling(window).mean()

daily.fillna(0, inplace=True)

# Date features
daily['day_of_week'] = daily['transaction_date'].dt.weekday
daily['month'] = daily['transaction_date'].dt.month
daily['day'] = daily['transaction_date'].dt.day

# Features selection
feature_cols = ['product_enc', 'store_enc', 'aisle_enc', 'unit_price', 'day_of_week', 'month', 'day', 
                'lag_1', 'lag_7', 'lag_30', 'roll_mean_7', 'roll_mean_30']
X = daily[feature_cols]
y = daily['quantity']

# Split (Time-series split: no shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Train XGBoost with improved parameters
model = XGBRegressor(
    n_estimators=500, 
    max_depth=8, 
    learning_rate=0.05, 
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    random_state=42
)
model.fit(X_train, y_train)

# Save
artifacts = {
    'model': model,
    'prod_enc': prod_enc,
    'store_enc': store_enc,
    'aisle_enc': aisle_enc,
    'feature_cols': feature_cols
}
with open('sales_model_xgb.pkl', 'wb') as f:
    pickle.dump(artifacts, f)

print(f"Improved MAE: {mean_absolute_error(y_test, model.predict(X_test)):.4f}")

Improved MAE: 1.2612
