# M5 Walmart Sales Forecasting with Prophet

Complete EDA and Prophet training notebook for demand forecasting

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Prophet and metrics
from prophet import Prophet
from sklearn.metrics import mean_absolute_percentage_error
import pickle
import json

# Plot defaults applied to every figure created later in the notebook.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

print('All libraries imported successfully')

## Load Dataset from Local Path

In [None]:
# Locations of the three M5 CSV files; the directory is resolved relative to
# wherever this notebook is run from.
DATASET_PATH = Path('../datasets/m5-forecasting-accuracy')

calendar_file = DATASET_PATH.joinpath('calendar.csv')
sales_file = DATASET_PATH.joinpath('sales_train_validation.csv')
prices_file = DATASET_PATH.joinpath('sell_prices.csv')

# Report (without failing) whether each expected file is present.
print(f'Dataset path: {DATASET_PATH.absolute()}')
print(f'Calendar file exists: {calendar_file.exists()}')
print(f'Sales file exists: {sales_file.exists()}')
print(f'Prices file exists: {prices_file.exists()}')

In [None]:
# Read the calendar CSV into `calendar` and show its shape, first rows and schema.
print('Loading calendar data...')
calendar = pd.read_csv(calendar_file)
print(f'Calendar shape: {calendar.shape}')
print(calendar.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line.
calendar.info()

In [None]:
# Read the sales CSV into `sales` and show its shape, first rows and schema.
print('Loading sales data...')
sales = pd.read_csv(sales_file)
print(f'Sales shape: {sales.shape}')
print(sales.head())
# DataFrame.info() prints directly and returns None; print(sales.info())
# would add a spurious "None" to the output.
sales.info()

In [None]:
# Read the price CSV into `sell_prices` and show its shape, first rows and schema.
print('Loading price data...')
sell_prices = pd.read_csv(prices_file)
print(f'Prices shape: {sell_prices.shape}')
print(sell_prices.head())
# DataFrame.info() prints directly and returns None; print(sell_prices.info())
# would add a spurious "None" to the output.
sell_prices.info()

## Data Transformation & EDA

In [None]:
# Reshape wide -> long: the six identifier columns are kept as-is and every
# other column becomes a (d, sales) row (presumably the per-day columns —
# verify against the CSV header).
print('Converting sales data from wide to long format...')
id_columns = ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id']
sales = pd.melt(sales, id_vars=id_columns, var_name='d', value_name='sales')

print(f'After melt shape: {sales.shape}')
print(sales.head())

In [None]:
# Merge with calendar to get dates.
# The calendar's `date` column is parsed to datetime here, on the calendar
# table rather than on the melted sales rows, so every later cell (min/max,
# sorting, plotting) works with real dates instead of strings. `calendar`
# itself is left untouched because assign() returns a new frame.
print('Merging with calendar data...')
calendar_with_dates = calendar.assign(date=pd.to_datetime(calendar['date']))
sales = sales.merge(calendar_with_dates, on='d', how='left')

print(f'After merge shape: {sales.shape}')
print(sales.head())
print(sales.columns.tolist())

In [None]:
# Overview of the merged frame: distinct stores, items, categories, states
# and the covered date span.
section_rule = '=' * 60
print(section_rule)
print('DATA STRUCTURE ANALYSIS')
print(section_rule)

store_col = sales['store_id']
print(f'\nUnique stores: {store_col.nunique()}')
print(f'Store list: {sorted(store_col.unique())}')

print(f'\nUnique items: {sales["item_id"].nunique()}')

category_col = sales['cat_id']
print(f'\nUnique categories: {category_col.nunique()}')
print(f'Categories: {sorted(category_col.unique())}')

state_col = sales['state_id']
print(f'\nUnique states: {state_col.nunique()}')
print(f'States: {sorted(state_col.unique())}')

date_col = sales['date']
print(f'\nDate range: {date_col.min()} to {date_col.max()}')

In [None]:
# Number of distinct item_ids observed in each store.
products_per_store = sales['item_id'].groupby(sales['store_id']).nunique()
print('\nProducts per store:')
print(products_per_store)

## Focus on Store 1 (Single Store Analysis)

In [None]:
# Restrict the analysis to one store; .copy() gives an independent frame so
# later column assignments do not touch `sales`.
STORE_ID = 'CA_1'
is_selected_store = sales['store_id'] == STORE_ID
store_sales = sales.loc[is_selected_store].copy()

print(f'Store {STORE_ID} Analysis:')
print('=' * 60)
print(f'Shape: {store_sales.shape}')
print(f'Products: {store_sales["item_id"].nunique()}')
print(f'Categories: {store_sales["cat_id"].nunique()}')
store_dates = store_sales['date']
print(f'Date range: {store_dates.min()} to {store_dates.max()}')

In [None]:
# Descriptive statistics for the selected store: distribution summary, share
# of zero-sale rows, and the ten items with the highest summed sales.
print('\n' + '=' * 60)
print('SALES STATISTICS - Store CA_1')
print('=' * 60)

unit_sales = store_sales['sales']
print('\nOverall sales statistics:')
print(unit_sales.describe())

total_records = len(store_sales)
zero_sales = unit_sales.eq(0).sum()
zero_pct = zero_sales / total_records * 100
print(f'\nZero sales records: {zero_sales} ({zero_pct:.2f}%)')

print('\nTop 10 products by total sales:')
sales_by_item = store_sales.groupby('item_id')['sales'].sum()
top_products = sales_by_item.nlargest(10)
print(top_products)

In [None]:
# Two stacked panels: bar chart of the top-10 items, then a log-scaled
# histogram of the per-row sales values.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(14, 10))
top_ax, hist_ax = axes[0], axes[1]

# Panel 1: top 10 products
top_products.plot(kind='bar', ax=top_ax, color='steelblue')
top_ax.set_title('Top 10 Products by Total Sales (Store CA_1)', fontsize=14, fontweight='bold')
top_ax.set_xlabel('Item ID')
top_ax.set_ylabel('Total Sales')
top_ax.grid(axis='y', alpha=0.3)

# Panel 2: sales distribution (log y-axis so the long tail stays visible)
store_sales['sales'].hist(bins=50, ax=hist_ax, color='steelblue', edgecolor='black')
hist_ax.set_title('Distribution of Daily Sales', fontsize=14, fontweight='bold')
hist_ax.set_xlabel('Daily Sales')
hist_ax.set_ylabel('Frequency')
hist_ax.set_yscale('log')

plt.tight_layout()
plt.show()

In [None]:
# Time series for top 5 products (ranked by summed sales in this store)
top_5_items = store_sales.groupby('item_id')['sales'].sum().nlargest(5).index
print(f'Top 5 items: {list(top_5_items)}')

fig, axes = plt.subplots(5, 1, figsize=(14, 12))

for idx, item_id in enumerate(top_5_items):
    item_data = store_sales[store_sales['item_id'] == item_id].copy()
    # Parse dates before sorting/plotting. At this point in the notebook the
    # column may still hold strings, which matplotlib would plot as one
    # category per row (unreadable ticks, slow rendering). Parsing is a
    # no-op if the column is already datetime.
    item_data['date'] = pd.to_datetime(item_data['date'])
    item_data = item_data.sort_values('date')
    axes[idx].plot(item_data['date'], item_data['sales'], linewidth=1.5, color='steelblue')
    axes[idx].fill_between(item_data['date'], item_data['sales'], alpha=0.3, color='steelblue')
    axes[idx].set_title(f'Item {item_id} - Daily Sales', fontweight='bold')
    axes[idx].set_ylabel('Sales')
    axes[idx].grid(alpha=0.3)

# Only the bottom panel carries the shared x-axis label.
axes[-1].set_xlabel('Date')
plt.tight_layout()
plt.show()

In [None]:
# Seasonality: mean sales per weekday and per calendar month.
# The date column is converted in place here; later cells rely on
# store_sales['date'] being datetime.
store_sales['date'] = pd.to_datetime(store_sales['date'])
store_sales['day_of_week'] = store_sales['date'].dt.day_name()
store_sales['month'] = store_sales['date'].dt.month

# reindex() puts the weekdays in calendar order instead of alphabetical order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_sales = store_sales.groupby('day_of_week')['sales'].mean().reindex(weekday_order)
month_sales = store_sales.groupby('month')['sales'].mean()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (series, bar colour, title, x-axis label) for the left and right panel.
seasonality_panels = [
    (dow_sales, 'coral', 'Average Sales by Day of Week', 'Day of Week'),
    (month_sales, 'lightgreen', 'Average Sales by Month', 'Month'),
]
for panel_ax, (panel_series, panel_color, panel_title, panel_xlabel) in zip(axes, seasonality_panels):
    panel_series.plot(kind='bar', ax=panel_ax, color=panel_color)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel(panel_xlabel)
    panel_ax.set_ylabel('Average Daily Sales')
    panel_ax.grid(axis='y', alpha=0.3)
    plt.setp(panel_ax.xaxis.get_majorticklabels(), rotation=45)

plt.tight_layout()
plt.show()

## Prepare Data for Prophet

In [None]:
# Train on the single best-selling item (first entry of top_5_items).
TOP_ITEM = top_5_items[0]
print(f'Selected item for training: {TOP_ITEM}')

# Build the two-column frame Prophet is fitted on: `ds` (date) and `y` (sales),
# ordered by date with a fresh 0..n-1 index.
item_data = store_sales.loc[store_sales['item_id'] == TOP_ITEM].sort_values('date')

prophet_df = (
    item_data[['date', 'sales']]
    .rename(columns={'date': 'ds', 'sales': 'y'})
    .reset_index(drop=True)
)
prophet_df['ds'] = pd.to_datetime(prophet_df['ds'])

print(f'\nDataset shape: {prophet_df.shape}')
print(f'Date range: {prophet_df["ds"].min()} to {prophet_df["ds"].max()}')
print(f'\nFirst 10 rows:')
print(prophet_df.head(10))
print(f'\nSales statistics:')
print(prophet_df['y'].describe())

In [None]:
# Handle missing values: carry the last known value forward, then use 0 for
# any gap at the very start. Series.ffill() replaces the deprecated
# fillna(method='ffill') spelling, which newer pandas versions reject.
prophet_df['y'] = prophet_df['y'].ffill().fillna(0)

# Build Prophet's holidays frame (columns `ds`, `holiday`) from the calendar
# rows that carry a primary event name.
# NOTE(review): only `event_name_1` is used; if the calendar also has a
# secondary event column, those events are not passed to the model — confirm
# this is intended.
holidays_df = calendar[calendar['event_name_1'].notna()].copy()
holidays_df = holidays_df[['date', 'event_name_1']].rename(
    columns={'date': 'ds', 'event_name_1': 'holiday'}
)
holidays_df['ds'] = pd.to_datetime(holidays_df['ds'])

print(f'Holidays included: {len(holidays_df)}')
print(holidays_df.head(10))

## Train Prophet Model

In [None]:
# Fit one Prophet model on prophet_df with the calendar events as holidays.
print('\n' + '=' * 60)
print('TRAINING PROPHET MODEL')
print('=' * 60)

# Keyword settings collected in one place; they mirror the summary printed below.
prophet_settings = dict(
    yearly_seasonality=True,
    weekly_seasonality=True,
    daily_seasonality=False,
    seasonality_mode='additive',
    interval_width=0.95,
)
model = Prophet(holidays=holidays_df, **prophet_settings)

print('Model configuration:')
print('  - Yearly seasonality: True')
print('  - Weekly seasonality: True')
print('  - Seasonality mode: Additive')
print('  - Confidence interval: 95%')

n_observations = len(prophet_df)
print(f'\nTraining on {n_observations} observations...')
model.fit(prophet_df)
print('Model trained successfully!')

In [None]:
# Extend the training dates by FORECAST_DAYS periods and predict over the
# whole span (history plus future).
FORECAST_DAYS = 30
future = model.make_future_dataframe(periods=FORECAST_DAYS)

last_training_date = prophet_df['ds'].max()
last_forecast_date = future['ds'].max()
print(f'\nForecast period: {FORECAST_DAYS} days')
print(f'Last training date: {last_training_date}')
print(f'Last forecast date: {last_forecast_date}')

print('\nGenerating forecast...')
forecast = model.predict(future)
print('Forecast generated!')

# Point forecast plus the lower/upper interval bounds.
forecast_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
print(f'\nForecast results (last 35 rows):')
print(forecast[forecast_columns].tail(35))

In [None]:
# Visualize forecast: Prophet's built-in plot of history, fitted values and
# the forecast interval.
fig = model.plot(forecast, figsize=(14, 8))
# The horizon in the title comes from FORECAST_DAYS so it cannot drift from
# the value actually used to build `future`.
plt.title(f'Prophet Forecast - {TOP_ITEM} ({FORECAST_DAYS}-Day Forecast with 95% CI)',
          fontsize=14, fontweight='bold')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.tight_layout()
plt.show()

In [None]:
# Components plot: Prophet's breakdown of the forecast into its fitted
# components (which panels appear depends on the model configuration).
fig = model.plot_components(forecast, figsize=(14, 10))
# tight_layout()/show() act on pyplot's current figure — presumably the one
# plot_components() just created.
plt.tight_layout()
plt.show()

## Model Evaluation

In [None]:
# Calculate metrics on the training period. This is an in-sample fit: the
# model was trained on exactly these dates, so the numbers are not a hold-out
# score.
forecast_hist = forecast[forecast['ds'] <= prophet_df['ds'].max()].copy()
eval_df = prophet_df.merge(forecast_hist[['ds', 'yhat']], on='ds', how='left')

# sklearn returns MAPE as a fraction (0.25 means 25%). It is kept as a
# fraction here and rendered with the `%` format spec below; the previous
# `{mape:.2f}%` under-reported the error by a factor of 100.
# NOTE(review): days with zero actual sales make individual percentage errors
# explode, so MAPE can be dominated by them — MAE/RMSE are safer to compare.
mape = mean_absolute_percentage_error(eval_df['y'], eval_df['yhat'])
mae = np.mean(np.abs(eval_df['y'] - eval_df['yhat']))
rmse = np.sqrt(np.mean((eval_df['y'] - eval_df['yhat'])**2))

print('\n' + '=' * 60)
print('MODEL EVALUATION METRICS')
print('=' * 60)
print(f'\nMAPE: {mape:.2%}')
print(f'MAE: {mae:.2f} units')
print(f'RMSE: {rmse:.2f} units')
print(f'Average actual sales: {eval_df["y"].mean():.2f} units')
print(f'Average forecast: {eval_df["yhat"].mean():.2f} units')

In [None]:
# Actual vs fitted values over the final 90 rows of the evaluation frame.
last_90 = eval_df.tail(90)
window_dates = last_90['ds']
window_actual = last_90['y']
window_fitted = last_90['yhat']

fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(window_dates, window_actual, label='Actual', linewidth=2, color='blue')
ax.plot(window_dates, window_fitted, label='Forecast', linewidth=2, color='red', linestyle='--')
# Shade the gap between the two lines to make the error visible.
ax.fill_between(window_dates, window_actual, window_fitted, alpha=0.2, color='gray')
ax.set_title(f'Actual vs Predicted - Last 90 Days ({TOP_ITEM})', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Sales')
ax.legend(fontsize=11)
ax.grid(alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Residuals (actual minus fitted): left panel over time, right panel as a
# histogram; the red dashed line marks zero error in both.
residuals = eval_df['y'] - eval_df['yhat']

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
time_ax, hist_ax = axes[0], axes[1]

time_ax.plot(eval_df['ds'], residuals, linewidth=1, alpha=0.7, color='darkblue')
time_ax.axhline(y=0, color='red', linestyle='--', linewidth=2)
time_ax.set_title('Residuals Over Time', fontsize=12, fontweight='bold')
time_ax.set_xlabel('Date')
time_ax.set_ylabel('Residual')
time_ax.grid(alpha=0.3)

hist_ax.hist(residuals, bins=50, edgecolor='black', color='steelblue')
hist_ax.axvline(x=0, color='red', linestyle='--', linewidth=2)
hist_ax.set_title('Distribution of Residuals', fontsize=12, fontweight='bold')
hist_ax.set_xlabel('Residual')
hist_ax.set_ylabel('Frequency')
hist_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

## Future Forecast Summary

In [None]:
# Future forecast: the rows of `forecast` that lie after the last training date.
prediction_columns = ['yhat', 'yhat_lower', 'yhat_upper']
is_future = forecast['ds'] > prophet_df['ds'].max()
future_forecast = forecast.loc[is_future, ['ds'] + prediction_columns].copy()
# Sales cannot be negative. Clip the point forecast AND both interval bounds;
# clipping only `yhat` could leave a prediction of 0 sitting above a negative
# lower bound (or above a negative upper bound).
future_forecast[prediction_columns] = future_forecast[prediction_columns].clip(lower=0)
future_forecast.columns = ['Date', 'Predicted_Sales', 'Lower_Bound_95', 'Upper_Bound_95']

print('\n' + '=' * 60)
print(f'{FORECAST_DAYS}-DAY FORECAST - {TOP_ITEM}')
print('=' * 60)
print(future_forecast.to_string(index=False))

# Horizon in the labels follows FORECAST_DAYS rather than a hard-coded 30.
print(f'\nForecast Summary:')
print(f'  Total predicted ({FORECAST_DAYS} days): {future_forecast["Predicted_Sales"].sum():.0f} units')
print(f'  Average daily: {future_forecast["Predicted_Sales"].mean():.2f} units')
print(f'  Peak day: {future_forecast["Predicted_Sales"].max():.2f} units')
print(f'  Lowest day: {future_forecast["Predicted_Sales"].min():.2f} units')

## Train Multiple Products

In [None]:
def train_prophet_for_product(item_id, store_data, holidays, forecast_days=30):
    """Fit a Prophet model for one product and forecast ``forecast_days`` ahead.

    Args:
        item_id: Value matched against ``store_data['item_id']``.
        store_data: DataFrame with at least ``item_id``, ``date`` and
            ``sales`` columns.
        holidays: Passed unchanged to ``Prophet(holidays=...)``.
        forecast_days: Number of future periods added by
            ``make_future_dataframe``.

    Returns:
        ``None`` when the item has fewer than 30 rows or when fitting or
        predicting raises. Otherwise a dict with keys ``item_id``, ``model``,
        ``forecast``, ``mape`` and ``observations``. ``mape`` is the
        in-sample value returned by sklearn, i.e. a fraction (0.25 means
        25%); multiply by 100 before printing it as a percentage.
    """
    prophet_input = store_data.loc[
        store_data['item_id'] == item_id, ['date', 'sales']
    ].copy()
    prophet_input.columns = ['ds', 'y']

    # Too little history to fit yearly/weekly seasonality; skip the item.
    if len(prophet_input) < 30:
        return None

    # Parse first, then sort, so ordering is chronological whatever string
    # format the incoming dates use.
    prophet_input['ds'] = pd.to_datetime(prophet_input['ds'])
    prophet_input = prophet_input.sort_values('ds')
    # Series.ffill() replaces the deprecated fillna(method='ffill'); a
    # leading gap that has nothing to carry forward becomes 0.
    prophet_input['y'] = prophet_input['y'].ffill().fillna(0)

    # Deliberately broad: one item failing must not stop a batch run over
    # many items. The error is reported and the item is skipped.
    try:
        model = Prophet(
            yearly_seasonality=True,
            weekly_seasonality=True,
            daily_seasonality=False,
            seasonality_mode='additive',
            interval_width=0.95,
            holidays=holidays
        )
        model.fit(prophet_input)

        future = model.make_future_dataframe(periods=forecast_days)
        forecast = model.predict(future)

        # Score only the historical part of the forecast against the
        # training values (compared positionally, both in date order).
        forecast_hist = forecast[forecast['ds'] <= prophet_input['ds'].max()].copy()
        mape = mean_absolute_percentage_error(prophet_input['y'], forecast_hist['yhat'])

        return {
            'item_id': item_id,
            'model': model,
            'forecast': forecast,
            'mape': mape,
            'observations': len(prophet_input)
        }
    except Exception as e:
        print(f'Error for {item_id}: {str(e)}')
        return None

print('Function defined: train_prophet_for_product()')

In [None]:
# Train one model per top-5 item and keep the successful results by item id.
print('\n' + '=' * 60)
print('TRAINING TOP 5 PRODUCTS')
print('=' * 60)

models = {}

for item_id in top_5_items:
    print(f'Training: {item_id}...', end=' ')
    # Same horizon as the single-item forecast above (FORECAST_DAYS).
    result = train_prophet_for_product(item_id, store_sales, holidays_df, forecast_days=FORECAST_DAYS)

    if result is None:
        print('FAILED')
        continue

    models[item_id] = result
    # result['mape'] is sklearn's fraction; the `%` format spec multiplies by
    # 100, so the printed value really is a percentage.
    print(f'MAPE={result["mape"]:.2%}')

print(f'\nSuccessfully trained {len(models)} models')

In [None]:
# Summary table: one row per successfully trained model.
print('\n' + '=' * 60)
print('TRAINED MODELS SUMMARY')
print('=' * 60)

# The stored MAPE is a fraction (sklearn convention); scale by 100 so the
# number matches the "MAPE (%)" column header.
summary = [
    {
        'Item ID': item_id,
        'MAPE (%)': f"{result['mape'] * 100:.2f}",
        'Observations': result['observations'],
        'Status': 'Ready',
    }
    for item_id, result in models.items()
]

summary_df = pd.DataFrame(summary)
print(summary_df.to_string(index=False))

# np.mean([]) would give nan (with a warning), so handle "nothing trained"
# explicitly.
if models:
    avg_mape = np.mean([r['mape'] for r in models.values()])
    print(f'\nAverage MAPE: {avg_mape:.2%}')
else:
    avg_mape = float('nan')
    print('\nAverage MAPE: n/a (no models were trained)')

## Save Models

In [None]:
# Create models directory. parents=True means a missing intermediate
# directory does not abort the save; exist_ok=True makes re-runs harmless.
MODELS_DIR = Path('../models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print('\n' + '=' * 60)
print('SAVING MODELS')
print('=' * 60)

# One pickle per trained item, named prophet_<store>_<item>.pkl.
# NOTE(review): Prophet's documentation recommends its own JSON serialization
# (prophet.serialize) over pickle; the format is kept here because the
# consumer of these files expects .pkl — confirm before switching.
for item_id, result in models.items():
    model_path = MODELS_DIR / f'prophet_{STORE_ID}_{item_id}.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(result['model'], f)
    print(f'Saved: {model_path.name}')

# Save metadata describing what was trained and on which data.
metadata = {
    'store_id': STORE_ID,
    'trained_items': list(models.keys()),
    'model_count': len(models),
    'training_date': datetime.now().isoformat(),
    'dataset_info': {
        'total_records': len(store_sales),
        'total_items': store_sales['item_id'].nunique(),
        'date_range': f"{store_sales['date'].min()} to {store_sales['date'].max()}"
    }
}

metadata_path = MODELS_DIR / f'metadata_{STORE_ID}.json'
with open(metadata_path, 'w') as f:
    # default=str stringifies values json cannot encode natively.
    json.dump(metadata, f, indent=2, default=str)

print(f'Saved: {metadata_path.name}')
print('\nAll models saved successfully!')

In [None]:
# Verify by loading a saved model back from disk.
print('\nVerifying saved model...')

if not models:
    # Nothing was trained, so there is no file to check; the previous
    # list(models.keys())[0] raised IndexError here.
    print('No trained models to verify.')
else:
    test_item = next(iter(models))
    test_path = MODELS_DIR / f'prophet_{STORE_ID}_{test_item}.pkl'

    # pickle.load executes code embedded in the file: only load pickles this
    # notebook (or another trusted source) wrote.
    with open(test_path, 'rb') as f:
        loaded = pickle.load(f)

    print(f'Successfully loaded: {test_item}')
    print(f'Model type: {type(loaded)}')
    print('Verification complete!')

## Summary

Completed:
1. Loaded M5 Walmart dataset from local path
2. EDA: 30K+ item-store series (3K+ distinct products), 1913 days, 10 stores
3. Focused on Store CA_1 (analysis of 3K+ products)
4. Trained Prophet model on top product with 30-day forecast
5. Evaluated the in-sample fit with MAPE, MAE, and RMSE metrics
6. Scaled to train 5 products
7. Saved models and metadata for production

Next Step: Use saved models in Django seed_demo.py