# Next-Month Sales Regression (per-product monthly forecast)
Predict next-month sales for each product using monthly aggregation, lag features, and Ridge regression.

In [1]:
# Setup cell
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

load_dotenv()
DB_URL = os.getenv("DB_URL")
if not DB_URL:
    raise RuntimeError("DB_URL missing in .env. Set DB_URL=postgresql+psycopg2://user:pass@host:port/dbname")

engine = create_engine(DB_URL, pool_pre_ping=True)
print("DB engine created.")

DB engine created.


In [2]:
# Pull facts joined to date and product (we only need order_date, product_id, product_key, category, sales)
SQL = """
SELECT
  f.product_id,
  p.product_key,
  p.category,
  d.full_date::date AS order_date,
  f.sales::float AS sales
FROM fact_sales f
JOIN dim_order o   ON f.order_id = o.order_id
JOIN dim_date d    ON o.order_date_id = d.date_id
LEFT JOIN dim_product p ON f.product_id = p.product_id
ORDER BY d.full_date;
"""

df = pd.read_sql(SQL, con=engine, parse_dates=["order_date"])
engine.dispose()
print("Rows loaded from DB:", len(df))
df.head(3)

Rows loaded from DB: 9994


Unnamed: 0,product_id,product_key,category,order_date,sales
0,1027,OFF-PA-10000174,Office Supplies,2014-01-03,16.448
1,988,OFF-LA-10003223,Office Supplies,2014-01-04,11.784
2,805,OFF-BI-10004094,Office Supplies,2014-01-04,3.54


In [3]:
# Aggregate sales to product-month level
df['month'] = df['order_date'].dt.to_period('M').dt.to_timestamp()  # month start date
monthly = (
    df.groupby(['product_id', 'product_key', 'category', 'month'], as_index=False)
      .agg(monthly_sales=('sales', 'sum'))
)

# Sort and ensure completeness per product-month
monthly = monthly.sort_values(['product_id','month']).reset_index(drop=True)
print("Monthly rows:", len(monthly))
monthly.head(6)

Monthly rows: 9294


Unnamed: 0,product_id,product_key,category,month,monthly_sales
0,1,FUR-BO-10000112,Furniture,2017-09-01,825.174
1,2,FUR-BO-10000330,Furniture,2014-11-01,411.332
2,2,FUR-BO-10000330,Furniture,2015-09-01,411.332
3,2,FUR-BO-10000330,Furniture,2017-05-01,241.96
4,3,FUR-BO-10000362,Furniture,2014-05-01,290.666
5,3,FUR-BO-10000362,Furniture,2014-11-01,1025.88


In [4]:
# Create lag features and rolling means per product
monthly = monthly.copy()
monthly['lag_1'] = monthly.groupby('product_id')['monthly_sales'].shift(1)
monthly['lag_2'] = monthly.groupby('product_id')['monthly_sales'].shift(2)
monthly['lag_3'] = monthly.groupby('product_id')['monthly_sales'].shift(3)
monthly['roll3_mean'] = monthly.groupby('product_id')['monthly_sales'].shift(1).rolling(window=3, min_periods=1).mean().reset_index(level=0, drop=True)

# Product-level historical mean (useful feature)
monthly['prod_mean_sales'] = monthly.groupby('product_id')['monthly_sales'].transform(lambda s: s.expanding().mean()).shift(1)

# Month-of-year numeric & categorical for seasonality
monthly['month_of_year'] = monthly['month'].dt.month.astype(int)

# Drop rows where we cannot construct lag_1 (these can't be used to predict next-month)
monthly_feat = monthly.dropna(subset=['lag_1']).copy()

# Build target: next_month_sales = monthly_sales at t+1
monthly_feat['next_month'] = monthly_feat.groupby('product_id')['monthly_sales'].shift(-1)

# We'll only train on rows where next_month is available (so we can evaluate on latest month separately)
data = monthly_feat.dropna(subset=['next_month']).copy()

print("Feature rows with target available:", len(data))
data.head(6)

Feature rows with target available: 5667


Unnamed: 0,product_id,product_key,category,month,monthly_sales,lag_1,lag_2,lag_3,roll3_mean,prod_mean_sales,month_of_year,next_month
2,2,FUR-BO-10000330,Furniture,2015-09-01,411.332,411.332,,,411.332,411.332,9,241.96
5,3,FUR-BO-10000362,Furniture,2014-11-01,1025.88,290.666,,,350.999,290.666,11,341.96
6,3,FUR-BO-10000362,Furniture,2014-12-01,341.96,1025.88,290.666,,658.273,658.273,12,359.058
7,3,FUR-BO-10000362,Furniture,2015-03-01,359.058,341.96,1025.88,290.666,552.835333,552.835333,3,136.784
10,4,FUR-BO-10000468,Furniture,2015-09-01,194.32,155.456,,,257.257,155.456,9,48.58
11,4,FUR-BO-10000468,Furniture,2016-01-01,48.58,194.32,155.456,,174.888,174.888,1,77.728


In [5]:
# We'll hold out the latest calendar month across the whole dataset as the test set.
max_month = data['month'].max()
test_month = max_month  # predict for this month (i.e., we trained on everything earlier)
train = data[data['month'] < test_month].copy()
test  = data[data['month'] == test_month].copy()

# Safety: if test set empty (rare), fallback to last available month with any rows
if test.empty:
    alt_month = data['month'].unique()[-1]
    test = data[data['month'] == alt_month].copy()
    train = data[data['month'] < alt_month].copy()

print("Train rows:", len(train), "Test rows:", len(test))

Train rows: 5583 Test rows: 84


In [6]:
# Features we will use
numeric_feats = ['lag_1','lag_2','lag_3','roll3_mean','prod_mean_sales']
categorical_feats = ['month_of_year','category']   # month (12 levels) and product category (few levels)

# Preprocessing
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0.0)),
    ('scaler', StandardScaler())
])
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='[MISSING]')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preproc = ColumnTransformer([
    ('num', num_pipe, numeric_feats),
    ('cat', cat_pipe, categorical_feats)
], remainder='drop')

# Model: Ridge regression (regularized linear)
model = Pipeline([
    ('preproc', preproc),
    ('ridge', Ridge(alpha=1.0, random_state=42))
])

In [7]:
# Prepare X,y
X_train = train[numeric_feats + categorical_feats]
y_train = train['next_month'].astype(float)
X_test  = test[numeric_feats + categorical_feats]
y_test  = test['next_month'].astype(float)

print("Fitting Ridge regression...")
model.fit(X_train, y_train)
print("Model fitted.")

Fitting Ridge regression...
Model fitted.


In [8]:
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"Test rows: {len(y_test)}")
print(f"MAE: {mae:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R^2: {r2:.4f}")

# Quick comparison: add preds to test frame for inspection
test_out = test.copy()
test_out['pred_next_month'] = y_pred
test_out[['product_id','product_key','month','monthly_sales','next_month','pred_next_month']].head(10)

Test rows: 84
MAE: 106.0488
RMSE: 177.3151
R^2: 0.3276


Unnamed: 0,product_id,product_key,month,monthly_sales,next_month,pred_next_month
123,28,FUR-BO-10003159,2017-11-01,183.968,459.92,311.514462
140,32,FUR-BO-10003441,2017-11-01,205.9992,323.136,268.477585
396,77,FUR-CH-10001854,2017-11-01,701.96,701.96,1136.799962
622,115,FUR-CH-10003956,2017-11-01,283.92,113.568,279.110185
786,138,FUR-FU-10000023,2017-11-01,47.12,70.68,97.157175
859,153,FUR-FU-10000320,2017-11-01,24.048,13.36,83.849537
943,170,FUR-FU-10000820,2017-11-01,50.97,13.592,104.116199
1014,184,FUR-FU-10001473,2017-11-01,27.46,228.5,121.889836
1031,187,FUR-FU-10001488,2017-11-01,339.136,508.704,323.978495
1349,253,FUR-FU-10003268,2017-11-01,119.94,199.9,173.581913


In [9]:
os.makedirs("models", exist_ok=True)
model_path = "models/next_month_sales_ridge.joblib"
joblib.dump(model, model_path)
print("Model saved to", model_path)

Model saved to models/next_month_sales_ridge.joblib


In [10]:
# Example: predict next month's sales for the first few rows in test set using the pipeline
sample_X = test[numeric_feats + categorical_feats].head(5)
sample_preds = model.predict(sample_X)
sample = test[['product_id','product_key','month']].head(5).copy()
sample['pred_next_month'] = sample_preds
sample

Unnamed: 0,product_id,product_key,month,pred_next_month
123,28,FUR-BO-10003159,2017-11-01,311.514462
140,32,FUR-BO-10003441,2017-11-01,268.477585
396,77,FUR-CH-10001854,2017-11-01,1136.799962
622,115,FUR-CH-10003956,2017-11-01,279.110185
786,138,FUR-FU-10000023,2017-11-01,97.157175
