# Stockout Risk Prediction

This notebook builds a model to predict whether a SKU at a given venue
will stock out on the following day, using historical sales, availability,
and promotion data.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb


In [None]:
# Load the raw export, parse `date` into datetimes, and order rows by venue,
# SKU and date. The per-series shifts used later in the notebook depend on
# this row order.
df = (
    pd.read_csv("grocery_sales_autumn_2025.csv")
    .assign(date=lambda raw: pd.to_datetime(raw["date"]))
    .sort_values(["venue_id", "sku_id", "date"])
)

df.head()


In [None]:
# Label every row with the stockout flag of the *next row* of the same
# (venue, SKU) series; the last row of each series gets NaN.
# NOTE(review): "next row" equals "tomorrow" only if each series has exactly
# one row per calendar day with no gaps - confirm against the data.
next_row_flag = df.groupby(["venue_id", "sku_id"])["stockout_flag"].shift(periods=-1)
df["target_stockout"] = next_row_flag


In [None]:
# Rows without a label (NaN target) cannot be trained or scored on, so drop
# them, then store the label as an integer.
df = (
    df.dropna(subset=["target_stockout"])
    .astype({"target_stockout": int})
)

# Eyeball that the label is the flag shifted up by one row within a series.
df.loc[:, ["date", "venue_id", "sku_id", "stockout_flag", "target_stockout"]].head(10)

In [None]:
# Class balance of the label: fraction of rows per target value.
df["target_stockout"].value_counts(normalize=True)


In [None]:
# Ratio of in-stock minutes to operating minutes for the row.
# A zero `operating_minutes` would produce inf (or NaN for 0/0), which then
# distorts group means and feeds infinities to the model. Mask zero
# denominators to NaN so those rows are treated as "ratio unknown" instead.
df["in_stock_ratio"] = (
    df["in_stock_minutes"] / df["operating_minutes"].replace(0, np.nan))


In [None]:
# Spot-check the ratio next to the two columns it is computed from.
df.loc[:, ["in_stock_minutes", "operating_minutes", "in_stock_ratio"]].head(10)


In [None]:
# Average in-stock ratio per label value, as a quick signal check.
df["in_stock_ratio"].groupby(df["target_stockout"]).mean()

In [None]:
# One group per (venue, SKU) series; reused by later cells.
grp = df.groupby(["venue_id", "sku_id"])

# Shift by one row inside each series first, so every feature below only sees
# rows strictly before the current one.
prev_sales = grp["units_sold"].shift(1)

# The 7-row window runs over the whole shifted column, not per series. That is
# still safe: shift(1) leaves a NaN at the start of every series, and with the
# default min_periods (equal to the window) any window containing a NaN is
# NaN, so windows straddling two series never yield a value. Setting a
# smaller min_periods here would break that guarantee.
prev_sales_7d = prev_sales.rolling(7)

df["sales_lag_1"] = prev_sales
df["sales_7d_mean"] = prev_sales_7d.mean()
df["sales_7d_max"] = prev_sales_7d.max()


In [None]:
# Drop rows where any sales lag feature is NaN (not enough preceding rows in
# the series). `.copy()` yields an independent frame for the column
# assignments that follow.
df = df.dropna(subset=["sales_lag_1", "sales_7d_mean", "sales_7d_max"]).copy()


In [None]:
# Spot-check the sales lag features next to the raw units sold.
df.loc[:, ["units_sold", "sales_lag_1", "sales_7d_mean", "sales_7d_max"]].head(10)


In [None]:
# Average trailing 7-row sales mean per label value.
df["sales_7d_mean"].groupby(df["target_stockout"]).mean()


In [None]:
# `grp` is the (venue, SKU) grouping built earlier in the notebook, i.e. on the
# frame as it was *before* rows with NaN sales lags were dropped. The shifted
# series therefore carries that older index; pandas aligns it onto the current
# `df` by index label on assignment.
prev_stockout = grp["stockout_flag"].shift(1)

df["stockout_lag_1"] = prev_stockout
# Count of stockout flags over the 7 rows preceding the current one.
df["stockout_7d_sum"] = prev_stockout.rolling(7).sum()


In [None]:
# Drop rows where either stockout history feature is NaN; `.copy()` yields an
# independent frame for the column assignments that follow.
df = df.dropna(subset=["stockout_lag_1", "stockout_7d_sum"]).copy()


In [None]:
# Average trailing 7-row stockout count per label value.
df["stockout_7d_sum"].groupby(df["target_stockout"]).mean()


In [None]:
# Calendar features: pandas numbers weekdays Monday=0 .. Sunday=6, so values
# 5 and 6 (Saturday, Sunday) mark the weekend.
df["day_of_week"] = df["date"].dt.dayofweek
df["is_weekend"] = df["day_of_week"].ge(5).astype(int)



In [None]:
# Replace missing promo depth with 0.
# NOTE(review): presumably a missing depth means "no promotion" - confirm.
df["promo_depth"]= df["promo_depth"].fillna(0)


In [None]:
# Identifier columns that get integer-encoded before modelling.
categorical_cols = ["phl1_id", "phl2_id", "phl3_id", "venue_id", "country_id"]

# Replace each identifier with its integer category code (pandas assigns -1 to
# missing values).
# NOTE(review): after this step the columns are plain integers, so the model
# will treat them as ordinary numeric features unless they are declared
# categorical elsewhere - confirm that is intended.
for col in categorical_cols:
    as_category = df[col].astype("category")
    df[col] = as_category.cat.codes


In [None]:
# Show the dtypes of the encoded identifier columns.
df[categorical_cols].dtypes


In [None]:
# Time-based holdout: the cut point is the 0.8 quantile of the `date` column
# taken over rows. Rows dated before it train the model; rows on or after it
# are held out for validation.
split_date = df["date"].quantile(0.8)

train = df.loc[df["date"].lt(split_date)]
valid = df.loc[df["date"].ge(split_date)]


In [None]:
# Model input columns, in the order they are passed to the classifier.
features = [
    # Same-row availability.
    "in_stock_ratio",
    # Sales history from preceding rows of the series.
    "sales_lag_1",
    "sales_7d_mean",
    "sales_7d_max",
    # Stockout history from preceding rows of the series.
    "stockout_lag_1",
    "stockout_7d_sum",
    # Promotion and price columns.
    "promo_flag",
    "promo_depth",
    "price",
    # Calendar features derived from `date`.
    "day_of_week",
    "is_weekend",
    # Integer-encoded identifier columns.
    "phl1_id",
    "phl2_id",
    "phl3_id",
    "venue_id",
    "country_id",
]



In [None]:
# Feature matrix and label vector for each side of the time split.
X_train, y_train = train[features], train["target_stockout"]
X_valid, y_valid = valid[features], valid["target_stockout"]

# (rows, columns) of both matrices as a sanity check.
X_train.shape, X_valid.shape


In [None]:


# Gradient-boosted binary classifier. `class_weight="balanced"` reweights the
# classes to offset label imbalance, and the fixed seed keeps runs repeatable.
# `fit` returns the fitted estimator, so chaining it into the assignment both
# trains the model and keeps the cell from echoing the estimator repr.
model = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=300,
    learning_rate=0.05,
    class_weight="balanced",
    random_state=42,
).fit(X_train, y_train)



In [None]:
# Attach the predicted probability of the positive class (column 1 of
# `predict_proba`) to a fresh copy of the validation frame.
valid = valid.assign(pred_proba=model.predict_proba(X_valid)[:, 1])

valid.loc[:, ["pred_proba", "target_stockout"]].head()



In [None]:
def recall_at_k(y_true, y_score, k=0.05):
    """Return the share of positives captured in the top-``k`` scored rows.

    Args:
        y_true: pandas Series of 0/1 labels.
        y_score: pandas Series of scores (higher = more likely positive). It
            must share index labels with ``y_true``, because the top rows are
            looked up in ``y_true`` by label.
        k: Fraction of rows to keep, e.g. ``0.05`` for the top 5%. The row
            count is ``int(len(y_score) * k)``, i.e. rounded down.

    Returns:
        Positives among the selected rows divided by all positives in
        ``y_true``; ``nan`` when ``y_true`` contains no positives.
    """
    total_positives = y_true.sum()
    if total_positives == 0:
        # Recall is undefined without positives; return NaN explicitly
        # instead of relying on a 0/0 division and its RuntimeWarning.
        return float("nan")

    cutoff = int(len(y_score) * k)
    # A stable sort keeps tied scores in their original order, so the rows
    # chosen at the cutoff are deterministic even for coarse scores.
    ranked = y_score.sort_values(ascending=False, kind="stable")
    top_k_idx = ranked.index[:cutoff]
    return y_true.loc[top_k_idx].sum() / total_positives

# Recall of the model's ranking within the top 5% and top 10% of validation
# rows.
recall_5pct, recall_10pct = (
    recall_at_k(y_valid, valid["pred_proba"], k=share) for share in (0.05, 0.10)
)

recall_5pct, recall_10pct



In [None]:
# Heuristic baseline: rank validation rows by their trailing 7-row stockout
# count and measure the same recall cut-offs.
# NOTE(review): a count over 7 rows has at most 8 distinct values, so many rows
# tie at the cutoff; which tied rows land in the top-k depends on the sort used
# inside `recall_at_k`.
baseline_score = valid["stockout_7d_sum"]

baseline_recall_5pct, baseline_recall_10pct = (
    recall_at_k(y_valid, baseline_score, k=share) for share in (0.05, 0.10)
)

baseline_recall_5pct, baseline_recall_10pct

