In [None]:
import pandas as pd
import numpy as np

def _daily_category_demand(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean raw order lines and aggregate shipped quantity per (day, category).

    Cleaning steps:
      * OrderDate is parsed with the ``%d-%b-%y`` format; unparseable rows are dropped.
      * Only rows whose Status equals "shipped" (case-insensitive) are kept.
      * OrderItemQuantity is coerced to numeric; non-numeric values count as 0.

    Returns:
      DataFrame with columns OrderDate, CategoryName, demand.

    Raises:
      ValueError: if no shipped rows survive the cleaning.
    """
    data = df.copy()
    data["OrderDate"] = pd.to_datetime(data["OrderDate"], format="%d-%b-%y", errors="coerce")
    data = data.dropna(subset=["OrderDate"])

    # Shipped only
    data = data[data["Status"].astype(str).str.lower() == "shipped"].copy()

    # numeric qty
    data["OrderItemQuantity"] = pd.to_numeric(data["OrderItemQuantity"], errors="coerce").fillna(0)

    if data.empty:
        raise ValueError("No shipped records after cleaning.")

    return (
        data.groupby(["OrderDate", "CategoryName"], as_index=False)["OrderItemQuantity"]
            .sum()
            .rename(columns={"OrderItemQuantity": "demand"})
    )


def build_rolling_classification_dataset(
    df: pd.DataFrame,
    window: int = 28,
    step: int = 7,                 # step=7 gives weekly sliding windows
    horizon_window: int = 28,      # future window length for label
    increase_threshold: float = 0.05,  # 5% threshold for "increase" label
    eps: float = 1e-6
) -> "tuple[pd.DataFrame, pd.Series, pd.DataFrame]":
    """
    Build supervised dataset for demand-direction classification (per Category).

    Every category's daily shipped demand is laid out on one shared calendar
    (first to last shipped OrderDate, missing days filled with 0). A current
    window of `window` days slides forward by `step` days; the `horizon_window`
    days right after it are used only to compute the label.

    Target (binary):
      y=1 if mean_future > mean_current * (1 + increase_threshold)
      y=0 otherwise

    Args:
      df: raw order lines with columns OrderDate, Status, CategoryName,
          OrderItemQuantity.
      window: length in days of the current (feature) window, >= 1.
      step: number of days between consecutive window starts, >= 1.
      horizon_window: length in days of the future (label) window, >= 1.
      increase_threshold: relative increase required for the positive label.
      eps: small constant added to denominators to avoid division by zero.

    Returns:
      X: DataFrame of features (CategoryName, cur_mean, cur_sum, cur_std,
         coverage, volatility_ratio, intra_growth)
      y: Series target labels named "target_up"
      meta: DataFrame with CategoryName, window_start, window_end,
            future_start, future_end (for time-based split)

      When the calendar is shorter than window + horizon_window, all three are
      returned empty but X and meta still carry their column names.

    Raises:
      ValueError: if window, step or horizon_window is smaller than 1, or if no
        shipped records remain after cleaning.
    """
    for arg_name, arg_value in (("window", window), ("step", step), ("horizon_window", horizon_window)):
        if arg_value < 1:
            raise ValueError(f"{arg_name} must be a positive integer, got {arg_value!r}.")

    feature_cols = [
        "CategoryName", "cur_mean", "cur_sum", "cur_std",
        "coverage", "volatility_ratio", "intra_growth",
    ]
    meta_cols = ["CategoryName", "window_start", "window_end", "future_start", "future_end"]

    # daily category demand
    daily = _daily_category_demand(df)

    full_dates = pd.date_range(daily["OrderDate"].min(), daily["OrderDate"].max(), freq="D")
    n_days = len(full_dates)

    # Need current window + future window
    total_needed = window + horizon_window
    half = window // 2

    X_rows = []
    y_rows = []
    meta_rows = []

    for cat in daily["CategoryName"].unique():
        # Demand of this category on the shared calendar; days without orders become 0.
        demand = (
            daily.loc[daily["CategoryName"] == cat]
                 .set_index("OrderDate")["demand"]
                 .reindex(full_dates, fill_value=0)
        )

        # rolling start positions; the range is empty when the calendar is
        # shorter than total_needed, so no separate length check is required.
        for start_i in range(0, n_days - total_needed + 1, step):
            split_i = start_i + window
            cur = demand.iloc[start_i:split_i]
            fut = demand.iloc[split_i:split_i + horizon_window]

            cur_mean = cur.mean()
            fut_mean = fut.mean()

            # label with threshold
            label = 1 if fut_mean > cur_mean * (1 + increase_threshold) else 0

            # features (current window only -- no future statistics, to avoid leakage)
            cur_sum = cur.sum()
            cur_std = cur.std(ddof=0)
            coverage = (cur > 0).mean()
            volatility_ratio = cur_std / (cur_mean + eps)

            # simple trend inside current window: compare first half vs second half
            if half == 0:
                # A one-day window has no first half; report "no trend" instead of NaN.
                intra_growth = 0.0
            else:
                first_half_mean = cur.iloc[:half].mean()
                second_half_mean = cur.iloc[half:].mean()
                intra_growth = (second_half_mean - first_half_mean) / (first_half_mean + eps)

            X_rows.append({
                "CategoryName": cat,
                "cur_mean": float(cur_mean),
                "cur_sum": float(cur_sum),
                "cur_std": float(cur_std),
                "coverage": float(coverage),
                "volatility_ratio": float(volatility_ratio),
                "intra_growth": float(intra_growth),
            })

            y_rows.append(label)

            meta_rows.append({
                "CategoryName": cat,
                "window_start": full_dates[start_i],
                "window_end": full_dates[split_i - 1],
                "future_start": full_dates[split_i],
                "future_end": full_dates[start_i + total_needed - 1],
            })

    # Explicit columns/dtype keep the schema stable even when no window qualified.
    X = pd.DataFrame(X_rows, columns=feature_cols)
    y = pd.Series(y_rows, name="target_up", dtype="int64")
    meta = pd.DataFrame(meta_rows, columns=meta_cols)

    # Remove CategoryName from X if you want pure numeric features
    # (Keep it if you plan one-hot encoding)
    return X, y, meta

In [32]:
# Read the raw order lines and turn them into 28-day windows that advance one week at a time.
df = pd.read_csv("ML-Dataset.csv")
X, y, meta = build_rolling_classification_dataset(df, step=7, window=28)

# intra_growth is a ratio with a near-zero denominator whenever the first half
# of a window has no demand, so cap it to [-5, 5] before modeling.
X = X.assign(intra_growth=X["intra_growth"].clip(lower=-5, upper=5))

In [33]:
# Sanity check: number of windows, share of positive labels, and windows per category.
print("Samples:", X.shape[0])
print("Positive rate:", y.mean())
meta.loc[:, "CategoryName"].value_counts()

Samples: 1100
Positive rate: 0.20272727272727273


CategoryName
Storage         220
CPU             220
Video Card      220
Mother Board    220
RAM             220
Name: count, dtype: int64

In [34]:
# Number of windows per category (same expression as the last line of the previous cell).
meta["CategoryName"].value_counts()

CategoryName
Storage         220
CPU             220
Video Card      220
Mother Board    220
RAM             220
Name: count, dtype: int64

In [25]:
# Class balance: fraction of windows labelled 0 vs. 1 in the target.
y.value_counts(normalize=True)

target_up
0    0.797273
1    0.202727
Name: proportion, dtype: float64

In [35]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric feature columns.
X.describe()

Unnamed: 0,cur_mean,cur_sum,cur_std,coverage,volatility_ratio,intra_growth
count,1100.0,1100.0,1100.0,1100.0,1100.0,1100.0
mean,2.046396,57.299091,7.922098,0.01763,1.229078,0.44662
std,4.394011,123.0323,15.448394,0.033707,2.028034,1.623017
min,0.0,0.0,0.0,0.0,0.0,-1.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,2.178571,61.0,11.320189,0.035714,3.062738,0.0
max,31.071429,870.0,95.288258,0.178571,5.196152,5.0


In [36]:
# Pairwise correlation between the numeric features; select_dtypes leaves out the
# string CategoryName column, which is still part of X at this point.
# Note: cur_sum is cur_mean times the fixed window length, so the two are perfectly correlated.
X.select_dtypes(include=[np.number]).corr()

Unnamed: 0,cur_mean,cur_sum,cur_std,coverage,volatility_ratio,intra_growth
cur_mean,1.0,1.0,0.96888,0.914236,0.607307,0.267345
cur_sum,1.0,1.0,0.96888,0.914236,0.607307,0.267345
cur_std,0.96888,0.96888,1.0,0.865459,0.72631,0.337135
coverage,0.914236,0.914236,0.865459,1.0,0.686334,0.281807
volatility_ratio,0.607307,0.607307,0.72631,0.686334,1.0,0.469563
intra_growth,0.267345,0.267345,0.337135,0.281807,0.469563,1.0


In [37]:
# Keep only the numeric features in X; the category of each row is still available in `meta`.
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
X = X.drop(columns=["CategoryName"], errors="ignore")

In [38]:
# Place window metadata, features and labels side by side (aligned on the row
# index) and write the combined table to disk without the index column.
full_dataset = pd.concat(objs=[meta, X, y], axis="columns")
full_dataset.to_csv("rolling_supervised_dataset.csv", index=False)