In [1]:
from pathlib import Path
import sys

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import average_precision_score, roc_auc_score, precision_recall_curve
from sklearn.impute import SimpleImputer


import xgboost as xgb


# Make sure we're running from the repo root, not notebooks/
repo_root = Path.cwd().parent.parent
# os.chdir(repo_root)
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

from common.config_manager import ConfigManager
from common.io import read_csv


In [2]:
cfg = ConfigManager(repo_root)
project_cfg = cfg.project()
breakout_cfg = cfg.breakout()

paths = project_cfg["paths"]
features_dir = repo_root / paths["features"]
modeling_path = features_dir / project_cfg["breakout"]["modeling_filename"]

df = pd.read_csv(modeling_path)
df["month"] = pd.to_datetime(df["month"], errors="raise")

ID_COLS = ["artist_name", "month"]
DATE_COL = "month"
TARGET_COL = "y"
SPLIT_COL = "split"
CAT_COLS = ["genre_bucket"] if "genre_bucket" in df.columns else []

months = np.array(sorted(df[DATE_COL].unique()))
n_months = len(months)

n_train = int(n_months * 0.60)
n_val = int(n_months * 0.20)
n_test = n_months - n_train - n_val

train_months = set(months[:n_train])
val_months = set(months[n_train:n_train + n_val])
test_months = set(months[n_train + n_val:])

df[SPLIT_COL] = np.where(
    df[DATE_COL].isin(train_months),
    "train",
    np.where(df[DATE_COL].isin(val_months), "val", "test"),
)

train_df = df.loc[df[SPLIT_COL] == "train"].copy()
val_df = df.loc[df[SPLIT_COL] == "val"].copy()
test_df = df.loc[df[SPLIT_COL] == "test"].copy()

In [5]:
def evaluate_probs(name: str, y_true: np.ndarray, y_proba: np.ndarray) -> dict:
    out = {
        "pr_auc": float(average_precision_score(y_true, y_proba)),
        "roc_auc": float(roc_auc_score(y_true, y_proba)),
        "pos_rate": float(np.mean(y_true)) if len(y_true) else 0.0,
        "n": int(len(y_true)),
        "n_pos": int(np.sum(y_true)),
    }
    print(
        f"{name:<5} | n={out['n']:,} | pos={out['n_pos']:,} ({out['pos_rate']:.3%}) "
        f"| PR-AUC={out['pr_auc']:.4f} | ROC-AUC={out['roc_auc']:.4f}"
    )
    return out


excluded = set(ID_COLS + [TARGET_COL, SPLIT_COL] + CAT_COLS)
NUM_COLS = [c for c in df.columns if c not in excluded]

num_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

cat_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocessor_lr = ColumnTransformer(
    transformers=[
        ("num", num_transformer, NUM_COLS),
        ("cat", cat_transformer, CAT_COLS),
    ],
    remainder="drop",
)

lr_pipe = Pipeline(
    steps=[
        ("prep", preprocessor_lr),
        ("model", LogisticRegression(class_weight="balanced", max_iter=2000)),
    ]
)

In [6]:
X_train = train_df[NUM_COLS + CAT_COLS]
y_train = train_df[TARGET_COL].astype(int).to_numpy()

X_val = val_df[NUM_COLS + CAT_COLS]
y_val = val_df[TARGET_COL].astype(int).to_numpy()

lr_pipe.fit(X_train, y_train)

p_train = lr_pipe.predict_proba(X_train)[:, 1]
p_val = lr_pipe.predict_proba(X_val)[:, 1]

lr_train_metrics = evaluate_probs("train", y_train, p_train)
lr_val_metrics = evaluate_probs("val", y_val, p_val)


train | n=1,413 | pos=16 (1.132%) | PR-AUC=0.1544 | ROC-AUC=0.8849
val   | n=544 | pos=4 (0.735%) | PR-AUC=0.0313 | ROC-AUC=0.8222


In [7]:
C_GRID = [0.01, 0.1, 1.0, 10.0]

lr_tuning_rows = []
for C in C_GRID:
    lr_pipe.set_params(model__C=C)
    lr_pipe.fit(X_train, y_train)

    p_val = lr_pipe.predict_proba(X_val)[:, 1]
    row = {
        "C": C,
        "pr_auc_val": float(average_precision_score(y_val, p_val)),
        "roc_auc_val": float(roc_auc_score(y_val, p_val)),
    }
    lr_tuning_rows.append(row)

lr_tuning = pd.DataFrame(lr_tuning_rows).sort_values("pr_auc_val", ascending=False).reset_index(drop=True)
lr_tuning


Unnamed: 0,C,pr_auc_val,roc_auc_val
0,10.0,0.032528,0.830093
1,1.0,0.031326,0.822222
2,0.1,0.02861,0.806019
3,0.01,0.027006,0.773148


In [8]:
best_C = float(lr_tuning.loc[0, "C"])
print(f"best_C (by val PR-AUC): {best_C}")

lr_pipe.set_params(model__C=best_C)
lr_pipe.fit(X_train, y_train)

p_val = lr_pipe.predict_proba(X_val)[:, 1]
evaluate_probs("val", y_val, p_val)


best_C (by val PR-AUC): 10.0
val   | n=544 | pos=4 (0.735%) | PR-AUC=0.0325 | ROC-AUC=0.8301


{'pr_auc': 0.03252830517481745,
 'roc_auc': 0.8300925925925925,
 'pos_rate': 0.007352941176470588,
 'n': 544,
 'n_pos': 4}

In [9]:
beta = 0.5

precision, recall, thresholds = precision_recall_curve(y_val, p_val)
thresholds = np.concatenate([thresholds, [1.0]])  # align lengths

f_beta = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall + 1e-12)
best_idx = int(np.argmax(f_beta))
T = float(thresholds[best_idx])

print(f"Chosen threshold T (max F{beta} on val): {T:.6f}")
print(f"Val precision={precision[best_idx]:.3f} | recall={recall[best_idx]:.3f} | F{beta}={f_beta[best_idx]:.3f}")


Chosen threshold T (max F0.5 on val): 0.455251
Val precision=0.053 | recall=0.500 | F0.5=0.064


In [13]:
lr_best_C = best_C
lr_T = T

trainval_df = df.loc[df[SPLIT_COL].isin(["train", "val"])].copy()

X_trainval = trainval_df[NUM_COLS + CAT_COLS]
y_trainval = trainval_df[TARGET_COL].astype(int).to_numpy()

lr_pipe.set_params(model__C=lr_best_C)
lr_pipe.fit(X_trainval, y_trainval)

p_val = lr_pipe.predict_proba(X_val)[:, 1]
lr_val_final_metrics = evaluate_probs("val", y_val, p_val)


val   | n=544 | pos=4 (0.735%) | PR-AUC=0.0610 | ROC-AUC=0.9310


In [14]:
lr_val_at_T = metrics_at_threshold(y_val, p_val, lr_T, beta=0.5)
print(lr_val_at_T)

val_alerts = (
    val_df.assign(p=p_val, pred=(p_val >= lr_T).astype(int))
          .groupby(DATE_COL)["pred"]
          .sum()
)

print(
    "Val alerts/month (min/median/max): "
    f"{int(val_alerts.min()):,} / {int(val_alerts.median()):,} / {int(val_alerts.max()):,}"
)


{'t': 0.45525077390754887, 'tp': 4, 'fp': 65, 'fn': 0, 'precision': 0.057971014492753624, 'recall': 1.0, 'f1': 0.1095890410958904, 'f0.5': 0.07142857142850101}
Val alerts/month (min/median/max): 3 / 5 / 12


In [15]:
rank_val = precision_at_top_pct_by_month(val_df, p_val, pct=0.05)
print(f"Val months included (>=1 positive): {len(rank_val):,} / {val_df[DATE_COL].nunique():,}")
rank_val.head(12)


Val months included (>=1 positive): 4 / 12


Unnamed: 0,month,n,k,pos_in_month,precision_top_pct
0,2023-10-01 00:00:00+00:00,45,3,1,0.333333
1,2023-11-01 00:00:00+00:00,46,3,1,0.0
2,2024-05-01 00:00:00+00:00,44,3,1,0.333333
3,2024-06-01 00:00:00+00:00,46,3,1,0.333333
