In [26]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd
from pathlib import Path

target: str 
df: pd.DataFrame
clean_dir: Path 

clean_dir = Path("../data/clean")
target = 'Win'

In [None]:
def log_reg(df: pd.DataFrame, C=0.01):

    X = df.drop(columns=target)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.10,
        random_state=42,
        stratify=y  
    )

    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    X_train_inter = poly.fit_transform(X_train)
    X_test_inter = poly.transform(X_test)

    feature_names = poly.get_feature_names_out(X_train.columns)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_inter)
    X_test_scaled = scaler.transform(X_test_inter)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []

    for train_idx, val_idx in cv.split(X_train_scaled, y_train):

        X_tr, X_val = X_train_scaled[train_idx], X_train_scaled[val_idx]
        y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

        # ---- Stage 1: L1 for discovery ----
        l1 = LogisticRegression(
            penalty="l1",
            solver="liblinear",
            C=0.5,
            max_iter=1000
        )
        l1.fit(X_tr, y_tr)

        nonzero_idx = np.where(l1.coef_[0] != 0)[0]
        selected = set(feature_names[nonzero_idx])

        # ---- Enforce hierarchy ----
        hierarchical_features = set(selected)

        for f in selected:
            if " " in f:  # interaction term
                a, b = f.split(" ")
                hierarchical_features.add(a)
                hierarchical_features.add(b)

        hierarchical_features = sorted(hierarchical_features)

        # ---- Reduce feature matrices ----
        keep_idx = [
            i for i, f in enumerate(feature_names)
            if f in hierarchical_features
        ]

        X_tr_final = X_tr[:, keep_idx]
        X_val_final = X_val[:, keep_idx]

        # ---- Stage 2: Refit with L2 for stability ----
        final_model = LogisticRegression(
            penalty="l2",
            max_iter=1000
        )
        final_model.fit(X_tr_final, y_tr)

        # ---- Evaluate on validation fold ----
        val_pred = final_model.predict_proba(X_val_final)[:, 1]
        score = roc_auc_score(y_val, val_pred)
        cv_scores.append(score)

    # 4. CV results
    print("CV ROC-AUC scores:", cv_scores)
    print("Mean CV ROC-AUC:", np.mean(cv_scores))
    print("Std CV ROC-AUC:", np.std(cv_scores))

    # ----------------------------
    # Final model on full training set
    # ----------------------------

    # Repeat selection on ALL training data
    l1.fit(X_train_scaled, y_train)
    nonzero_idx = np.where(l1.coef_[0] != 0)[0]
    selected = set(feature_names[nonzero_idx])

    hierarchical_features = set(selected)
    for f in selected:
        if " " in f:
            a, b = f.split(" ")
            hierarchical_features.add(a)
            hierarchical_features.add(b)

    hierarchical_features = sorted(hierarchical_features)

    keep_idx = [
        i for i, f in enumerate(feature_names)
        if f in hierarchical_features
    ]

    X_train_final = X_train_scaled[:, keep_idx]
    X_test_final = X_test_scaled[:, keep_idx]

    final_model.fit(X_train_final, y_train)

    # Test evaluation
    test_pred = final_model.predict_proba(X_test_final)[:, 1]
    test_auc = roc_auc_score(y_test, test_pred)
    print("Test ROC-AUC:", test_auc)

    # Coefficients for interpretation
    coef_df = pd.DataFrame({
        "feature": hierarchical_features,
        "coefficient": final_model.coef_[0]
    }).sort_values(by="coefficient", key=abs, ascending=False)

    print(coef_df)


Solely using the L1 normalization leads to breaking of the hierarchy, thus it has to be forced back in. After this I need to refit the model with L2.

In [21]:
file_name = 'num_ARAM.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

log_reg(df)



CV ROC-AUC scores: [0.8678853351888286, 0.8615829013056834, 0.8610539074500767, 0.8688393798687399, 0.8677886903079618]
Mean CV ROC-AUC: 0.8654300428242581
Std CV ROC-AUC: 0.003381310879167405




Test ROC-AUC: 0.848493408516551
                           feature  coefficient
24                         assists     4.952500
7        DmgTaken TurretDmgDealt>1     3.093766
5                         DmgTaken    -2.779520
4                   DmgDealt kills     2.449119
25        assists TurretDmgDealt>1    -1.976454
6               DmgTaken TotalGold     1.938718
26                          deaths    -1.446302
3                 DmgDealt assists    -1.236210
23                TurretDmgDealt>1    -1.218879
27         deaths TurretDmgDealt>1    -1.148290
10                  DmgTaken kills     1.133511
11                   MinionsKilled    -1.026434
0                         DmgDealt    -0.633472
32                    kills deaths    -0.458129
13          MinionsKilled DmgTaken     0.405870
20               TotalGold assists    -0.402444
12          MinionsKilled DmgDealt     0.401405
1                DmgDealt DmgTaken    -0.384596
31                   kills assists    -0.341378
16      

In [22]:
file_name = 'num_CHERRY.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

log_reg(df)



CV ROC-AUC scores: [0.9618195284813599, 0.9613795929041348, 0.9647703070007281, 0.9626676676861086, 0.9627126398716245]
Mean CV ROC-AUC: 0.9626699471887912
Std CV ROC-AUC: 0.001166324599653104
Test ROC-AUC: 0.9613844062343619
                    feature  coefficient
6          DmgTaken assists     2.232543
3           DmgDealt deaths     1.731501
4            DmgDealt kills     1.727140
1         DmgDealt DmgTaken     1.381732
15                TotalGold    -0.790467
21           deaths assists     0.781590
16        TotalGold assists    -0.706399
24             kills deaths    -0.661096
19                  assists     0.577684
8            DmgTaken kills    -0.558101
18          TotalGold kills    -0.524743
2          DmgDealt assists     0.501600
22                    kills    -0.417098
14      MinionsKilled kills    -0.391262
13     MinionsKilled deaths    -0.350428
20                   deaths     0.292493
0                  DmgDealt     0.236364
17         TotalGold deaths     0.22



In [23]:
file_name = 'num_CLASSIC.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

log_reg(df)



CV ROC-AUC scores: [0.9165310098048753, 0.9182198668455044, 0.9169037647142488, 0.917656968717347, 0.9166212326755289]
Mean CV ROC-AUC: 0.9171865685515008
Std CV ROC-AUC: 0.0006508950166841679




Test ROC-AUC: 0.9147892322944919
                           feature  coefficient
35      TotalGold TurretDmgDealt>3     3.134856
5             DmgDealt DragonKills    -2.836561
4                DmgDealt DmgTaken     2.808456
6               DmgDealt TotalGold     2.578118
9                  DmgDealt deaths     2.463178
36               TotalGold assists    -1.754702
34           TotalGold DragonKills    -1.496961
39                TurretDmgDealt>3    -1.419473
2                         DmgDealt    -1.344507
40                         assists    -1.277658
12             DmgTaken BaronKills    -1.258412
0                       BaronKills     1.140900
8                 DmgDealt assists     0.980723
7        DmgDealt TurretDmgDealt>3     0.976006
20          DragonKills BaronKills     0.766834
14              DmgTaken TotalGold    -0.732326
1      BaronKills TurretDmgDealt>3    -0.692160
11                        DmgTaken     0.624691
29           MinionsKilled assists     0.574535
3      

In [24]:
file_name = 'num_SWIFTPLAY.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

log_reg(df)



CV ROC-AUC scores: [0.8854748746815679, 0.8791755690689457, 0.874982640764115, 0.871431878987446, 0.8720724428894833]
Mean CV ROC-AUC: 0.8766274812783117
Std CV ROC-AUC: 0.0052003322898917185
Test ROC-AUC: 0.8423003407296601
                           feature  coefficient
4                   DmgDealt kills     2.567539
5                         DmgTaken    -2.543905
22                         assists     2.326475
7                  DmgTaken deaths     1.896406
6        DmgTaken TurretDmgDealt>2     1.867557
23        assists TurretDmgDealt>2    -1.646982
1                DmgDealt DmgTaken    -1.292330
25         deaths TurretDmgDealt>2    -0.794572
14            MinionsKilled deaths     0.770099
21                TurretDmgDealt>2    -0.729924
2        DmgDealt TurretDmgDealt>2    -0.699203
18               TotalGold assists    -0.644604
16                       TotalGold     0.597867
10          MinionsKilled DmgTaken    -0.591309
3                  DmgDealt deaths     0.573019
24     



In [25]:
file_name = 'num_ULTBOOK.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

log_reg(df, 1)



CV ROC-AUC scores: [1.0, 1.0, 1.0, 1.0, 1.0]
Mean CV ROC-AUC: 1.0
Std CV ROC-AUC: 0.0
Test ROC-AUC: 1.0
                          feature  coefficient
10                        assists     0.754109
3                        DmgTaken    -0.672395
4                DmgTaken assists     0.660374
12                         deaths     0.614429
9                TurretDmgDealt>2    -0.554916
8            MinionsKilled deaths    -0.516273
11           assists GameDuration    -0.444043
13    deaths CurrentMasteryPoints     0.384159
7                   MinionsKilled     0.372384
1                        DmgDealt    -0.312692
6   GameDuration TurretDmgDealt>2     0.243774
0            CurrentMasteryPoints    -0.117538
5                    GameDuration    -0.062192
2                 DmgDealt deaths    -0.031525


