In [17]:
import os
print(os.listdir('../Retrieval/Data_Combined'))


['feature_correlations_all.csv', '.DS_Store', 'dota_X_final 2.csv', 'combined_matches_players_all_1_enhanced.csv', 'dota_X_final_full.csv', 'dota_X_final_noleak.csv', 'dota_y_final.csv', 'dota_pro_combined.csv', 'dota_X_midgame.csv', 'dota_pro_combined_enhanced 2.csv', 'dota_X_final.csv', 'dota_X_final_noleak_scaled.csv', 'dota_X_midgame_scaled.csv', 'dota_pro_combined_enhanced.csv', 'combined_matches_players_all_1.csv']


In [18]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# ---------- PATHS ----------
X_noleak_path         = '../Retrieval/Data_Combined/dota_X_final_noleak.csv'
X_noleak_scaled_path  = '../Retrieval/Data_Combined/dota_X_final_noleak_scaled.csv'
X_mid_path            = '../Retrieval/Data_Combined/dota_X_midgame.csv'
X_mid_scaled_path     = '../Retrieval/Data_Combined/dota_X_midgame_scaled.csv'
y_path                = '../Retrieval/Data_Combined/dota_y_final.csv'


In [19]:
# ---------- LOAD FEATURES AND LABELS ----------
X_noleak = pd.read_csv(X_noleak_path)
X_noleak_scaled = pd.read_csv(X_noleak_scaled_path)

X_mid = pd.read_csv(X_mid_path)
X_mid_scaled = pd.read_csv(X_mid_scaled_path)

y = pd.read_csv(y_path).squeeze()  # convert 1-col DataFrame to Series

print("X_noleak shape:       ", X_noleak.shape)
print("X_noleak_scaled shape:", X_noleak_scaled.shape)
print("X_mid shape:          ", X_mid.shape)
print("X_mid_scaled shape:   ", X_mid_scaled.shape)
print("y shape:              ", y.shape)

print("\nClass balance (radiant_win):")
print(y.value_counts(normalize=True))


X_noleak shape:        (1042190, 49)
X_noleak_scaled shape: (1042190, 49)
X_mid shape:           (1042190, 45)
X_mid_scaled shape:    (1042190, 45)
y shape:               (1042190,)

Class balance (radiant_win):
radiant_win
1    0.508161
0    0.491839
Name: proportion, dtype: float64


In [20]:
def evaluate_model(name, y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    print(f"\n--------- {name} ---------")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1-score : {f1:.4f}")

    print("\nClassification report:")
    print(classification_report(y_true, y_pred, zero_division=0))

    print("Confusion matrix:")
    print(confusion_matrix(y_true, y_pred))


In [21]:
RANDOM_STATE = 42
TEST_SIZE = 0.2

print("\n-------------- NO-LEAK FEATURE SET EXPERIMENTS --------------------\n")

# For Random Forest: unscaled
X_train_rf, X_test_rf, y_train, y_test = train_test_split(
    X_noleak,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

# For Logistic Regression: scaled
X_train_lr, X_test_lr, _, _ = train_test_split(
    X_noleak_scaled,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

print("No-leak train size:", X_train_rf.shape[0])
print("No-leak test size: ", X_test_rf.shape[0])



-------------- NO-LEAK FEATURE SET EXPERIMENTS --------------------

No-leak train size: 833752
No-leak test size:  208438


In [22]:
log_reg = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    n_jobs=-1,
)

log_reg.fit(X_train_lr, y_train)
y_pred_lr = log_reg.predict(X_test_lr)

evaluate_model("Logistic Regression (NO-LEAK, scaled)", y_test, y_pred_lr)




--------- Logistic Regression (NO-LEAK, scaled) ---------
Accuracy : 0.9826
Precision: 0.9825
Recall   : 0.9833
F1-score : 0.9829

Classification report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    102518
           1       0.98      0.98      0.98    105920

    accuracy                           0.98    208438
   macro avg       0.98      0.98      0.98    208438
weighted avg       0.98      0.98      0.98    208438

Confusion matrix:
[[100664   1854]
 [  1771 104149]]


In [23]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

rf.fit(X_train_rf, y_train)
y_pred_rf = rf.predict(X_test_rf)

evaluate_model("Random Forest (NO-LEAK)", y_test, y_pred_rf)

# Feature importances for NO-LEAK set
importances = rf.feature_importances_
feat_names = X_noleak.columns

feat_imp = pd.DataFrame({
    "feature": feat_names,
    "importance": importances,
}).sort_values("importance", ascending=False)

print("\nTop 20 most important features (Random Forest, NO-LEAK):")
feat_imp.head(20)



--------- Random Forest (NO-LEAK) ---------
Accuracy : 0.9989
Precision: 0.9990
Recall   : 0.9989
F1-score : 0.9989

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    102518
           1       1.00      1.00      1.00    105920

    accuracy                           1.00    208438
   macro avg       1.00      1.00      1.00    208438
weighted avg       1.00      1.00      1.00    208438

Confusion matrix:
[[102407    111]
 [   117 105803]]

Top 20 most important features (Random Forest, NO-LEAK):


Unnamed: 0,feature,importance
41,barracks_status_radiant,0.207527
40,barracks_status_dire,0.190108
43,tower_status_radiant,0.183734
42,tower_status_dire,0.156075
45,radiant_score,0.089558
44,dire_score,0.086411
31,win,0.016526
36,duration,0.01297
1,player_slot,0.010328
15,level,0.004517


In [24]:
print("\n\n------------- MID-GAME FEATURE SET EXPERIMENTS -------------\n")

X_train_mid_rf, X_test_mid_rf, y_train_mid, y_test_mid = train_test_split(
    X_mid,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

X_train_mid_lr, X_test_mid_lr, _, _ = train_test_split(
    X_mid_scaled,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)

print("Mid-game train size:", X_train_mid_rf.shape[0])
print("Mid-game test size: ", X_test_mid_rf.shape[0])




------------- MID-GAME FEATURE SET EXPERIMENTS -------------

Mid-game train size: 833752
Mid-game test size:  208438


In [25]:
log_reg_mid = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    n_jobs=-1,
)

log_reg_mid.fit(X_train_mid_lr, y_train_mid)
y_pred_mid_lr = log_reg_mid.predict(X_test_mid_lr)

evaluate_model("Logistic Regression (MID-GAME, scaled)", y_test_mid, y_pred_mid_lr)



--------- Logistic Regression (MID-GAME, scaled) ---------
Accuracy : 0.9826
Precision: 0.9825
Recall   : 0.9833
F1-score : 0.9829

Classification report:
              precision    recall  f1-score   support

           0       0.98      0.98      0.98    102518
           1       0.98      0.98      0.98    105920

    accuracy                           0.98    208438
   macro avg       0.98      0.98      0.98    208438
weighted avg       0.98      0.98      0.98    208438

Confusion matrix:
[[100668   1850]
 [  1774 104146]]


In [26]:
rf_mid = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)

rf_mid.fit(X_train_mid_rf, y_train_mid)
y_pred_mid_rf = rf_mid.predict(X_test_mid_rf)

evaluate_model("Random Forest (MID-GAME)", y_test_mid, y_pred_mid_rf)



--------- Random Forest (MID-GAME) ---------
Accuracy : 0.9990
Precision: 0.9990
Recall   : 0.9990
F1-score : 0.9990

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    102518
           1       1.00      1.00      1.00    105920

    accuracy                           1.00    208438
   macro avg       1.00      1.00      1.00    208438
weighted avg       1.00      1.00      1.00    208438

Confusion matrix:
[[102410    108]
 [   106 105814]]
