In [8]:
# Disabled debugging snippet: wrapping it in a triple-quoted string turns the
# whole cell into a bare string expression, so none of the statements inside
# (the import and the two prints) actually execute.
# NOTE(review): the saved output for this cell still shows the printed working
# directory, so it presumably predates the quoting -- consider deleting the
# cell and clearing its output before sharing (it exposes a local path).
'''import os
print("Current working directory:")
print(os.getcwd())
'''

Current working directory:
C:\Users\ryan\Documents\GitHub\ML-Project\Models


In [9]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
)

# ---------- PATHS ----------
# All three CSVs live in the same folder, so the directory is spelled once.
# The paths are relative and therefore resolve against the kernel's current
# working directory.
DATA_DIR = '../Retrieval/Data_Combined'

X_noleak_path        = f'{DATA_DIR}/dota_X_final_noleak.csv'         # unscaled features
X_noleak_scaled_path = f'{DATA_DIR}/dota_X_final_noleak_scaled.csv'  # scaled features
y_path               = f'{DATA_DIR}/dota_y_final.csv'                # labels


In [10]:
# Load features and labels
X_noleak = pd.read_csv(X_noleak_path)
X_noleak_scaled = pd.read_csv(X_noleak_scaled_path)

# squeeze("columns") collapses only the column axis: a one-column frame becomes
# a Series, while a one-row file can never collapse into a bare scalar (which a
# plain .squeeze() would do).
y = pd.read_csv(y_path).squeeze("columns")

# Fail fast if the files are out of sync: features and labels are paired purely
# by row position when they are split and fitted later on.
assert isinstance(y, pd.Series), "label file must contain exactly one column"
assert len(X_noleak) == len(X_noleak_scaled) == len(y), (
    "feature and label files have different row counts"
)

print("X_noleak shape:       ", X_noleak.shape)
print("X_noleak_scaled shape:", X_noleak_scaled.shape)
print("y shape:              ", y.shape)

# Relative class frequencies -- shows how balanced the two outcomes are.
print("\nClass balance (radiant_win):")
print(y.value_counts(normalize=True))


X_noleak shape:        (15000, 94)
X_noleak_scaled shape: (15000, 94)
y shape:               (15000,)

Class balance (radiant_win):
radiant_win
1    0.5178
0    0.4822
Name: proportion, dtype: float64


In [11]:
RANDOM_STATE = 42
TEST_SIZE = 0.2  # 80% train, 20% test

# Split both feature variants and the labels in ONE call. train_test_split
# applies the same row indices to every array it is given, so the unscaled
# frame, the scaled frame and the labels are aligned by construction instead of
# relying on two independent calls happening to draw the same shuffle.
# NOTE(review): X_noleak_scaled arrives already scaled. If that scaler was fit
# on all rows before this split, test-set statistics leaked into training --
# confirm how the file was produced.
(
    X_train_rf, X_test_rf,   # unscaled -> Random Forest
    X_train_lr, X_test_lr,   # scaled   -> Logistic Regression
    y_train, y_test,
) = train_test_split(
    X_noleak,
    X_noleak_scaled,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,  # keep the class proportions equal in train and test
)

print("Train size:", X_train_rf.shape[0])
print("Test size: ", X_test_rf.shape[0])


Train size: 12000
Test size:  3000


In [12]:
def evaluate_model(name, y_true, y_pred):
    """Print an evaluation summary for one set of predictions.

    Writes, in order: a banner containing ``name``; accuracy, precision,
    recall and F1 formatted to four decimals; the scikit-learn classification
    report; and the confusion matrix.

    Parameters
    ----------
    name : str
        Label shown in the banner line.
    y_true
        Ground-truth labels.
    y_pred
        Predicted labels, in the same order as ``y_true``.

    Returns
    -------
    None
        Everything is printed to stdout; nothing is returned.
    """
    # zero_division=0 makes an undefined ratio score 0 rather than emit a warning.
    headline_scores = [
        ("Accuracy", accuracy_score(y_true, y_pred)),
        ("Precision", precision_score(y_true, y_pred, zero_division=0)),
        ("Recall", recall_score(y_true, y_pred, zero_division=0)),
        ("F1-score", f1_score(y_true, y_pred, zero_division=0)),
    ]

    print(f"\n===== {name} =====")
    for label, value in headline_scores:
        # Labels are left-padded to 9 characters so the colons line up.
        print(f"{label:<9}: {value:.4f}")

    print("\nClassification report:")
    report_text = classification_report(y_true, y_pred, zero_division=0)
    print(report_text)

    print("Confusion matrix:")
    matrix = confusion_matrix(y_true, y_pred)
    print(matrix)


In [13]:
# Logistic Regression baseline, trained on the pre-scaled feature variant.
# n_jobs is deliberately not passed: for LogisticRegression it only
# parallelises across per-class fits, and a two-class target involves a single
# fit, so the former n_jobs=-1 ("use all cores") had no effect here.
log_reg = LogisticRegression(
    max_iter=1000,         # generous iteration cap for the solver
    solver='lbfgs',        # good default
)

log_reg.fit(X_train_lr, y_train)
y_pred_lr = log_reg.predict(X_test_lr)

# NOTE(review): the saved run reports ~99.6% test accuracy, which is unusually
# high for predicting a match outcome -- verify that the "no-leak" feature set
# really excludes information only known once the match has ended.
evaluate_model("Logistic Regression (no-leak, scaled)", y_test, y_pred_lr)



===== Logistic Regression (no-leak, scaled) =====
Accuracy : 0.9960
Precision: 0.9968
Recall   : 0.9955
F1-score : 0.9961

Classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1447
           1       1.00      1.00      1.00      1553

    accuracy                           1.00      3000
   macro avg       1.00      1.00      1.00      3000
weighted avg       1.00      1.00      1.00      3000

Confusion matrix:
[[1442    5]
 [   7 1546]]


In [14]:
# Random Forest on the unscaled feature variant.
# Hyper-parameters are collected in one mapping so they can be read (or
# tweaked) at a glance.
rf_params = {
    "n_estimators": 200,
    "max_depth": None,          # no cap on tree depth
    "min_samples_split": 2,
    "min_samples_leaf": 1,
    "random_state": RANDOM_STATE,
    "n_jobs": -1,
}

# fit() returns the estimator itself, so construction and training chain.
rf = RandomForestClassifier(**rf_params).fit(X_train_rf, y_train)
y_pred_rf = rf.predict(X_test_rf)

evaluate_model("Random Forest (no-leak)", y_test, y_pred_rf)



===== Random Forest (no-leak) =====
Accuracy : 0.9933
Precision: 0.9955
Recall   : 0.9916
F1-score : 0.9935

Classification report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1447
           1       1.00      0.99      0.99      1553

    accuracy                           0.99      3000
   macro avg       0.99      0.99      0.99      3000
weighted avg       0.99      0.99      0.99      3000

Confusion matrix:
[[1440    7]
 [  13 1540]]


In [15]:
# Pair each column of the unscaled feature frame with the importance the
# fitted forest assigned to it, then rank from most to least important.
feat_names = X_noleak.columns
importances = rf.feature_importances_

feat_imp = pd.DataFrame(
    {"feature": feat_names, "importance": importances}
).sort_values(by="importance", ascending=False)

# NOTE(review): in the saved output the top-ranked features (xpm_ratio,
# tower_damage_diff, gpm_ratio, ...) look like end-of-match statistics. If so,
# they leak the result and would explain the ~99% accuracy -- confirm when in
# the match these values are measured.
print("Top 20 most important features (Random Forest):")
feat_imp.head(20)


Top 20 most important features (Random Forest):


Unnamed: 0,feature,importance
59,xpm_ratio,0.12491
93,tower_damage_diff,0.108203
58,gpm_ratio,0.103635
68,gold_advantage_per_min,0.080319
55,kd_ratio_radiant,0.05743
56,kd_ratio_dire,0.057345
57,kd_advantage,0.052232
76,level_advantage,0.052048
88,assist_diff,0.03625
65,combat_effectiveness_radiant,0.035908
