In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

target: str 
df: pd.DataFrame
clean_dir: Path 

clean_dir = Path("../data/clean")
target = 'Win'

In [None]:
def knn(df: pd.DataFrame):
    X = df.drop(columns=target)
    y = df[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.10,
        random_state=42,
        stratify=y  
    )

    poly = PolynomialFeatures(interaction_only=True, include_bias=False)
    X_train_inter = poly.fit_transform(X_train)
    X_test_inter = poly.transform(X_test)

    feature_names = poly.get_feature_names_out(X_train.columns)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_inter)
    X_test_scaled = scaler.transform(X_test_inter)


    knn = KNeighborsClassifier()

    param_grid = {
        "n_neighbors": [5, 10, 20, 50],
        "weights": ["uniform", "distance"],
        "metric": ["euclidean", "manhattan"]
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    grid = GridSearchCV(
        knn,
        param_grid,
        scoring="roc_auc",
        cv=cv,
        n_jobs=-1
    )

    grid.fit(X_train_scaled, y_train)

    print("Best CV ROC-AUC:", grid.best_score_)
    print("Best parameters:", grid.best_params_)

    best_knn = grid.best_estimator_
    test_probs = best_knn.predict_proba(X_test_scaled)[:, 1]
    test_auc = roc_auc_score(y_test, test_probs)

    print("Test ROC-AUC:", test_auc)

In [4]:
file_name = 'num_ARAM.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

knn(df)

Best CV ROC-AUC: 0.9829588395729223
Best parameters: {'metric': 'euclidean', 'n_neighbors': 50, 'weights': 'distance'}
Test ROC-AUC: 0.9945594228216709


In [5]:
file_name = 'num_CHERRY.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

knn(df)

Best CV ROC-AUC: 0.9942547033898487
Best parameters: {'metric': 'euclidean', 'n_neighbors': 50, 'weights': 'distance'}
Test ROC-AUC: 0.9978502442074854


In [6]:
file_name = 'num_CLASSIC.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

knn(df)

Best CV ROC-AUC: 0.9744383252475426
Best parameters: {'metric': 'manhattan', 'n_neighbors': 20, 'weights': 'distance'}
Test ROC-AUC: 0.9848304144736671


In [7]:
file_name = 'num_SWIFTPLAY.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

knn(df)

Best CV ROC-AUC: 0.9645138207354771
Best parameters: {'metric': 'manhattan', 'n_neighbors': 50, 'weights': 'distance'}
Test ROC-AUC: 0.9734563284301505


In [8]:
file_name = 'num_ULTBOOK.csv'

df = pd.read_csv(clean_dir / file_name).set_index('#')

knn(df)

Best CV ROC-AUC: 1.0
Best parameters: {'metric': 'euclidean', 'n_neighbors': 5, 'weights': 'distance'}
Test ROC-AUC: 1.0


Traceback (most recent call last):
  File "/Users/bardak/Documents/Python/LoL match outcome predictions/env/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 916, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bardak/Documents/Python/LoL match outcome predictions/env/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 317, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/bardak/Documents/Python/LoL match outcome predictions/env/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 409, in _score
    y_pred = method_caller(
             ^^^^^^^^^^^^^^
  File "/Users/bardak/Documents/Python/LoL match outcome predictions/env/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 96, in _ca