# csv format:
(.venv) (base) ➜  machine-learning git:(main) ✗ head data/comp-games.csv 
id,patch_version_substring,region_name,blue_team_name,red_team_name,blue_team_won,schedule_start_time,riot_match_id,blue_top_name,blue_jungle_name,blue_mid_name,blue_bot_name,blue_support_name,red_top_name,red_jungle_name,red_mid_name,red_bot_name,red_support_name
5589,15.1,LPL,Weibo Gaming,OMG,1,2025-01-12T08:00:00.000Z,1.13663E+17,KSante,Viego,Aurora,Ashe,Braum,Gnar,Nocturne,Orianna,Varus,Neeko
5590,15.1,LPL,OMG,Weibo Gaming,0,2025-01-12T09:00:00.000Z,1.13663E+17,Rumble,Maokai,Ambessa,Jhin,Rell,Jayce,MonkeyKing,Viktor,MissFortune,Rakan
5591,15.1,LPL,OMG,Weibo Gaming,0,2025-01-12T10:00:00.000Z,1.13663E+17,Aatrox,Zyra,Yone,Ziggs,Leona,Jax,Sejuani,Sylas,Jinx,Poppy
5592,15.1,LPL,LNG Esports,TT Gaming,1,2025-01-13T08:00:00.000Z,113662725452970050,KSante,Pantheon,Vladimir,Kalista,Neeko,Rumble,Vi,Aurora,Ezreal,Leona
5593,15.1,LPL,TT Gaming,LNG Esports,1,2025-01-13T09:00:00.000Z,113662725452970050,Gnar,MonkeyKing,Akali,Kaisa,Rakan,Renekton,LeeSin,Taliyah,Corki,Rell
5594,15.1,LPL,LNG Esports,TT Gaming,0,2025-01-13T10:00:00.000Z,113662725452970050,Jax,Volibear,Yone,Ashe,Braum,Gragas,XinZhao,Viktor,Varus,Karma
5595,15.1,LPL,LNG Esports,TT Gaming,0,2025-01-13T11:00:00.000Z,113662725452970050,Ambessa,Sejuani,Sylas,Twitch,Lulu,Poppy,Viego,Azir,Jinx,Blitzcrank
5596,15.1,LPL,Royal Never Give Up,FunPlus Phoenix,0,2025-01-14T08:00:00.000Z,113662725453101140,Jax,Viego,Orianna,Ashe,Braum,KSante,Vi,Aurora,Kalista,Neeko
5597,15.1,LPL,Royal Never Give Up,FunPlus Phoenix,1,2025-01-14T09:00:00.000Z,113662725453101140,Ambessa,LeeSin,Akali,Varus,Nautilus,Jayce,MonkeyKing,Sylas,MissFortune,Rakan

In [8]:
import pandas as pd
import numpy as np
import requests
import json
from sklearn.metrics import log_loss, accuracy_score
from typing import List, Dict, Optional, Tuple
import re
from difflib import get_close_matches

from utils.match_prediction.champions import Champion

# Constants
MODEL_URL = "http://localhost:8000/predict-batch"
BATCH_SIZE = 2048  # Adjust based on your server's MAX_BATCH_SIZE


def format_patch_version(patch: str) -> str:
    """Convert patch version to have 2 significant digits after decimal."""
    # Handle cases where there's no decimal point
    if "." not in patch:
        print(f"WARNING: Invalid patch format found: {patch}")
        return patch  # Return original value if we can't parse it

    major, minor = patch.split(".")
    return f"{major}.{int(minor):02d}"


# Update the RELEVANT_PATCHES with formatted versions
# RELEVANT_PATCHES = ["15.03", "15.04", "15.05"]  # Changed from ["15.3", "15.4", "15.5"]
RELEVANT_PATCHES = [
    "15.01",
    "15.02",
    "15.03",
    "15.04",
    "15.05",  # Patches from 15.01 to 15.05
    "14.07",
    "14.08",
    "14.09",
    "14.10",
    "14.11",
    "14.12",
    "14.13",
    "14.14",
    "14.15",
    "14.16",
    "14.17",
    "14.18",
    "14.19",
    "14.20",
    "14.21",
    "14.22",
    "14.23",
    "14.24",  # Patches from 14.07 to 14.24
]
OUTPUT_FILE = "../data/comp-games-with-predictions.csv"

# Manual name mappings for special cases
NAME_OVERRIDES = {
    "MonkeyKing": "Wukong",
    "Nunu": "Nunu & Willump",
}


def load_champion_mapping() -> Dict[str, int]:
    """Create a mapping from champion name to ID with alternative names."""
    champion_mapping = {}

    # Create regular mapping
    for champion in Champion:
        # Lowercase for case-insensitive matching
        name = champion.display_name.lower()
        champion_mapping[name] = champion.id

        # Add name without punctuation and spaces
        clean_name = re.sub(r"[^\w]", "", name)
        champion_mapping[clean_name] = champion.id

    # Add special case mappings
    for alt_name, actual_name in NAME_OVERRIDES.items():
        for champion in Champion:
            if champion.display_name.lower() == actual_name.lower():
                champion_mapping[alt_name.lower()] = champion.id
                break

    return champion_mapping


def match_champion_name(name: str, mapping: Dict[str, int]) -> Optional[int]:
    """Match champion name to ID using fuzzy matching if necessary."""
    name_lower = name.lower()
    clean_name = re.sub(r"[^\w]", "", name_lower)

    # Direct match
    if name_lower in mapping:
        return mapping[name_lower]

    # Clean name match
    if clean_name in mapping:
        return mapping[clean_name]

    # Fuzzy matching
    possible_matches = get_close_matches(name_lower, mapping.keys(), n=1, cutoff=0.7)
    if possible_matches:
        return mapping[possible_matches[0]]

    print(f"WARNING: Could not match champion name: {name}")
    return None


def prepare_api_input(
    row: pd.Series, champion_mapping: Dict[str, int]
) -> Optional[Dict]:
    """Prepare the API input for a single game."""
    champion_names = [
        # Blue team
        row["blue_top_name"],
        row["blue_jungle_name"],
        row["blue_mid_name"],
        row["blue_bot_name"],
        row["blue_support_name"],
        # Red team
        row["red_top_name"],
        row["red_jungle_name"],
        row["red_mid_name"],
        row["red_bot_name"],
        row["red_support_name"],
    ]

    # Map champion names to IDs
    champion_ids = []
    for name in champion_names:
        champ_id = match_champion_name(name, champion_mapping)
        if champ_id is None:
            return None  # Skip if any champion can't be matched
        champion_ids.append(champ_id)

    # Return the API input
    return {
        "champion_ids": champion_ids,
        "numerical_elo": 0,  # Diamond 2 +
        "patch": row["patch_version_substring"],
        # TODO: it actually has better accuraccy on queue 420?
        "queueId": 420,
        # "queueId": 700,
    }


def send_batch_predictions(api_inputs: List[Dict]) -> List[float]:
    """Send a batch of predictions to the model server."""
    try:
        # Add API key in the header
        headers = {"X-API-Key": "example_token"}
        response = requests.post(MODEL_URL, json=api_inputs, headers=headers)
        response.raise_for_status()
        return [pred["win_probability"] for pred in response.json()]
    except Exception as e:
        print(f"Error sending batch prediction: {e}")
        print(f"Request payload: {json.dumps(api_inputs)}")
        # Return None for each input in case of error
        return [None] * len(api_inputs)


def evaluate_model(df: pd.DataFrame) -> Tuple[float, float]:
    """Calculate log loss and accuracy for model predictions."""
    # Filter out rows with missing predictions
    eval_df = df[df["model_prediction"].notna()].copy()

    # Calculate metrics
    y_true = eval_df["blue_team_won"].astype(int)
    y_pred = eval_df["model_prediction"]

    # Convert probabilities to binary predictions using 0.5 threshold
    y_pred_binary = (y_pred > 0.5).astype(int)

    # Calculate metrics
    loss = log_loss(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred_binary)

    return loss, acc


def main():
    # Load dataset
    print("Loading dataset...")
    df = pd.read_csv("../data/comp-games.csv")

    # Filter for relevant patches - fixed to handle numeric patch versions
    print(f"Filtering for patches: {RELEVANT_PATCHES}")

    # Format the patch versions in the dataframe
    df["patch_version_substring"] = (
        df["patch_version_substring"].astype(str).apply(format_patch_version)
    )
    patch_pattern = "|".join(RELEVANT_PATCHES)
    df = df[df["patch_version_substring"].str.match(patch_pattern)]

    if len(df) == 0:
        print(f"No games found for patches {RELEVANT_PATCHES}")
        return

    print(f"Found {len(df)} games for evaluation")

    # Load champion mapping
    champion_mapping = load_champion_mapping()

    # Prepare inputs for all games
    api_inputs = []
    valid_indices = []

    for idx, row in df.iterrows():
        api_input = prepare_api_input(row, champion_mapping)
        if api_input is not None:
            api_inputs.append(api_input)
            valid_indices.append(idx)

    print(f"Prepared {len(api_inputs)} valid inputs for prediction")

    # Initialize predictions array
    all_predictions = np.full(len(df), np.nan)

    # Process in batches
    for i in range(0, len(api_inputs), BATCH_SIZE):
        batch_inputs = api_inputs[i : i + BATCH_SIZE]
        batch_indices = valid_indices[i : i + BATCH_SIZE]

        print(
            f"Processing batch {i//BATCH_SIZE + 1}/{(len(api_inputs) + BATCH_SIZE - 1)//BATCH_SIZE}"
        )
        batch_predictions = send_batch_predictions(batch_inputs)

        # Store predictions
        for idx, pred in zip(batch_indices, batch_predictions):
            if pred is not None:
                all_predictions[df.index.get_indexer([idx])[0]] = pred

    # Add predictions to dataframe
    df["model_prediction"] = all_predictions

    # Evaluate model
    filtered_df = df[df["model_prediction"].notna()]
    if len(filtered_df) > 0:
        loss, acc = evaluate_model(filtered_df)
        print(f"\nModel evaluation on {len(filtered_df)} games:")
        print(f"Log Loss: {loss:.4f}")
        print(f"Accuracy: {acc:.4f}")

        # Add a column that indicates if model prediction is correct
        filtered_df["model_correct"] = (
            (filtered_df["model_prediction"] > 0.5)
            == (filtered_df["blue_team_won"] == 1)
        ).astype(int)
    else:
        print("No valid predictions to evaluate")

    # Save results to CSV
    filtered_df.to_csv(OUTPUT_FILE, index=False)
    print(f"Results saved to {OUTPUT_FILE}")

In [9]:
main()

Loading dataset...
Filtering for patches: ['15.01', '15.02', '15.03', '15.04', '15.05', '14.07', '14.08', '14.09', '14.10', '14.11', '14.12', '14.13', '14.14', '14.15', '14.16', '14.17', '14.18', '14.19', '14.20', '14.21', '14.22', '14.23', '14.24']
Found 3157 games for evaluation
Prepared 3157 valid inputs for prediction
Processing batch 1/2
Processing batch 2/2

Model evaluation on 3157 games:
Log Loss: 0.6964
Accuracy: 0.5322
Results saved to ../data/comp-games-with-predictions.csv
