In [None]:
import os
import glob
import pyarrow.parquet as pq
import numpy as np
import pickle
import json
from collections import defaultdict
from typing import Dict, List, Tuple
from utils.match_prediction import PREPARED_DATA_DIR, ENCODERS_PATH
from utils.rl import ROLE_CHAMPIONS_PATH


def generate_role_statistics(
    prepared_data_dir: str = PREPARED_DATA_DIR,
) -> Tuple[Dict[int, Dict[str, float]], Dict[int, Dict[str, float]]]:
    """
    Generate champion role statistics based on latest patch data.

    Scans every ``train/train*.parquet`` file under ``prepared_data_dir`` in a
    single pass. Only games whose ``numerical_patch`` equals the highest patch
    value seen are counted: whenever a higher patch shows up, the counters
    accumulated so far are discarded and counting restarts.

    Args:
        prepared_data_dir: Directory containing the prepared match data

    Returns:
        Tuple containing:
        - champion_role_rates: Dict mapping champion_id to role percentages (sums to 100% per champion)
        - champion_role_playrates: Dict mapping champion_id to global role playrates
          (percentage of all counted games in which the champion was played in that role)

        Champion ids are used as found in the ``champion_ids`` column. Both
        dicts are empty when no files or no rows were found.
    """
    roles = ["TOP", "JUNGLE", "MID", "BOT", "UTILITY"]
    # One label per entry of ``champion_ids``: the five roles repeated for both
    # teams. NOTE(review): assumes each row lists ten champions in this role
    # order, first team then second team -- confirm against the data
    # preparation step.
    slot_roles = roles * 2
    # Only these two columns are read, so the remaining columns are never
    # decoded from the parquet files.
    needed_columns = ["numerical_patch", "champion_ids"]

    # Get all parquet files
    data_files = sorted(
        glob.glob(os.path.join(prepared_data_dir, "train", "train*.parquet"))
    )

    # Initialize counters
    latest_patch = -1
    champion_role_games = defaultdict(
        lambda: defaultdict(int)
    )  # {champion_id: {role: game_count}}
    total_games = 0

    # Process each file
    for file_path in data_files:
        parquet_file = pq.ParquetFile(file_path)

        for batch in parquet_file.iter_batches(columns=needed_columns):
            df_chunk = batch.to_pandas()

            # Walk the two columns in lockstep instead of DataFrame.iterrows(),
            # which builds a Series object for every row.
            for patch, champion_ids in zip(
                df_chunk["numerical_patch"], df_chunk["champion_ids"]
            ):
                # If we find a newer patch, reset our statistics
                if patch > latest_patch:
                    print(f"Found newer patch {patch}, resetting statistics")
                    latest_patch = patch
                    champion_role_games.clear()
                    total_games = 0
                elif patch < latest_patch:
                    # Game from an older patch: ignore it
                    continue

                # Update statistics for this game
                for champ_id, role in zip(champion_ids, slot_roles):
                    champion_role_games[champ_id][role] += 1
                total_games += 1

    print(f"\nProcessed {total_games} games from patch {latest_patch}")

    # Calculate statistics
    champion_role_rates = {}  # Percentage per champion (sums to 100%)
    champion_role_playrates = {}  # Global play rates

    # For each champion, calculate both metrics
    for champ_id, role_games in champion_role_games.items():
        total_champ_games = sum(role_games.values())

        # .get() keeps roles the champion was never seen in at 0 without
        # inserting new keys into the counting dict.
        # Per-champion percentage (divided by champion's total games)
        champion_role_rates[champ_id] = {
            role: (
                (role_games.get(role, 0) / total_champ_games * 100)
                if total_champ_games > 0
                else 0
            )
            for role in roles
        }
        # Global play rate (divided by total games)
        champion_role_playrates[champ_id] = {
            role: (
                (role_games.get(role, 0) / total_games * 100)
                if total_games > 0
                else 0
            )
            for role in roles
        }

    # Print summary
    print("\nRole statistics summary:")
    print(f"Processed {len(champion_role_rates)} champions")

    # Without any data there is no "first champion" to show; skip the example
    # instead of failing with an IndexError.
    if not champion_role_rates:
        print("\nNo champion data found; returning empty statistics")
        return champion_role_rates, champion_role_playrates

    # Example statistics for verification
    print("\nExample statistics for first champion:")
    first_champ = next(iter(champion_role_rates))
    print(f"\nChampion {first_champ} role percentages (should sum to 100%):")
    print(champion_role_rates[first_champ])
    print(f"Sum: {sum(champion_role_rates[first_champ].values()):.1f}%")

    print(f"\nChampion {first_champ} global play rates:")
    print(champion_role_playrates[first_champ])

    return champion_role_rates, champion_role_playrates


def save_role_statistics(
    champion_role_rates: Dict,
    champion_role_playrates: Dict,
    encoders_path: str = ENCODERS_PATH,
):
    """
    Write both role-statistics dicts to JSON, keyed by original champion IDs.

    The incoming dicts are keyed by encoded champion ids. Each key is mapped
    back through the ``"champion_ids"`` label encoder loaded from
    ``encoders_path`` and stored as a string, since JSON object keys must be
    strings.

    Two files are written next to ``ROLE_CHAMPIONS_PATH``:
    ``champion_role_rates.json`` and ``champion_role_playrates.json``.

    Args:
        champion_role_rates: Dict mapping encoded champion id to per-role values
        champion_role_playrates: Dict mapping encoded champion id to per-role values
        encoders_path: Path of the pickled label encoders
    """
    # NOTE: pickle.load can execute arbitrary code; only point this at a
    # trusted encoders file.
    with open(encoders_path, "rb") as encoders_file:
        encoders = pickle.load(encoders_file)

    def decode_keys(stats_by_encoded_id):
        # Rebuild the mapping with original champion ids (as strings) as keys.
        decoded = {}
        for encoded_id, role_values in stats_by_encoded_id.items():
            original_id = encoders["champion_ids"].inverse_transform([encoded_id])[0]
            decoded[str(int(original_id))] = role_values
        return decoded

    # Decode both mappings before touching any output file
    rates_serializable = decode_keys(champion_role_rates)
    playrates_serializable = decode_keys(champion_role_playrates)

    # Both files live in the directory that holds ROLE_CHAMPIONS_PATH
    output_dir = os.path.dirname(ROLE_CHAMPIONS_PATH)

    rates_path = os.path.join(output_dir, "champion_role_rates.json")
    with open(rates_path, "w") as out_file:
        json.dump(rates_serializable, out_file, indent=2)

    playrates_path = os.path.join(output_dir, "champion_role_playrates.json")
    with open(playrates_path, "w") as out_file:
        json.dump(playrates_serializable, out_file, indent=2)

In [None]:
# Compute both statistics tables from the default prepared-data directory.
role_rates, role_playrates = generate_role_statistics()

In [None]:
# Bare expression: as the last statement of a notebook cell this displays the
# per-champion role percentages for inspection.
role_rates

In [None]:
# Bare expression: as the last statement of a notebook cell this displays the
# global role play rates for inspection.
role_playrates

In [None]:
# Persist both tables as JSON files, using the default encoders path.
save_role_statistics(role_rates, role_playrates)