In [None]:
import os
import glob
import pyarrow.parquet as pq
import numpy as np
from collections import defaultdict
from typing import Dict, List
from utils.match_prediction import PREPARED_DATA_DIR

def generate_role_champion_mapping(
    prepared_data_dir: str = PREPARED_DATA_DIR,
    min_playrate_threshold: float = 0.001  # 0.1% minimum playrate
) -> Dict[str, List[int]]:
    """
    Generate role-champion mapping based on latest patch data.

    Every ``train*.parquet`` file under ``<prepared_data_dir>/train`` is scanned
    in a single pass. Whenever a row carries a higher ``numerical_patch`` than
    any row seen so far, all counters are reset, so the final statistics only
    cover games of the highest patch found; rows of older patches are skipped.

    Each champion is assigned to at most one role: the role it was seen in most
    often (the first such role on a tie), provided that role's playrate --
    games in that role divided by the total number of games counted -- is at
    least ``min_playrate_threshold``. Champions that do not reach the threshold
    are left out of the mapping.

    Args:
        prepared_data_dir: Directory containing the prepared match data
        min_playrate_threshold: Minimum playrate threshold for a champion to be considered viable in a role

    Returns:
        Dict keyed by 'TOP', 'JUNGLE', 'MID', 'BOT' and 'UTILITY'. Each value is
        the ascending list of champion IDs (as stored in the ``champion_ids``
        column, converted to plain ``int``) whose primary role is that key.
        All lists are empty when no matching files or rows are found.
    """
    # Role of each position in ``champion_ids``: five roles, repeated for both
    # teams. Loop-invariant, so it is built once instead of once per row.
    # NOTE(review): assumes champion_ids holds 10 entries in this order; zip()
    # silently truncates otherwise -- confirm against the data preparation step.
    roles = ['TOP', 'JUNGLE', 'MID', 'BOT', 'UTILITY'] * 2
    needed_columns = ['numerical_patch', 'champion_ids']

    # Get all parquet files
    data_files = sorted(glob.glob(os.path.join(prepared_data_dir, "train", "train*.parquet")))

    # Initialize counters
    latest_patch = -1
    champion_role_games = defaultdict(lambda: defaultdict(int))  # {champion_id: {role: game_count}}
    total_games = 0

    # Process each file
    for file_path in data_files:
        parquet_file = pq.ParquetFile(file_path)

        # Only the two columns used below are read from disk.
        for batch in parquet_file.iter_batches(columns=needed_columns):
            df_chunk = batch.to_pandas()

            # Walk the two columns in lockstep; this avoids materialising a
            # pandas Series per row the way iterrows() does.
            for patch, champion_ids in zip(df_chunk['numerical_patch'], df_chunk['champion_ids']):
                # If we find a newer patch, reset our statistics
                if patch > latest_patch:
                    print(f"Found newer patch {patch}, resetting statistics")
                    latest_patch = patch
                    champion_role_games.clear()
                    total_games = 0
                elif patch < latest_patch:
                    continue

                # Update statistics for this game
                for champ_id, role in zip(champion_ids, roles):
                    champion_role_games[champ_id][role] += 1
                total_games += 1

    print(f"\nProcessed {total_games} games from patch {latest_patch}")

    role_champions: Dict[str, List[int]] = {
        'TOP': [],
        'JUNGLE': [],
        'MID': [],
        'BOT': [],
        'UTILITY': []
    }

    # For each champion, find their most played role and keep it only if its
    # playrate reaches the threshold. The most played role is also the one with
    # the highest playrate, and max() returns the first maximum on ties.
    for champ_id, role_games in champion_role_games.items():
        primary_role, primary_games = max(role_games.items(), key=lambda item: item[1])
        # total_games >= 1 here: a champion is only recorded alongside a counted game.
        if primary_games / total_games >= min_playrate_threshold:
            # int() turns numpy integer scalars into plain ints, matching the
            # declared List[int] return type.
            role_champions[primary_role].append(int(champ_id))

    # Print summary
    print("\nRole assignments summary:")
    for role, champions in role_champions.items():
        print(f"{role}: {len(champions)} champions")

    # Sort champion lists for consistency
    for champions in role_champions.values():
        champions.sort()

    return role_champions


In [None]:
# Example usage: build the role -> champion mapping with the default data
# directory and the default 0.1% playrate threshold. The call walks every
# train parquet file row by row, so it may take a while on a large dataset.
role_mapping = generate_role_champion_mapping()

In [None]:
# Inspect the raw mapping; the champion IDs are still label-encoded here and
# are converted back to original IDs in a later cell.
print(role_mapping)

In [None]:
import pickle
from utils.match_prediction import ENCODERS_PATH

# Load label encoders
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point ENCODERS_PATH at a file produced by this project.
with open(ENCODERS_PATH, "rb") as f:
    label_encoders = pickle.load(f)

# Encoder used to turn encoded champion IDs back into original champion IDs.
champion_encoder = label_encoders["champion_ids"]

# Convert role_mapping values from encoded to original champion IDs.
# inverse_transform is called once per role on the whole list instead of once
# per champion. Roles without champions are short-circuited so the encoder is
# never handed an empty list. int() yields plain Python ints for serialization.
role_mapping_serializable = {
    role: (
        [int(original_id) for original_id in champion_encoder.inverse_transform(champions)]
        if champions
        else []
    )
    for role, champions in role_mapping.items()
}

role_mapping_serializable

In [None]:
import json
from utils.rl import ROLE_CHAMPIONS_PATH

# Persist the decoded role -> champion-ID mapping as JSON at ROLE_CHAMPIONS_PATH.
with open(ROLE_CHAMPIONS_PATH, "w") as json_file:
    serialized_mapping = json.dumps(role_mapping_serializable)
    json_file.write(serialized_mapping)