In [9]:
import pandas as pd
import numpy as np
import json
import os

from PIL import Image
import requests
from io import BytesIO

from torch.utils.data import Dataset

In [None]:
# --- Static game data -------------------------------------------------------
# Items: keep an entry when it is tier 3+, or tier 2 and either ranked "BOOTS"
# or one of a hand-picked list of ids, or ranked "STARTER".
# NOTE(review): the meaning of the hand-picked ids is not documented here.
with open(os.path.join("game_data", "items.json"), "r", encoding="utf-8") as fp:
    item_dict = json.load(fp)

ITEMS = {
    item_id: item_info
    for item_id, item_info in item_dict.items()
    if (
        item_info["tier"] >= 3
        or (
            item_info["tier"] == 2
            and (
                "BOOTS" in item_info["rank"]
                or item_info["id"] in [3004, 3003, 3119, 3010, 3866]
            )
        )
        or "STARTER" in item_info["rank"]
    )
}

# Champions: lookup tables derived from the champion file, keyed by its
# top-level keys (used as champion names throughout this notebook).
with open(os.path.join("game_data", "champions.json"), "r", encoding="utf-8") as fp:
    CHAMP_DICT = json.load(fp)

CHAMP_NAME_TO_ID = {name: info["id"] for name, info in CHAMP_DICT.items()}
CHAMP_ID_TO_NAME = {info["id"]: name for name, info in CHAMP_DICT.items()}
CHAMPION_ICONS = {name: info["icon"] for name, info in CHAMP_DICT.items()}

# Perks: the parsed file is used as-is.
with open(os.path.join("game_data", "perks.json"), "r", encoding="utf-8") as fp:
    PERK_ID_TO_NAME = json.load(fp)

def get_item_name(item_id: int):
    """Look up ``item_id`` in ``ITEMS``.

    The id is normalised to the string form of its integer value before the
    lookup. Returns ``""`` when the id is not present in ``ITEMS``.

    NOTE(review): despite the name, a hit returns the whole ``ITEMS`` entry
    (the per-item info object), not a name string -- confirm this is intended.
    """
    lookup_key = str(int(item_id))
    return ITEMS.get(lookup_key, "")

def get_item_icon(item_id: int):
    """Download the icon of an item and return it as a PIL image.

    Args:
        item_id: Numeric item id. Falsy values (``None``, ``0``) mean
            "no item".

    Returns:
        ``None`` when ``item_id`` is falsy; otherwise a ``PIL.Image``. When
        the item is not in ``ITEMS`` (or has no ``"icon"`` entry) the
        placeholder "empty" icon is downloaded instead. If the download or
        the image decoding fails, a blank 64x64 RGB image is returned.
    """
    if not item_id:
        return None

    default_url = "https://raw.communitydragon.org/latest/plugins/rcp-be-lol-game-data/global/default/data/spells/icons2d/summoner_empty.png"

    # ITEMS is built from a JSON object, so its keys are strings; normalise the
    # id the same way get_item_name does, otherwise numeric ids never match.
    try:
        item_key = str(int(item_id))
    except (TypeError, ValueError, OverflowError):
        item_key = None
    url = ITEMS.get(item_key, {}).get("icon", default_url)

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except Exception:
        # Best effort: any network/decoding problem falls back to a blank icon
        # (KeyboardInterrupt is intentionally no longer swallowed).
        return Image.new("RGB", (64, 64))

def get_champion_id(champion_name: str):
    """Map a champion name to its id through ``CHAMP_NAME_TO_ID``.

    Returns ``None`` when ``champion_name`` is ``None`` and ``""`` when the
    name is not a key of ``CHAMP_NAME_TO_ID``.
    """
    return None if champion_name is None else CHAMP_NAME_TO_ID.get(champion_name, "")

def get_champion_name(champion_id: int):
    """Map a champion id to its name through ``CHAMP_ID_TO_NAME``.

    Returns ``None`` when the id is not a key of ``CHAMP_ID_TO_NAME``.
    """
    champion_name = CHAMP_ID_TO_NAME.get(champion_id)
    return champion_name

def get_champion_icon(champion_name: str):
    """Download the icon of a champion and return it as a PIL image.

    Args:
        champion_name: Champion name, looked up in ``CHAMPION_ICONS``.
            Falsy values (``None``, ``""``) mean "no champion".

    Returns:
        ``None`` when ``champion_name`` is falsy; otherwise a ``PIL.Image``.
        A blank 64x64 RGB image is returned when the champion has no known
        icon URL or when the download / image decoding fails.
    """
    if not champion_name:
        return None

    url = CHAMPION_ICONS.get(champion_name)
    if url is None:
        # Unknown champion: skip the doomed request and use the placeholder.
        return Image.new("RGB", (64, 64))

    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except Exception:
        # Best effort: any network/decoding problem falls back to a blank icon
        # (KeyboardInterrupt is intentionally no longer swallowed).
        return Image.new("RGB", (64, 64))

In [11]:
# Lane identifiers, in the fixed order used when enumerating lanes.
LANES = [
    "TOP",
    "JUNGLE",
    "MIDDLE",
    "BOTTOM",
    "UTILITY",
]

# Per-player list-valued fields and the fixed number of slots each one is
# flattened into (shorter lists are zero-padded by build_player_columns).
LIST_SIZE = {
    "items": 6,
    "perks": 6,
    "skills": 4,
}
LIST_COLUMNS = list(LIST_SIZE)

def load_timeline_files(filepaths):
    """Load timeline JSON files into one wide DataFrame (one column per match).

    Each file is expected to be a JSON object mapping a match id to an object
    of frames. Frames are re-keyed by their position (0, 1, 2, ...) and the
    original frame key is stored inside each frame under ``"timeframe"``.

    Args:
        filepaths: Iterable of paths to the JSON files.

    Returns:
        The per-file DataFrames concatenated column-wise: columns are match
        ids, the index is the frame position and each cell is a frame dict.
    """
    per_file_frames = []
    for json_path in filepaths:
        with open(json_path, "r", encoding="UTF-8") as handle:
            raw_matches = json.load(handle)

        reindexed = {}
        for match_id, timeline in raw_matches.items():
            frames_by_position = {}
            for position, (timestamp, frame) in enumerate(timeline.items()):
                frames_by_position[position] = {**frame, "timeframe": timestamp}
            reindexed[match_id] = frames_by_position

        per_file_frames.append(pd.DataFrame(reindexed))
    return pd.concat(per_file_frames, axis=1)

def flatten_match(row):
    """Turn one match row (a frame dict per numeric label) into a frame table.

    Every index label that looks like a non-negative integer and holds a
    non-null value is treated as a frame. Each frame becomes one column of
    the result, holding the match id plus the frame's own fields.

    Returns:
        DataFrame with one column per frame (fields along the index).
    """
    frame_columns = []
    for label in row.index:
        if not str(label).isdigit() or not pd.notna(row[label]):
            continue
        frame = row[int(label)]
        frame_columns.append(pd.Series({"matchId": row["matchId"], **frame}))
    return pd.concat(frame_columns, axis=1)

def generate_timeframe_rows(df):
    """Reshape the wide match table into one row per (match, frame).

    Args:
        df: Table with one column per match, as produced by
            ``load_timeline_files``.

    Returns:
        DataFrame with a fresh integer index where every row is a single
        frame of a single match.
    """
    matches = df.T.reset_index(names="matchId")
    per_match_rows = []
    for _, match_row in matches.iterrows():
        # flatten_match yields one column per frame; transpose to rows.
        per_match_rows.append(flatten_match(match_row).T)
    return pd.concat(per_match_rows, ignore_index=True)

def build_player_columns(player_id, player_info):
    """Flatten one player's info into ``"<player_id>_<field>"`` columns.

    Scalar fields are copied as-is. Fields listed in ``LIST_COLUMNS`` are
    expanded into ``LIST_SIZE[field]`` numbered slots
    (``"<player_id>_<field>_<slot>"``); missing slots are filled with ``0``
    and extra entries are dropped.

    Returns:
        dict mapping the generated column names to their values.
    """
    columns = {}
    for field, value in player_info.items():
        if field in LIST_COLUMNS:
            continue
        columns[f"{player_id}_{field}"] = value

    for list_field in LIST_COLUMNS:
        entries = player_info.get(list_field, [])
        for slot in range(LIST_SIZE[list_field]):
            columns[f"{player_id}_{list_field}_{slot}"] = (
                entries[slot] if slot < len(entries) else 0
            )
    return columns

def expand_participants(df):
    """Expand the ten per-participant cells of every row into flat columns.

    Each row keeps ``matchId`` and ``timeframe``; the values stored under the
    columns ``"1"`` .. ``"10"`` are flattened with ``build_player_columns``.
    """
    def _widen(row):
        pieces = [row[["matchId", "timeframe"]]]
        for pid in range(1, 11):
            pieces.append(pd.Series(build_player_columns(pid, row[str(pid)])))
        return pd.concat(pieces)

    return df.apply(_widen, axis=1)

def keep_valid_matches(df, lanes=None):
    """Keep only the rows where every lane is occupied by exactly two players.

    Exactly two players per lane is the expected layout (hopefully one on
    each side, but the side is not checked here).

    Args:
        df: Frame table with one ``"<participant>_lane"`` column per player.
        lanes: Lane names that must each appear exactly twice in a row.
            Defaults to the module-level ``LANES``.

    Returns:
        The subset of ``df`` whose rows satisfy the lane layout.
    """
    if lanes is None:
        lanes = LANES
    lane_cols = [col for col in df.columns if str(col).endswith("_lane")]
    lane_counts = df[lane_cols].apply(pd.Series.value_counts, axis=1)
    # reindex instead of plain column selection: a lane that never shows up in
    # this batch has no count column at all, and must read as "not valid"
    # (NaN != 2) rather than raise a KeyError.
    per_lane = lane_counts.reindex(columns=list(lanes))
    return df[per_lane.eq(2).all(axis=1)]

def add_stats_cols(df):
    """Append per-player ``kda`` and ``relativeGold`` columns.

    For each participant 1..10:
      * ``<i>_kda`` = (kills + assists) / deaths, with 0 deaths counted as 1.
      * ``<i>_relativeGold`` = the player's ``goldEarned`` divided by the
        summed ``goldEarned`` of team 100 when the player's ``teamId`` is
        100, and of team 200 otherwise.

    Returns:
        A new DataFrame: a copy of ``df`` with the new columns appended.
    """
    frame = df.copy()
    participant_ids = range(1, 11)

    def _team_total_gold(team_id):
        # Zero out the gold of players on the other team, then sum per row.
        masked = {}
        for pid in participant_ids:
            on_team = frame[f"{pid}_teamId"] == team_id
            masked[pid] = frame[f"{pid}_goldEarned"].where(on_team, 0)
        return pd.DataFrame(masked).sum(axis=1)

    gold_team_100 = _team_total_gold(100)
    gold_team_200 = _team_total_gold(200)

    derived = {}
    for pid in participant_ids:
        takedowns = frame[f"{pid}_kills"] + frame[f"{pid}_assists"]
        safe_deaths = frame[f"{pid}_deaths"].replace(0, 1)
        derived[f"{pid}_kda"] = takedowns / safe_deaths

        plays_for_100 = frame[f"{pid}_teamId"] == 100
        own_team_gold = gold_team_100.where(plays_for_100, gold_team_200)
        derived[f"{pid}_relativeGold"] = frame[f"{pid}_goldEarned"] / own_team_gold

    return pd.concat([frame, pd.DataFrame(derived)], axis=1)

# don't modify anything below this line
# Small two-file sample used to exercise the whole pipeline interactively.
filepaths = [
    "features/timeline/timeline_features_ID492.json",
    "features/timeline/timeline_features_ID493.json"
]

# Alternative input: the first 100 files of the timeline folder (listing order).
#filepaths = [os.path.join("features", "timeline", fn) for fn in os.listdir(os.path.join("features", "timeline"))[:100]]

# Each stage gets its own name so intermediate results stay inspectable.
df = load_timeline_files(filepaths)                    # one column per match
df_timeframes = generate_timeframe_rows(df)            # one row per (match, frame)
df_participants = expand_participants(df_timeframes)   # flat per-player columns
df_valid = keep_valid_matches(df_participants)         # rows with a valid lane layout
df_stats = add_stats_cols(df_valid)                    # + kda / relativeGold
df_stats

Unnamed: 0,matchId,timeframe,1_kills,1_deaths,1_assists,1_currentGold,1_goldEarned,1_level,1_minionsKilled,1_dragonKills,...,6_kda,6_relativeGold,7_kda,7_relativeGold,8_kda,8_relativeGold,9_kda,9_relativeGold,10_kda,10_relativeGold
0,KR_7679502919,60001,0,0,0,0,500,1,0,0,...,0.00,0.200000,0.000000,0.200000,0.000000,0.200000,0.000000,0.200000,0.000000,0.200000
1,KR_7679502919,240122,0,0,0,660,1160,4,16,0,...,0.00,0.207103,0.000000,0.229646,0.000000,0.210347,0.000000,0.202400,0.000000,0.150503
2,KR_7679502919,360158,0,1,0,1289,1789,6,35,0,...,1.00,0.231830,1.000000,0.220447,0.000000,0.209586,0.000000,0.196220,0.000000,0.141917
3,KR_7679502919,420164,0,1,1,374,2224,6,42,0,...,2.00,0.229672,2.000000,0.242801,0.000000,0.205514,0.000000,0.189059,0.000000,0.132954
4,KR_7679502919,540203,0,1,1,514,2764,7,54,0,...,1.00,0.229920,3.000000,0.248059,0.000000,0.196653,1.000000,0.185207,0.500000,0.140161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1007,KR_7669628026,1320529,4,2,1,216,12229,15,222,1,...,0.75,0.215398,1.250000,0.177198,1.000000,0.206162,2.666667,0.261166,3.000000,0.140075
1008,KR_7669628026,1380534,5,2,2,1055,13038,15,231,1,...,1.25,0.212604,1.400000,0.178801,1.666667,0.217783,2.250000,0.254080,3.500000,0.136732
1009,KR_7669628026,1560605,6,2,4,169,14886,16,260,1,...,1.00,0.213047,1.500000,0.172799,2.000000,0.226872,1.800000,0.251674,4.000000,0.135608
1010,KR_7669628026,1620625,6,2,5,959,15676,17,272,1,...,1.00,0.213490,1.285714,0.168969,2.000000,0.231912,1.800000,0.249528,4.000000,0.136101


In [14]:
# Zero-initialised one-hot templates. A fresh copy of BASE_OHE_DICT is the
# starting point of every generated row, so all rows share these columns.

# One column per (role prefix, champion name) pair.
BASE_CHAMPION_OHE_DICT = {
    f"{prefix}_{champName}": 0
    for prefix in ("player", "ally", "enemy")
    for champName in CHAMP_NAME_TO_ID
}

# One column per key of the perk table / of the filtered item table.
BASE_PERK_OHE_DICT = dict.fromkeys((f"perk_{perk}" for perk in PERK_ID_TO_NAME), 0)
BASE_ITEM_OHE_DICT = dict.fromkeys((f"item_{item}" for item in ITEMS), 0)

# Champions first, then perks, then items.
BASE_OHE_DICT = dict(BASE_CHAMPION_OHE_DICT)
BASE_OHE_DICT.update(BASE_PERK_OHE_DICT)
BASE_OHE_DICT.update(BASE_ITEM_OHE_DICT)

# Per-player columns copied verbatim (without the participant prefix) into
# each generated row: scalar stats, then the item slots, then the skill slots.
KEPT_PLAYER_COLS = [
    "kda", "level", "goldEarned", "relativeGold",
    "minionsKilled", "participantId", "origin",
    *(f"items_{i}" for i in range(LIST_SIZE["items"])),  # 6
    *(f"skills_{i}" for i in range(LIST_SIZE["skills"])),  # 4
]

# Team-level objective counters read from the per-player columns.
OBJECTIVES_COLS = [
    "voidgrubKills",
    "atakhanKills",
    "baronKills",
    "dragonKills",
    "structuresKilled",
    "heraldKills",
]

# Rank names; a rank is encoded as its position in this list (None -> 0).
RANKS = [
    None,
    "IRON", "BRONZE", "SILVER", "GOLD", "PLATINUM",
    "EMERALD", "DIAMOND", "MASTER", "GRANDMASTER", "CHALLENGER",
]

# Takes some time to process
def get_team_champions_and_objectives(row):
    """Group one frame's per-player data by team and lane.

    Returns:
        ``(champions, objectives)`` where
        ``champions[team][lane]`` holds ``championName``, ``kda``, ``level``
        and ``minionsKilled`` of the player on that team and lane, and
        ``objectives[team]`` holds the ``OBJECTIVES_COLS`` values taken from
        the first participant seen for that team.

    NOTE(review): if two players of the same team share a lane, the later one
    overwrites the earlier and the other team's slot for that lane stays an
    empty dict -- confirm upstream filtering rules this out.
    """
    team_ids = (100, 200)
    champions = {team_id: {lane: {} for lane in LANES} for team_id in team_ids}
    objectives = {team_id: {} for team_id in team_ids}
    tracked_features = ("championName", "kda", "level", "minionsKilled")

    for pid in range(1, 11):
        team_id = row[f"{pid}_teamId"]
        lane_slot = champions[team_id][row[f"{pid}_lane"]]
        for feature in tracked_features:
            lane_slot[feature] = row[f"{pid}_{feature}"]

        # Objective information is the same for all participants in the same
        # team, so it is only read once per team.
        if not objectives[team_id]:
            objectives[team_id].update(
                {obj: row[f"{pid}_{obj}"] for obj in OBJECTIVES_COLS}
            )
    return champions, objectives

def build_new_row(row, base_dict, kept_cols, participantId, champions, objectives):
    """Build the feature row of one participant for one frame.

    Args:
        row: One frame of a match with flat ``"<participant>_<field>"``
            columns.
        base_dict: Zero-initialised one-hot template; it is copied, never
            mutated.
        kept_cols: Per-player fields copied as-is (without the participant
            prefix) into the new row.
        participantId: Participant number (1..10) the row is built for.
        champions: ``champions[team][lane]`` info, as returned by
            ``get_team_champions_and_objectives``.
        objectives: ``objectives[team]`` counters, from the same function.

    Returns:
        dict of features, or ``None`` when the participant's ``boughtItem``
        flag is falsy for this frame.

    Raises:
        ValueError: if the participant's ``tier`` is not listed in ``RANKS``.
    """
    if not row[f"{participantId}_boughtItem"]:
        return None

    team = row[f"{participantId}_teamId"]
    enemy_team = 200 if team == 100 else 100
    playerLane = row[f"{participantId}_lane"]

    ally_champs = champions[team]
    enemy_champs = champions[enemy_team]

    ally_obj = objectives[team]
    enemy_obj = objectives[enemy_team]

    new_row = {**base_dict}

    # Player info
    player_champion = get_champion_name(row[f"{participantId}_championId"])
    new_row[f"player_{player_champion}"] = 1
    for c in kept_cols:
        new_row[c] = row[f"{participantId}_{c}"]
    # Rank is encoded as its position in RANKS.
    new_row["tier"] = RANKS.index(row[f"{participantId}_tier"])

    # OHE for allies and enemies (the player is not counted among the allies)
    for info in ally_champs.values():
        champ = info["championName"]
        if champ != player_champion:
            new_row[f"ally_{champ}"] = 1
    for info in enemy_champs.values():
        new_row[f"enemy_{info['championName']}"] = 1

    # OHE for perk
    for i in range(LIST_SIZE["perks"]):
        perk_id = row[f"{participantId}_perks_{i}"]
        new_row[f"perk_{perk_id}"] = 1

    # OHE for item (non-positive ids are empty slots)
    for i in range(LIST_SIZE["items"]):
        item = row[f"{participantId}_items_{i}"]
        if item > 0:
            new_row[f"item_{item}"] = 1

    # Known info about other players (outside of items and perks).
    # Allies are numbered 0..3 in lane order, skipping the player's own lane;
    # enemies are numbered 0..4 in lane order.
    ally_index = 0
    for enemy_index, lane in enumerate(LANES):
        is_teammate_lane = lane != playerLane
        for feature in ["kda", "level", "minionsKilled"]:
            if is_teammate_lane:
                new_row[f"ally_{ally_index}_{feature}"] = ally_champs[lane][feature]
            # Fix: the enemy columns must come from the enemy team (they were
            # previously filled with the ally team's values).
            new_row[f"enemy_{enemy_index}_{feature}"] = enemy_champs[lane][feature]
        if is_teammate_lane:
            ally_index += 1

    # Objectives
    for feature in ally_obj.keys():
        new_row[f"ally_{feature}"] = ally_obj[feature]
        new_row[f"enemy_{feature}"] = enemy_obj[feature]

    # OHE for lane
    new_row[playerLane] = 1

    return new_row

def expand_match(row):
    """Expand one frame into up to ten per-player feature rows.

    A row is produced for every participant for which ``build_new_row``
    returns a non-empty result; each gets the frame's ``matchId`` and its
    ``timeframe`` converted to ``int``.

    Returns:
        DataFrame with one row per retained participant (possibly empty).
    """
    champions_by_team, objectives_by_team = get_team_champions_and_objectives(row)
    shared_info = {"matchId": row["matchId"], "timeframe": int(row["timeframe"])}

    records = []
    for pid in range(1, 11):
        built = build_new_row(
            row, BASE_OHE_DICT, KEPT_PLAYER_COLS, pid,
            champions_by_team, objectives_by_team,
        )
        if not built:
            continue
        records.append({**built, **shared_info})

    return pd.DataFrame(records)

# One small table per frame, stacked into a single player-level table.
# Columns missing from some rows are filled with 0.
expanded_list = df_stats.apply(expand_match, axis=1).tolist()
player_df = pd.concat(expanded_list, axis=0, ignore_index=True).fillna(0)

# Move the identifying columns to the front, keeping every other column in
# its current relative order.
leading_cols = [col for col in player_df.columns if col in ["matchId", "timeframe"]]
trailing_cols = [col for col in player_df.columns if col not in ["matchId", "timeframe"]]
player_df = player_df.reindex(leading_cols + trailing_cols, axis=1)

print(f"Number of rows generated: {len(player_df)}")
display(player_df)

Number of rows generated: 2326


Unnamed: 0,matchId,timeframe,player_Aatrox,player_Ahri,player_Akali,player_Akshan,player_Alistar,player_Ambessa,player_Amumu,player_Anivia,...,enemy_dragonKills,ally_structuresKilled,enemy_structuresKilled,ally_heraldKills,enemy_heraldKills,TOP,JUNGLE,MIDDLE,BOTTOM,UTILITY
0,KR_7679502919,60001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
1,KR_7679502919,60001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0
2,KR_7679502919,60001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,1.0,0.0,0.0
3,KR_7679502919,60001,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,1.0,0.0
4,KR_7679502919,60001,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2321,KR_7669628026,1620625,0,0,0,0,0,0,0,0,...,1,2,12,0,1,0.0,0.0,1.0,0.0,0.0
2322,KR_7669628026,1620625,0,0,0,0,0,0,0,0,...,1,2,12,0,1,0.0,0.0,0.0,1.0,0.0
2323,KR_7669628026,1620625,0,0,0,0,0,0,0,0,...,1,2,12,0,1,0.0,0.0,0.0,0.0,1.0
2324,KR_7669628026,1717643,0,0,0,0,0,0,0,1,...,2,15,2,1,0,0.0,0.0,1.0,0.0,0.0


In [15]:
# keep the top ~70% of the matches
# Per-lane minimum value of each indicator; a player must reach every
# minimum of their lane to count as high performance.
HIGH_PERFORMANCE_INDICATORS = {
    "TOP":     {"kda": 1.25, "relativeGold": 0.175},
    "JUNGLE":  {"kda": 1.75, "relativeGold": 0.175},
    "MIDDLE":  {"kda": 1.25, "relativeGold": 0.175},
    "BOTTOM":  {"kda": 1.75, "relativeGold": 0.175},
    "UTILITY": {"kda": 1.75, "relativeGold": 0.125},
}

def is_high_performance(row):
    """Tell whether a player row reaches every threshold of its lane.

    The lane is read from the one-hot lane columns (the ones equal to 1);
    the row passes unless one of its indicators is strictly below the
    lane's minimum in ``HIGH_PERFORMANCE_INDICATORS``.
    """
    active_lanes = [name for name in LANES if row[name] == 1]
    thresholds = HIGH_PERFORMANCE_INDICATORS["".join(active_lanes)]
    return all(
        not (row[indicator] < minimum)
        for indicator, minimum in thresholds.items()
    )

def keep_high_performance(df):
    """Keep all rows of the players whose latest row is high performance.

    For every ``(matchId, participantId)`` pair the row with the greatest
    ``timeframe`` is tested with ``is_high_performance``; when it passes,
    every row of that pair is kept.

    Returns:
        DataFrame with ``matchId`` and ``participantId`` as leading columns.
    """
    player_keys = ["matchId", "participantId"]

    # Latest timeframe of each player, joined back to get the full row.
    last_seen = df.groupby(player_keys, as_index=False)["timeframe"].max()
    final_rows = last_seen.merge(df, on=player_keys + ["timeframe"], how="inner")

    passes = final_rows.apply(is_high_performance, axis=1)
    selected_players = final_rows[passes][player_keys].drop_duplicates()

    # Bring back every timeframe of the selected players.
    return selected_players.merge(df, on=player_keys, how="inner")

# Keep only the players whose latest row passes the per-lane thresholds,
# then preview the first rows of the result.
high_df = keep_high_performance(player_df)
high_df.head()

Unnamed: 0,matchId,participantId,timeframe,player_Aatrox,player_Ahri,player_Akali,player_Akshan,player_Alistar,player_Ambessa,player_Amumu,...,enemy_dragonKills,ally_structuresKilled,enemy_structuresKilled,ally_heraldKills,enemy_heraldKills,TOP,JUNGLE,MIDDLE,BOTTOM,UTILITY
0,KR_7654434335,1,60009,0,0,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
1,KR_7654434335,1,420202,0,0,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,0.0,0.0,0.0
2,KR_7654434335,1,1020320,0,0,0,0,0,0,0,...,1,1,1,1,0,1.0,0.0,0.0,0.0,0.0
3,KR_7654434335,1,1320422,0,0,0,0,0,0,0,...,1,3,3,1,0,1.0,0.0,0.0,0.0,0.0
4,KR_7654434335,2,60009,0,0,0,0,0,0,0,...,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0


In [16]:
def build_dataset(file_list, out_path="features/dataset/timeline", batch_size=4):
    """Run the full feature pipeline over ``file_list`` and save it in batches.

    Files are processed ``batch_size`` at a time; every batch is written to
    ``<out_path>/<first id>_<last id>.parquet``, where the ids are taken from
    the first and last file names of the batch
    (``timeline_features_ID<id>.json``).

    Args:
        file_list: Paths of the timeline JSON files, in processing order.
        out_path: Output folder; created when missing.
        batch_size: Number of input files per output parquet file.
    """
    # Create the output folder up front (as TimelineDataset does); otherwise
    # the first to_parquet call fails when the folder does not exist yet.
    os.makedirs(out_path, exist_ok=True)

    for start in range(0, len(file_list), batch_size):
        batch_files = file_list[start:start + batch_size]

        df = load_timeline_files(batch_files)
        df = generate_timeframe_rows(df)
        df = expand_participants(df)
        df = keep_valid_matches(df)
        df = add_stats_cols(df)
        expanded_list = df.apply(expand_match, axis=1).tolist()
        df = pd.concat(expanded_list, ignore_index=True, axis=0).fillna(0)
        df = keep_high_performance(df)

        start_file = os.path.basename(batch_files[0])
        end_file = os.path.basename(batch_files[-1])
        start_id = start_file.replace("timeline_features_ID", "").replace(".json", "")
        end_id = end_file.replace("timeline_features_ID", "").replace(".json", "")

        out_file = os.path.join(out_path, f"{start_id}_{end_id}.parquet")
        df.to_parquet(out_file)

In [17]:
class TimelineDataset(Dataset):
    """Row-level dataset backed by the parquet files of the timeline pipeline.

    On construction the dataset indexes the parquet files already present in
    ``out_path``, then processes the not-yet-processed input files in batches
    and indexes the newly written parquet files as well. Each item is one
    row of one parquet file, returned as a plain dict.

    Attributes:
        data_index: list of ``(parquet_path, row_position)`` pairs; its
            length is the dataset length.
    """

    def __init__(self, file_list, out_path="features/dataset/timeline", batch_size=4, clean=False):
        """Index existing outputs and process the remaining input files.

        Args:
            file_list: Paths of the timeline JSON files, in processing order.
            out_path: Folder holding the parquet files; created when missing.
            batch_size: Number of input files per output parquet file.
            clean: When true, delete every ``.parquet`` file in ``out_path``
                before doing anything else.
        """
        self.data_index = []
        # Most recently loaded parquet file, reused by __getitem__.
        self._cached_path = None
        self._cached_df = None
        os.makedirs(out_path, exist_ok=True)

        # Step 1: Clean if requested
        if clean:
            for fn in os.listdir(out_path):
                if fn.endswith(".parquet"):
                    os.remove(os.path.join(out_path, fn))

        # Step 2: Load matchIds already processed.
        # Sorted so the index order does not depend on the directory listing.
        processed_ids = set()
        for fn in sorted(os.listdir(out_path)):
            if not fn.endswith(".parquet"):
                continue
            fpath = os.path.join(out_path, fn)
            try:
                df = pd.read_parquet(fpath, columns=["matchId"])
                processed_ids.update(df["matchId"].unique())
                self.data_index.extend([(fpath, i) for i in range(len(df))])
            except Exception as e:
                print(f"Warning: Couldn't read {fn}: {e}")

        print(f"Already processed {len(processed_ids)} matchIds.")

        # Step 3: Skip the leading run of files that were already processed.
        # Scanning stops at the first file sharing no matchId with the
        # existing outputs; that file and everything after it get processed.
        start_index = 0
        for idx, path in enumerate(file_list):
            with open(path, "r", encoding="utf-8") as fp:
                match_ids = set(json.load(fp).keys())
            if match_ids & processed_ids:
                start_index = idx + 1  # Start processing from the *next* file
            else:
                break  # First file with no processed match: resume from here

        # Step 4: Process only unprocessed files
        remaining_files = file_list[start_index:]
        print(f"Processing {len(remaining_files)} new files...")

        for start in range(0, len(remaining_files), batch_size):
            batch_files = remaining_files[start:start + batch_size]
            print(f"Starting processing from {batch_files[0]} up to {batch_files[-1]}")

            try:
                out_file, n_rows = self._process_batch(batch_files, out_path)
                self.data_index.extend([(out_file, i) for i in range(n_rows)])
                print(
                    f"Appended {n_rows} rows, from {os.path.basename(batch_files[0])} "
                    f"up to {os.path.basename(batch_files[-1])}"
                )
            except KeyboardInterrupt:
                # Stop processing but keep what has been indexed so far.
                break
            except Exception as e:
                # Best effort: a failing batch is reported and skipped.
                print(f"Error when processing from {batch_files[0]} up to {batch_files[-1]}")
                print(str(e))
                continue

    @staticmethod
    def _process_batch(batch_files, out_path):
        """Run the pipeline on one batch and write it; return (path, n_rows)."""
        print("load_timeline_files")
        df = load_timeline_files(batch_files)
        print("generate_timeframe_rows")
        df = generate_timeframe_rows(df)
        print("expand_participants")
        df = expand_participants(df)
        print("keep_valid_matches")
        df = keep_valid_matches(df)
        print("add_stats_cols")
        df = add_stats_cols(df)
        print("expand_match")
        expanded_list = df.apply(expand_match, axis=1).tolist()
        print("concat")
        df = pd.concat(expanded_list, ignore_index=True, axis=0).fillna(0)
        print("keep_high_performance")
        df = keep_high_performance(df)

        start_file = os.path.basename(batch_files[0])
        end_file = os.path.basename(batch_files[-1])

        start_id = start_file.replace("timeline_features_ID", "").replace(".json", "")
        end_id = end_file.replace("timeline_features_ID", "").replace(".json", "")

        out_file = os.path.join(out_path, f"{start_id}_{end_id}.parquet")
        df.to_parquet(out_file)
        return out_file, len(df)

    def __len__(self):
        """Number of indexed rows across all parquet files."""
        return len(self.data_index)

    def __getitem__(self, idx):
        """Return row ``idx`` as a ``{column: value}`` dict.

        ``pd.read_parquet`` has no ``skiprows`` / ``nrows`` options, so the
        whole file is loaded and the row is selected by position. The last
        loaded file is kept in memory so consecutive reads from the same file
        do not reload it.

        NOTE(review): the cache holds one full parquet file in memory --
        confirm this fits the available RAM for the batch sizes in use.
        """
        fpath, row_idx = self.data_index[idx]
        if getattr(self, "_cached_path", None) != fpath:
            self._cached_df = pd.read_parquet(fpath)
            self._cached_path = fpath
        return self._cached_df.iloc[row_idx].to_dict()

In [None]:
import re

# Build the dataset from every timeline file, ordered by the numeric id in its
# name (timeline_features_ID<id>.json).
folder = os.path.join("features", "timeline")
id_pattern = re.compile(r'ID(\d+)')
# Ignore anything in the folder that does not carry an id (checkpoints, OS
# metadata files, ...): such entries used to crash the sort key below.
files = [os.path.join(folder, fn) for fn in os.listdir(folder) if id_pattern.search(fn)]
files = sorted(files, key=lambda x: int(id_pattern.search(os.path.basename(x)).group(1)))
dataset = TimelineDataset(files, batch_size=500, out_path="features/dataset/timeline")

Already processed 79519 matchIds.
Processing 500 new files...
Starting processing from features\timeline\timeline_features_ID0.json up to features\timeline\timeline_features_ID499.json
load_timeline_files
generate_timeframe_rows
expand_participants


In [21]:
# Load one of the generated batch files to inspect its contents.
df = pd.read_parquet("features/dataset/timeline/500_999.parquet")
df

Unnamed: 0,matchId,participantId,player_Aatrox,player_Ahri,player_Akali,player_Akshan,player_Alistar,player_Ambessa,player_Amumu,player_Anivia,...,ally_structuresKilled,enemy_structuresKilled,ally_heraldKills,enemy_heraldKills,JUNGLE,timeframe,MIDDLE,BOTTOM,TOP,UTILITY
0,BR1_3097977881,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,60025,0.0,0.0,1.0,0.0
1,BR1_3097977881,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0.0,720321,0.0,0.0,1.0,0.0
2,BR1_3097977881,1,0,0,0,0,0,0,0,0,...,9,1,1,0,0.0,1200464,0.0,0.0,1.0,0.0
3,BR1_3097977881,2,0,0,0,0,0,0,0,0,...,0,0,0,0,1.0,60025,0.0,0.0,0.0,0.0
4,BR1_3097977881,2,0,0,0,0,0,0,0,0,...,0,0,0,0,1.0,660310,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
359135,TR1_1622857142,10,0,0,0,0,0,0,0,0,...,1,1,0,0,0.0,960339,0.0,0.0,0.0,1.0
359136,TR1_1622857142,10,0,0,0,0,0,0,0,0,...,1,4,0,1,0.0,1200383,0.0,0.0,0.0,1.0
359137,TR1_1622857142,10,0,0,0,0,0,0,0,0,...,2,6,0,1,0.0,1560460,0.0,0.0,0.0,1.0
359138,TR1_1622857142,10,0,0,0,0,0,0,0,0,...,2,6,0,1,0.0,1740535,0.0,0.0,0.0,1.0
