# **Initial Setup for API Access and Project Directory Structure**

I set up my Riot API connection, define the time range of matches I want to analyze, and specify where all data will be stored. This cell basically prepares the entire workspace so I can start downloading matches without running into missing-folder issues.

In [None]:
import os
import time
import json
from pathlib import Path
from datetime import datetime, timezone
import pandas as pd
from tqdm import tqdm
from riotwatcher import LolWatcher, ApiError
# Prefer a key supplied through the RIOT_API_KEY environment variable so the
# secret never has to be stored in the notebook. The literal fallback is only
# a placeholder and deliberately fails the validation below.
API_KEY = os.environ.get("RIOT_API_KEY", "------")
# Explicit raise instead of `assert`: assertions are removed under `python -O`.
if not (API_KEY and API_KEY.startswith("RGAPI-")):
    raise ValueError("Please set a valid Riot API key.")
watcher = LolWatcher(API_KEY)

REGION   = "europe"  # routing value passed to the match endpoints
PLATFORM = "euw1"    # routing value passed to the league (ladder) endpoints
QUEUE_ID = 420       # queue filter passed to the match-list endpoint

# Match-list time window as epoch seconds (UTC), inclusive of the whole last day.
START = int(datetime(2024, 1, 1, tzinfo=timezone.utc).timestamp())
END   = int(datetime(2025, 12, 31, 23, 59, 59, tzinfo=timezone.utc).timestamp())

# On-disk layout: raw API payloads, intermediate tables, processed datasets.
DATA_ROOT     = Path("data")
RAW_MATCH_DIR = DATA_ROOT / "raw" / "matches"
RAW_TL_DIR    = DATA_ROOT / "raw" / "timelines"
PROC_DIR      = DATA_ROOT / "processed"
INTERIM_DIR   = DATA_ROOT / "interim"

# Create every directory up front so later cells can write without checks.
for d in [RAW_MATCH_DIR, RAW_TL_DIR, PROC_DIR, INTERIM_DIR]:
    d.mkdir(parents=True, exist_ok=True)

print("Folders ready.")

Folders ready.


# **Pulling High-Elo Player PUUIDs as Seed Accounts**

I fetch Challenger, Grandmaster, or Master players (whichever responds first) and extract their PUUIDs to use as starting points for gathering match histories. This gives me a clean, high-quality pool of players whose games I want to analyze for objective-timing patterns.

In [None]:
def get_seed_puuids(limit=200):
    """
    Fetch high-elo ladder entries and extract up to ``limit`` unique PUUIDs.

    The Challenger, Grandmaster and Master ladders are queried in that order
    and the first one that returns a non-empty ``entries`` list is used.

    Args:
        limit: Maximum number of PUUIDs to return. ``None`` returns every
            unique PUUID found on the selected ladder.

    Returns:
        A list of unique PUUID strings in ladder-response order, or an empty
        list when no ladder could be fetched.
    """
    tiers = [
        ("challenger_by_queue",   watcher.league.challenger_by_queue),
        ("grandmaster_by_queue", watcher.league.grandmaster_by_queue),
        ("masters_by_queue",     watcher.league.masters_by_queue),
    ]

    entries = []
    for name, func in tiers:
        try:
            data = func(PLATFORM, "RANKED_SOLO_5x5")
            ents = data.get("entries", [])
            if ents:
                print(f"Found {len(ents)} entries in {name}")
                entries = ents
                break
        except Exception as e:
            # Deliberately broad: any failure on one ladder just means the
            # next tier is tried.
            print(f"Failed {name}: {e}")
            continue

    if not entries:
        print("No ladder entries found — check API key / region / platform.")
        return []

    # Filter and de-duplicate first, cap last: entries without a PUUID or with
    # a repeated PUUID must not eat into the requested limit.
    unique = list(dict.fromkeys(e["puuid"] for e in entries if e.get("puuid")))
    puuids = unique[:limit]

    print(f"Collected {len(puuids)} unique PUUIDs from ladder.")
    return puuids


# Collect up to 100 ladder PUUIDs to seed the match-history crawl.
seed_puuids = get_seed_puuids(limit=100)
# Bare expression: displayed as the cell result.
len(seed_puuids)


Found 300 entries in challenger_by_queue
Collected 100 unique PUUIDs from ladder.


100

# **Collecting Match IDs From Each Seed Player’s Ranked History**

I loop through every seed PUUID and pull their recent ranked-solo matches within my chosen time window, handling rate limits as I go. This gives me a master list of match IDs, which I save to a CSV so I don't have to re-query the API later.

In [None]:
MATCH_IDS_CSV = INTERIM_DIR / "match_ids.csv"

def get_match_ids(puuids, pages=3, per_page=100, max_retries=3):
    """
    Fetch recent match IDs for each player PUUID in the given queue and time window. Saves to CSV for reuse.

    Args:
        puuids: Iterable of player PUUIDs to query.
        pages: Maximum number of result pages requested per player.
        per_page: Number of match IDs requested per page.
        max_retries: How many times a rate-limited (HTTP 429) page request is
            retried before the player's remaining pages are abandoned.

    Returns:
        DataFrame with columns ``puuid`` and ``match_id`` (duplicates removed).
        The same table is written to ``MATCH_IDS_CSV``.
    """
    recs = []

    for puuid in tqdm(puuids, desc="Fetching match IDs"):
        for p in range(pages):
            mids = None
            for attempt in range(max_retries + 1):
                try:
                    mids = watcher._match.matchlist_by_puuid(
                        REGION,
                        puuid,
                        start=p * per_page,
                        count=per_page,
                        queue=QUEUE_ID,
                        start_time=START,
                        end_time=END,
                    )
                    break
                except ApiError as e:
                    status = getattr(e.response, "status_code", None)
                    if status == 429 and attempt < max_retries:
                        # Retry the SAME page after waiting; giving up here
                        # would silently drop this player's remaining history.
                        headers = getattr(e.response, "headers", None) or {}
                        try:
                            wait_s = float(headers.get("Retry-After", 2))
                        except (TypeError, ValueError):
                            wait_s = 2.0
                        print(f"Rate limit hit, sleeping {wait_s:g}s...")
                        time.sleep(wait_s)
                        continue
                    print(f"API error {status} for {puuid}")
                    break
                except Exception as ex:
                    print(f"Error for {puuid}: {ex}")
                    break

            if not mids:
                # Empty page or unrecoverable error: move on to the next player.
                break

            recs += [{"puuid": puuid, "match_id": mid} for mid in mids]

            if len(mids) < per_page:
                # A short page is the last page; skip the extra request.
                break
            time.sleep(0.1)

    # Explicit columns keep the schema stable even when nothing was fetched.
    df = pd.DataFrame(recs, columns=["puuid", "match_id"]).drop_duplicates()
    df.to_csv(MATCH_IDS_CSV, index=False)
    print(f"Saved {len(df)} match records → {MATCH_IDS_CSV}")
    return df


# Crawl up to 3 pages of match IDs for every seed player.
match_ids_df = get_match_ids(seed_puuids, pages=3)
# Bare expression: displayed as the cell result.
len(match_ids_df)


Fetching match IDs: 100%|██████████| 100/100 [04:38<00:00,  2.79s/it]

Saved 29721 match records → data/interim/match_ids.csv





29721

# **Downloading Full Match Data and Timelines With Automatic Retry Handling**

I pull down every match and its timeline using robust retry logic so temporary Riot API failures don’t break the pipeline. Each match gets cached locally as a JSON file, letting me safely process thousands of games without re-querying the API.

In [None]:
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

# Retry policy shared by both download helpers: retry only on ApiError, wait
# exponentially (bounded to 1..60), stop after 5 attempts, and re-raise the
# last error so the caller can handle it.
_riot_retry = retry(
    retry=retry_if_exception_type(ApiError),
    wait=wait_exponential(min=1, max=60),
    stop=stop_after_attempt(5),
    reraise=True,
)


@_riot_retry
def fetch_match(mid):
    """Return the match payload for match id ``mid`` via RiotWatcher."""
    match_api = watcher._match
    return match_api.by_id(REGION, mid)


@_riot_retry
def fetch_timeline(mid):
    """Return the timeline payload for match id ``mid`` via RiotWatcher."""
    match_api = watcher._match
    return match_api.timeline_by_match(REGION, mid)

def _write_json_atomic(path, payload):
    """Write ``payload`` as JSON to ``path`` via a temp file and a rename."""
    # An interrupted run must never leave a truncated "<id>.json" behind,
    # because the exists() cache check would treat it as downloaded.
    tmp_path = path.with_name(path.name + ".tmp")
    tmp_path.write_text(json.dumps(payload))
    tmp_path.replace(path)


def download_all(match_ids, limit=None):
    """
    Download all matches and timelines, with retry + caching.
    'limit' lets you cap how many matches you actually download.

    Args:
        match_ids: Sequence of match id strings.
        limit: Optional cap on how many ids (from the front) are processed.

    Returns:
        Tuple ``(ok_m, ok_tl, miss_tl)``: matches downloaded, timelines
        downloaded, and timelines whose download failed. Files already present
        on disk are skipped and not counted.
    """
    mids = match_ids[:limit] if limit is not None else match_ids

    ok_m = ok_tl = miss_tl = 0
    for mid in tqdm(mids, desc="Downloading matches & timelines"):
        m_path = RAW_MATCH_DIR / f"{mid}.json"
        tl_path = RAW_TL_DIR / f"{mid}.json"
        called_api = False

        if not m_path.exists():
            called_api = True
            try:
                m_data = fetch_match(mid)
                _write_json_atomic(m_path, m_data)
                ok_m += 1
            except ApiError as e:
                print(f"Match fetch failed for {mid}: {getattr(e.response, 'status_code', '?')}")
                time.sleep(2)

        if not tl_path.exists():
            called_api = True
            try:
                tl_data = fetch_timeline(mid)
                _write_json_atomic(tl_path, tl_data)
                ok_tl += 1
            except ApiError as e:
                miss_tl += 1
                print(f"Timeline fetch failed for {mid}: {getattr(e.response, 'status_code', '?')}")
                continue

        # Throttle only when the API was actually contacted, so re-running
        # over an already cached set completes immediately.
        if called_api:
            time.sleep(0.25)

    print(f"\nDownload complete — Matches: {ok_m}, Timelines: {ok_tl}, Missed TL: {miss_tl}")
    return ok_m, ok_tl, miss_tl


# Each match appears once per seed player who played it, so de-duplicate first.
mids = match_ids_df["match_id"].unique().tolist()

# Cap on how many of the collected matches are actually downloaded.
MAX_MATCHES_TO_DOWNLOAD = 3000
download_all(mids, limit=MAX_MATCHES_TO_DOWNLOAD)


Downloading matches & timelines: 100%|██████████| 3000/3000 [1:47:02<00:00,  2.14s/it]


Download complete — Matches: 2650, Timelines: 2650, Missed TL: 0





(2650, 2650, 0)

# **Converting Raw Match JSONs Into a Clean Player-Level Stats Table**

I load each match JSON and pull out all relevant per-player statistics, building a tidy table of kills, damage, vision actions, and other metrics. This gives me a consistent player-level dataset that I can reuse for team aggregation and later modeling.

In [None]:
PLAYER_OUT = PROC_DIR / "player_stats.csv"
PLAYER_OUT.parent.mkdir(parents=True, exist_ok=True)

def flatten_matches():
    """
    Turn every cached match JSON into one row per participant.

    Files without participants are skipped silently; files that fail to parse
    are reported and skipped. The resulting table is written to ``PLAYER_OUT``
    and returned.
    """
    # (output column, participant key) pairs copied as-is with a default of 0.
    counter_fields = (
        ("kills", "kills"),
        ("deaths", "deaths"),
        ("assists", "assists"),
        ("largestkillingspree", "largestKillingSpree"),
        ("largestmultikill", "largestMultiKill"),
        ("doublekills", "doubleKills"),
        ("triplekills", "tripleKills"),
        ("quadrakills", "quadraKills"),
        ("pentakills", "pentaKills"),
        ("totdmgdealt", "totalDamageDealt"),
        ("magicdmgdealt", "magicDamageDealt"),
        ("physicaldmgdealt", "physicalDamageDealt"),
        ("truedmgdealt", "trueDamageDealt"),
        ("totdmgtochamp", "totalDamageDealtToChampions"),
        ("magicdmgtochamp", "magicDamageDealtToChampions"),
        ("physdmgtochamp", "physicalDamageDealtToChampions"),
        ("truedmgtochamp", "trueDamageDealtToChampions"),
        ("totheal", "totalHeal"),
        ("totunitshealed", "totalUnitsHealed"),
        ("dmgtoturrets", "damageDealtToTurrets"),
        ("timecc", "timeCCingOthers"),
        ("totdmgtaken", "totalDamageTaken"),
        ("magicdmgtaken", "magicDamageTaken"),
        ("physdmgtaken", "physicalDamageTaken"),
        ("truedmgtaken", "trueDamageTaken"),
        ("wardsplaced", "wardsPlaced"),
        ("wardskilled", "wardsKilled"),
    )

    records = []
    match_files = list(RAW_MATCH_DIR.glob("*.json"))

    for path in tqdm(match_files, desc="Flattening matches"):
        try:
            payload = json.loads(path.read_text())
            metadata = payload.get("metadata", {})
            players = payload.get("info", {}).get("participants", [])
            if not players:
                continue

            game_id = metadata.get("matchId")

            for player in players:
                # Identity columns first, then the counters, then the
                # first-blood flag, so the CSV column order stays fixed.
                record = {
                    "match_id": game_id,
                    "puuid": player.get("puuid"),
                    "champion": player.get("championName"),
                    "teamId": player.get("teamId"),
                    "win": int(player.get("win", 0)),
                }
                record.update(
                    (column, player.get(key, 0)) for column, key in counter_fields
                )
                record["firstblood"] = int(player.get("firstBloodKill", False))
                records.append(record)

        except Exception as e:
            print(f"Error in {path.name}: {e}")
            continue

    df = pd.DataFrame(records)
    df.to_csv(PLAYER_OUT, index=False)
    print(f"Flattened {len(df)} player records → {PLAYER_OUT}")
    return df

# Build (and persist) the per-participant table from the cached match files.
player_df = flatten_matches()
# Bare expression: shows the row count and a preview as the cell result.
len(player_df), player_df.head()


Flattening matches: 100%|██████████| 3000/3000 [00:02<00:00, 1265.33it/s]


Flattened 29980 player records → data/processed/player_stats.csv


(29980,
           match_id                                              puuid  \
 0  EUW1_7577975915  88_mK6tK6i-8P13fLI0HOvv1x1NdnIHEg_QxokNTOwUyZE...   
 1  EUW1_7577975915  2Mk8P4WfrCQI-2iJYRzjZhmBNWGmXZHu8Z01HfQ3YFAdbe...   
 2  EUW1_7577975915  8hASjRaXWi8KtTXsxIkS1UNNnL_tQg41nRIqhMtk0-U_EL...   
 3  EUW1_7577975915  KBQ-MU4ewl1iWt3XYNrUUQxHW3FfjzyYrhWl5f3Tbm9Cfr...   
 4  EUW1_7577975915  PtZ4FtqxeZ0qGDjXcnfe5zSLnNg2rsOPaEsrjK2m6g6kt7...   
 
    champion  teamId  win  kills  deaths  assists  largestkillingspree  \
 0     Jayce     100    0      1       0        0                    0   
 1    Graves     100    0      0       3        0                    0   
 2    Syndra     100    0      1       2        0                    0   
 3  Aphelios     100    0      0       4        0                    0   
 4     Taric     100    0      0       5        1                    0   
 
    largestmultikill  ...  totunitshealed  dmgtoturrets  timecc  totdmgtaken  \
 0                 1

# **Transforming Timeline Files into a Detailed Event-Level Dataset**

I read each timeline JSON and extract every event, including kills, wards, objectives, building takedowns, and item actions, along with their timestamps and positions. This gives me a full event-by-event view of each match that I can later aggregate into meaningful temporal features.

In [None]:
EVENTS_OUT = PROC_DIR / "events.csv"
EVENTS_OUT.parent.mkdir(parents=True, exist_ok=True)

def flatten_events():
    """
    Flatten timeline JSONs into an event-level table with rich micro info:
    positions, wards, items, objectives, etc.

    Each row is one timeline event. ``t`` is the event time in whole seconds;
    when an event carries no timestamp, the timestamp of its enclosing frame
    is used instead. The table is written to ``EVENTS_OUT`` and returned.
    Timeline files without a match id are skipped; files that fail to parse
    are reported and skipped.
    """
    rows = []
    for p in tqdm(list(RAW_TL_DIR.glob("*.json")), desc="Flattening events"):
        try:
            tl = json.loads(p.read_text())
            mid = tl.get("metadata", {}).get("matchId")
            if not mid:
                continue

            for fr in tl.get("info", {}).get("frames", []):
                # Keep the frame time in milliseconds so the fallback below is
                # in the same unit as the event timestamp it stands in for.
                frame_ms = fr.get("timestamp") or 0

                for ev in fr.get("events", []) or []:
                    ev_ms = ev.get("timestamp")
                    if ev_ms is None:
                        ev_ms = frame_ms
                    t = int(ev_ms // 1000)
                    pos = ev.get("position") or {}

                    rows.append({
                        "match_id": mid,
                        "t": t,
                        "type": ev.get("type"),

                        "teamId": ev.get("teamId"),
                        "killerId": ev.get("killerId"),
                        "victimId": ev.get("victimId"),

                        "participantId": ev.get("participantId"),
                        "assisting_participants": ",".join(
                            str(x) for x in (ev.get("assistingParticipantIds") or [])
                        ),

                        "monsterType": ev.get("monsterType"),
                        "monsterSubType": ev.get("monsterSubType"),
                        "buildingType": ev.get("buildingType"),
                        "towerType": ev.get("towerType"),
                        "laneType": ev.get("laneType"),

                        "wardType": ev.get("wardType"),
                        "creatorId": ev.get("creatorId"),

                        "itemId": ev.get("itemId"),

                        "position_x": pos.get("x"),
                        "position_y": pos.get("y"),

                        # Elite-monster kills report the killing side under
                        # "killerTeamId" rather than "teamId"; appended last so
                        # existing column positions are unchanged.
                        "killerTeamId": ev.get("killerTeamId"),
                    })

        except Exception as e:
            print(f"Error in {p.name}: {e}")
            continue

    df = pd.DataFrame(rows)
    df.to_csv(EVENTS_OUT, index=False)
    print(f"Flattened {len(df)} events → {EVENTS_OUT}")
    return df

# Build (and persist) the event-level table from the cached timeline files.
events_df = flatten_events()
# Bare expression: displayed as the cell result.
len(events_df)


Flattening events: 100%|██████████| 3000/3000 [00:21<00:00, 136.80it/s]


Flattened 3129240 events → data/processed/events.csv


3129240

# **Constructing a 10-Second Micro Timeline With Positional and Objective Features**

I rebuild each match into a dense 10-second timeline that tracks gold, XP, positioning, grouping, objective control, and recent combat activity for both teams. This gives me a rich temporal representation of the game state that I can later use to detect early signals before Baron, Dragon, or teamfight events.

In [None]:
import numpy as np
import math

# Output path of the 10-second timeline table written by build_timelines().
TIMELINE_OUT = PROC_DIR / "timeline_10s_micro.csv"

# Reference (x, y) map coordinates used with near() to count champions around
# each objective.
# NOTE(review): presumably approximate pit centres — confirm against map data.
BARON_POS = (5000, 10500)
DRAGON_POS = (9850, 4400)

def safe_get(d, *keys, default=0):
    """Walk ``keys`` through nested dicts; return ``default`` if any step is missing."""
    node = d
    for key in keys:
        if isinstance(node, dict) and key in node:
            node = node[key]
        else:
            # Either the current level is not a dict or the key is absent.
            return default
    return node

def dist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b`` (delegates to ``math.dist``)."""
    return math.dist(a, b)

def near(pos, pts, r=2500):
    """Count how many points in ``pts`` lie strictly closer than ``r`` to ``pos``."""
    count = 0
    for point in pts:
        if dist(pos, point) < r:
            count += 1
    return count

def cluster_density(pts):
    """
    Return the bounding-box extent (x-range plus y-range) of ``pts``.

    Smaller values mean a tighter cluster. With fewer than two points the
    sentinel ``9999.0`` is returned.
    """
    if len(pts) < 2:
        return 9999.0
    xs, ys = zip(*((p[0], p[1]) for p in pts))
    width = max(xs) - min(xs)
    height = max(ys) - min(ys)
    return float(width + height)

def grouping_score(pts):
    """Return, as a float, the number of point pairs in ``pts`` closer than 1500 units."""
    n = len(pts)
    if n < 2:
        return 0.0
    close_pairs = 0
    for i in range(n):
        for j in range(i + 1, n):
            if dist(pts[i], pts[j]) < 1500:
                close_pairs += 1
    return float(close_pairs)

def team_spread(pts):
    """Return the mean distance of ``pts`` from their centroid (0.0 when empty)."""
    if not pts:
        return 0.0
    centroid = np.mean(pts, axis=0)
    offsets = [dist(point, centroid) for point in pts]
    return float(np.mean(offsets))

def _frame_health(pf, pid):
    """Return participant ``pid``'s current health in a timeline frame (0 if absent)."""
    # Timeline frames expose current HP as championStats["health"]; the
    # previously read "currentHealth" key is still honoured as a fallback.
    hp = safe_get(pf, str(pid), "championStats", "health", default=None)
    if hp is None:
        hp = safe_get(pf, str(pid), "championStats", "currentHealth", default=0)
    return hp or 0


def _objective_team(ev_obj):
    """Return the team id (100/200, 0 if unknown) credited with each objective kill."""
    team = pd.Series(0, index=ev_obj.index, dtype="int64")
    # Priority: explicit killer team, then the generic teamId column, then the
    # killer's participant id (1-5 -> 100, 6-10 -> 200).
    for col in ("killerTeamId", "teamId"):
        if col in ev_obj.columns:
            vals = pd.to_numeric(ev_obj[col], errors="coerce").fillna(0).astype("int64")
            team = team.where(team > 0, vals)
    killer = pd.to_numeric(ev_obj["killerId"], errors="coerce").fillna(0).astype("int64")
    by_killer = (
        killer.between(1, 5).astype("int64") * 100
        + killer.between(6, 10).astype("int64") * 200
    )
    return team.where(team > 0, by_killer)


def timeline_10s(mid, tl, events_df, tick=10):
    """
    Build a 10s timeline with rich micro features.

    Args:
        mid: Match id; also used to locate the cached match JSON.
        tl: Parsed timeline payload for the match (needs ``info.frames``).
        events_df: Event table as produced by ``flatten_events`` (may contain
            other matches; it is filtered by ``match_id`` here).
        tick: Spacing of the output rows in seconds.

    Returns:
        DataFrame with one row per ``tick`` seconds from 0 to the game
        duration, or ``None`` when the match file is missing or reports a
        non-positive duration.

    Frame-based features are forward-filled: every row at or after a frame's
    timestamp takes that frame's values until a later frame overwrites them.
    Participants 1-5 are treated as team 1 (blue) and 6-10 as team 2 (red).
    """
    match_path = RAW_MATCH_DIR / f"{mid}.json"
    if not match_path.exists():
        # A timeline can be cached without its match (failed download).
        print(f"No match file for {mid}; skipping timeline.")
        return None
    match = json.loads(match_path.read_text())
    dur = int(match["info"].get("gameDuration", 0))
    if dur <= 0:
        return None

    ts = np.arange(0, dur + 1, tick)
    df = pd.DataFrame({"match_id": mid, "t": ts})

    feat_cols = [
        "gold_blue", "gold_red", "xp_blue", "xp_red",
        "gold_diff", "xp_diff",

        "cs_lane_t1", "cs_lane_t2",
        "cs_jungle_t1", "cs_jungle_t2",
        "cs_total_t1", "cs_total_t2", "cs_total_diff",
        "avg_level_t1", "avg_level_t2", "level_diff",

        "alive_t1", "alive_t2", "alive_diff",
        "low_hp_t1", "low_hp_t2",

        "group_t1", "group_t2",
        "cluster_density", "distance_between_teams",
        "spread_t1", "spread_t2",
        "champs_near_baron_t1", "champs_near_baron_t2",
        "champs_near_dragon_t1", "champs_near_dragon_t2",

        "kills_30_t1", "kills_30_t2",
        "deaths_30_t1", "deaths_30_t2",
        "kills_60_t1", "kills_60_t2",
        "deaths_60_t1", "deaths_60_t2",

        "wards_placed_60_t1", "wards_placed_60_t2",
        "wards_killed_60_t1", "wards_killed_60_t2",
        "items_purchased_60_t1", "items_purchased_60_t2",

        "dragons_t1", "dragons_t2",
        "barons_t1", "barons_t2",
        "heralds_t1", "heralds_t2",
        "towers_t1", "towers_t2",
        "inhibs_t1", "inhibs_t2",

        "time_since_last_baron",
        "time_since_last_dragon",
        "time_since_last_teamfight",

        "teamfight_recent_20",
    ]
    for col in feat_cols:
        df[col] = 0.0

    ev_all = events_df[events_df["match_id"] == mid].sort_values("t")

    ev_kill   = ev_all[ev_all["type"] == "CHAMPION_KILL"].copy()
    ev_ward_p = ev_all[ev_all["type"] == "WARD_PLACED"].copy()
    ev_ward_k = ev_all[ev_all["type"] == "WARD_KILL"].copy()
    ev_item_p = ev_all[ev_all["type"] == "ITEM_PURCHASED"].copy()
    ev_obj    = ev_all[ev_all["type"] == "ELITE_MONSTER_KILL"].copy()
    ev_build  = ev_all[ev_all["type"] == "BUILDING_KILL"].copy()

    # Resolve the killing side of every objective once, up front.
    ev_obj["obj_team"] = _objective_team(ev_obj)

    t1_ids = range(1, 6)
    t2_ids = range(6, 11)

    def team_vals(pf, ids, key):
        """Per-participant values of ``key`` for one team (0 when missing)."""
        return [pf.get(str(i), {}).get(key, 0) for i in ids]

    # Frame-based features
    for fr in tl["info"]["frames"]:
        ft = int(fr.get("timestamp", 0) / 1000)
        pf = fr.get("participantFrames", {})
        if not pf:
            continue

        cs_lane_t1   = sum(team_vals(pf, t1_ids, "minionsKilled"))
        cs_lane_t2   = sum(team_vals(pf, t2_ids, "minionsKilled"))
        cs_jungle_t1 = sum(team_vals(pf, t1_ids, "jungleMinionsKilled"))
        cs_jungle_t2 = sum(team_vals(pf, t2_ids, "jungleMinionsKilled"))
        avg_level_t1 = sum(team_vals(pf, t1_ids, "level")) / max(len(t1_ids), 1)
        avg_level_t2 = sum(team_vals(pf, t2_ids, "level")) / max(len(t2_ids), 1)

        positions = {}
        for pid, p in pf.items():
            pos = p.get("position")
            if pos:
                positions[int(pid)] = (pos["x"], pos["y"])

        t1_pts = [positions[i] for i in t1_ids if i in positions]
        t2_pts = [positions[i] for i in t2_ids if i in positions]

        hp_t1 = [_frame_health(pf, i) for i in t1_ids]
        hp_t2 = [_frame_health(pf, i) for i in t2_ids]
        t1_alive = sum(h > 0 for h in hp_t1)
        t2_alive = sum(h > 0 for h in hp_t2)
        t1_lowhp = sum(h < 200 for h in hp_t1)
        t2_lowhp = sum(h < 200 for h in hp_t2)

        if t1_pts and t2_pts:
            team_gap = dist(np.mean(t1_pts, axis=0), np.mean(t2_pts, axis=0))
        else:
            # Same sentinel as before when a team has no known positions.
            team_gap = 8000.0

        frame_vals = {
            "gold_blue": sum(team_vals(pf, t1_ids, "totalGold")),
            "gold_red":  sum(team_vals(pf, t2_ids, "totalGold")),
            "xp_blue":   sum(team_vals(pf, t1_ids, "xp")),
            "xp_red":    sum(team_vals(pf, t2_ids, "xp")),

            "cs_lane_t1":    cs_lane_t1,
            "cs_lane_t2":    cs_lane_t2,
            "cs_jungle_t1":  cs_jungle_t1,
            "cs_jungle_t2":  cs_jungle_t2,
            "cs_total_t1":   cs_lane_t1 + cs_jungle_t1,
            "cs_total_t2":   cs_lane_t2 + cs_jungle_t2,
            "cs_total_diff": (cs_lane_t1 + cs_jungle_t1) - (cs_lane_t2 + cs_jungle_t2),

            "avg_level_t1": avg_level_t1,
            "avg_level_t2": avg_level_t2,
            "level_diff":   avg_level_t1 - avg_level_t2,

            "alive_t1":   t1_alive,
            "alive_t2":   t2_alive,
            "alive_diff": t1_alive - t2_alive,
            "low_hp_t1":  t1_lowhp,
            "low_hp_t2":  t2_lowhp,

            "champs_near_baron_t1":  near(BARON_POS, t1_pts),
            "champs_near_baron_t2":  near(BARON_POS, t2_pts),
            "champs_near_dragon_t1": near(DRAGON_POS, t1_pts),
            "champs_near_dragon_t2": near(DRAGON_POS, t2_pts),

            "group_t1": grouping_score(t1_pts),
            "group_t2": grouping_score(t2_pts),
            "cluster_density": (cluster_density(t1_pts) + cluster_density(t2_pts)) / 2.0,
            "distance_between_teams": team_gap,
            "spread_t1": team_spread(t1_pts),
            "spread_t2": team_spread(t2_pts),
        }

        # Forward fill: rows at or after this frame take its values until a
        # later frame overwrites them.
        mask = df["t"] >= ft
        for col, val in frame_vals.items():
            df.loc[mask, col] = val

    df["gold_diff"] = df["gold_blue"] - df["gold_red"]
    df["xp_diff"]   = df["xp_blue"]   - df["xp_red"]

    baron_times  = ev_obj[ev_obj["monsterType"] == "BARON_NASHOR"]["t"].tolist()
    dragon_times = ev_obj[ev_obj["monsterType"] == "DRAGON"]["t"].tolist()

    # A "teamfight" starts at any kill that is followed (inclusive) by at
    # least 3 kills within 10 seconds.
    kill_times = ev_kill["t"].sort_values().tolist()
    tf_times = [
        t0 for t0 in kill_times
        if sum(1 for x in kill_times if t0 <= x <= t0 + 10) >= 3
    ]

    def window(ev, now, span):
        """Events with ``now - span < t <= now``."""
        return ev[(ev["t"] > now - span) & (ev["t"] <= now)]

    def ids_of(ev, col):
        """Participant ids of ``col`` as ints, 0 when missing."""
        return ev[col].fillna(0).astype(int)

    def count_by_team(ids):
        """Return (team 1 count, team 2 count) for participant ids."""
        in_t1 = (ids >= 1) & (ids <= 5)
        return in_t1.sum(), (~in_t1 & (ids > 0)).sum()

    def time_since(times, now):
        """Seconds since the latest entry of ``times`` at or before ``now`` (9999 if none)."""
        past = [x for x in times if x <= now]
        return float(now - max(past)) if past else float(9999)

    for i, now in df["t"].items():
        kills30 = window(ev_kill, now, 30)
        kills60 = window(ev_kill, now, 60)

        df.at[i, "kills_30_t1"], df.at[i, "kills_30_t2"] = count_by_team(ids_of(kills30, "killerId"))
        df.at[i, "deaths_30_t1"], df.at[i, "deaths_30_t2"] = count_by_team(ids_of(kills30, "victimId"))
        df.at[i, "kills_60_t1"], df.at[i, "kills_60_t2"] = count_by_team(ids_of(kills60, "killerId"))
        df.at[i, "deaths_60_t1"], df.at[i, "deaths_60_t2"] = count_by_team(ids_of(kills60, "victimId"))

        wp60 = window(ev_ward_p, now, 60)
        wk60 = window(ev_ward_k, now, 60)
        ip60 = window(ev_item_p, now, 60)

        df.at[i, "wards_placed_60_t1"], df.at[i, "wards_placed_60_t2"] = count_by_team(ids_of(wp60, "creatorId"))
        df.at[i, "wards_killed_60_t1"], df.at[i, "wards_killed_60_t2"] = count_by_team(ids_of(wk60, "killerId"))
        df.at[i, "items_purchased_60_t1"], df.at[i, "items_purchased_60_t2"] = count_by_team(ids_of(ip60, "participantId"))

        # Cumulative objectives, credited to the killing team.
        past_obj = ev_obj[ev_obj["t"] <= now]
        team_ids = past_obj["obj_team"]

        is_dragon = past_obj["monsterType"] == "DRAGON"
        is_baron  = past_obj["monsterType"] == "BARON_NASHOR"
        is_herald = past_obj["monsterType"] == "RIFTHERALD"

        df.at[i, "dragons_t1"] = ((team_ids == 100) & is_dragon).sum()
        df.at[i, "dragons_t2"] = ((team_ids == 200) & is_dragon).sum()
        df.at[i, "barons_t1"]  = ((team_ids == 100) & is_baron).sum()
        df.at[i, "barons_t2"]  = ((team_ids == 200) & is_baron).sum()
        df.at[i, "heralds_t1"] = ((team_ids == 100) & is_herald).sum()
        df.at[i, "heralds_t2"] = ((team_ids == 200) & is_herald).sum()

        # Cumulative buildings: a building tagged with team 200 is counted
        # for team 1 and vice versa (teamId is read as the owner's side).
        past_build = ev_build[ev_build["t"] <= now]
        b_team = past_build["teamId"].fillna(0).astype(int)
        is_tower = past_build["buildingType"] == "TOWER_BUILDING"
        is_inhib = past_build["buildingType"] == "INHIBITOR_BUILDING"

        df.at[i, "towers_t1"] = ((b_team == 200) & is_tower).sum()
        df.at[i, "towers_t2"] = ((b_team == 100) & is_tower).sum()
        df.at[i, "inhibs_t1"] = ((b_team == 200) & is_inhib).sum()
        df.at[i, "inhibs_t2"] = ((b_team == 100) & is_inhib).sum()

        df.at[i, "time_since_last_baron"]  = time_since(baron_times, now)
        df.at[i, "time_since_last_dragon"] = time_since(dragon_times, now)
        df.at[i, "time_since_last_teamfight"] = time_since(tf_times, now)

        df.at[i, "teamfight_recent_20"] = int(len(window(ev_kill, now, 20)) >= 3)

    return df

def build_timelines(events_df):
    """
    Build the 10-second timeline for every cached timeline file.

    Args:
        events_df: Event table as produced by ``flatten_events``.

    Returns:
        The concatenated timelines of all matches, also written to
        ``TIMELINE_OUT``. Matches that cannot be processed are reported and
        skipped; if none succeed, an empty table with ``match_id`` and ``t``
        columns is returned.
    """
    # Group events by match once, so each match does not re-scan the whole
    # table inside timeline_10s.
    by_match = events_df.groupby("match_id", sort=False)
    known_ids = by_match.groups
    no_events = events_df.iloc[0:0]

    parts = []
    for p in tqdm(list(RAW_TL_DIR.glob("*.json")), desc="Building timelines 10s"):
        try:
            tl = json.loads(p.read_text())
            mid = tl["metadata"]["matchId"]
            match_events = by_match.get_group(mid) if mid in known_ids else no_events
            part = timeline_10s(mid, tl, match_events)
        except (OSError, ValueError, KeyError, TypeError) as e:
            # One unreadable or malformed match must not abort the whole run.
            print(f"Error in {p.name}: {e}")
            continue
        if part is not None:
            parts.append(part)

    if parts:
        df = pd.concat(parts, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return an empty table instead.
        df = pd.DataFrame(columns=["match_id", "t"])
    df.to_csv(TIMELINE_OUT, index=False)
    print(f"Micro-rich timeline saved → {TIMELINE_OUT}")
    return df

# Build (and persist) the 10-second timeline for every downloaded match.
timeline_df = build_timelines(events_df)
# Bare expression: shows a preview as the cell result.
timeline_df.head()


Building timelines 10s: 100%|██████████| 3000/3000 [1:14:31<00:00,  1.49s/it]


Micro-rich timeline saved → data/processed/timeline_10s_micro.csv


Unnamed: 0,match_id,t,gold_blue,gold_red,xp_blue,xp_red,gold_diff,xp_diff,cs_lane_t1,cs_lane_t2,...,heralds_t1,heralds_t2,towers_t1,towers_t2,inhibs_t1,inhibs_t2,time_since_last_baron,time_since_last_dragon,time_since_last_teamfight,teamfight_recent_20
0,EUW1_7577975915,0,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9999.0,9999.0,9999.0,0.0
1,EUW1_7577975915,10,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9999.0,9999.0,9999.0,0.0
2,EUW1_7577975915,20,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9999.0,9999.0,9999.0,0.0
3,EUW1_7577975915,30,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9999.0,9999.0,9999.0,0.0
4,EUW1_7577975915,40,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9999.0,9999.0,9999.0,0.0


# **Labeling Future Objective Events and Building the Final Model-Ready Dataset**

I take the micro-timelines and tag each timestamp with whether a Baron, Dragon, or teamfight will occur soon, then merge in team-level aggregated stats. Note that these team-level stats are whole-match totals taken from the final match record, so they are identical on every row of a match. This produces the complete dataset I’ll use for training early-warning models, with every row representing the game state plus its future objective outcome.

In [None]:
# Reload the processed tables from disk so this cell can run without
# re-executing the extraction cells above.
DATA = PROC_DIR

timeline = pd.read_csv(DATA / "timeline_10s_micro.csv")
events   = pd.read_csv(DATA / "events.csv")
player   = pd.read_csv(DATA / "player_stats.csv")

def extract_objective_times(events):
    """
    Collect, per match, the times of Baron kills, Dragon kills and teamfights.

    A teamfight time is any champion-kill time ``t`` for which at least three
    kills (including itself) fall within ``[t, t + 10]``.

    Returns:
        ``{match_id: {"baron": [...], "dragon": [...], "teamfight": [...]}}``
    """
    def _monster_times(frame, monster):
        # Times of elite-monster kills of one monster type within a match.
        hits = frame[
            (frame.type == "ELITE_MONSTER_KILL")
            & (frame.monsterType == monster)
        ]
        return hits["t"].tolist()

    result = {}
    for match_id, group in events.groupby("match_id"):
        kill_times = (
            group.loc[group["type"] == "CHAMPION_KILL", "t"].sort_values().tolist()
        )
        fight_starts = [
            start
            for start in kill_times
            if sum(1 for other in kill_times if start <= other <= start + 10) >= 3
        ]
        result[match_id] = {
            "baron": _monster_times(group, "BARON_NASHOR"),
            "dragon": _monster_times(group, "DRAGON"),
            "teamfight": fight_starts,
        }
    return result

# Per-match lookup of baron / dragon / teamfight times.
objective_times = extract_objective_times(events)

# Look-ahead horizons (same unit as the timeline's `t` column) and the event
# kinds to label; one label column is produced per (event, window) pair.
WINDOWS = [10, 20, 30]
EVENTS  = ["baron", "dragon", "teamfight"]

def label_future(ts, event_list, w):
    """Return 1 if any event time lies in ``(ts, ts + w]``, otherwise 0."""
    horizon = ts + w
    for event_time in event_list:
        if ts < event_time <= horizon:
            return 1
    return 0

# Label every timeline row with "does event X happen within the next w
# seconds?" for each (event, window) pair.
rows = []
for mid, df in timeline.groupby("match_id"):
    df = df.copy()
    obj = objective_times.get(mid, {"baron": [], "dragon": [], "teamfight": []})

    for ev in EVENTS:
        ev_times = obj[ev]
        for w in WINDOWS:
            df[f"y_{ev}_{w}"] = [label_future(t, ev_times, w) for t in df["t"]]
    rows.append(df)

labeled_timeline = pd.concat(rows, ignore_index=True)

# Assign sides from the recorded teamId (100 -> blue, 200 -> red) rather than
# from row position, so the result does not depend on the CSV row order.
# Rows without a usable teamId fall back to the former order-based rule
# (first five rows of a match are blue).
side_from_team_id = player["teamId"].map({100: "blue", 200: "red"})
side_from_order = player.groupby("match_id").cumcount().apply(
    lambda x: "blue" if x < 5 else "red"
)
player["team"] = side_from_team_id.fillna(side_from_order)

# NOTE(review): these are whole-match totals from the final match record, so
# they carry end-of-game information into every timestamp. Confirm that this
# is intended before using them as model features.
team_stats = (
    player.groupby(["match_id", "team"])
    .agg({
        "kills": "sum",
        "deaths": "sum",
        "assists": "sum",
        "totdmgdealt": "mean",
        "totdmgtochamp": "mean",
        "totheal": "mean",
        "wardsplaced": "mean",
        "wardskilled": "mean",
    })
    .reset_index()
)

# One row per match with "<stat>_<team>" columns (e.g. kills_blue, kills_red).
team_stats = team_stats.pivot(index="match_id", columns="team")
team_stats.columns = [f"{a}_{b}" for a, b in team_stats.columns]
team_stats.reset_index(inplace=True)

final = (
    labeled_timeline
    .merge(team_stats, on="match_id", how="left")
    .sort_values(["match_id", "t"])
)

FINAL_CSV = DATA / "final_dataset.csv"
final.to_csv(FINAL_CSV, index=False)
print(final.shape, ":", FINAL_CSV)
final.head()


(484255, 86) → data/processed/final_dataset.csv


Unnamed: 0,match_id,t,gold_blue,gold_red,xp_blue,xp_red,gold_diff,xp_diff,cs_lane_t1,cs_lane_t2,...,totdmgdealt_blue,totdmgdealt_red,totdmgtochamp_blue,totdmgtochamp_red,totheal_blue,totheal_red,wardsplaced_blue,wardsplaced_red,wardskilled_blue,wardskilled_red
0,EUW1_7419887159,0,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,161175.6,145130.2,25959.6,20247.6,9176.6,6589.0,16.4,15.6,3.2,3.0
1,EUW1_7419887159,10,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,161175.6,145130.2,25959.6,20247.6,9176.6,6589.0,16.4,15.6,3.2,3.0
2,EUW1_7419887159,20,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,161175.6,145130.2,25959.6,20247.6,9176.6,6589.0,16.4,15.6,3.2,3.0
3,EUW1_7419887159,30,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,161175.6,145130.2,25959.6,20247.6,9176.6,6589.0,16.4,15.6,3.2,3.0
4,EUW1_7419887159,40,2500.0,2500.0,0.0,0.0,0.0,0.0,0.0,0.0,...,161175.6,145130.2,25959.6,20247.6,9176.6,6589.0,16.4,15.6,3.2,3.0
