# **Initialize API Client and Directories**

Loads environment variables, initializes the RiotWatcher client with the API key, and prepares folders for storing raw and processed game data. Sets constants for region, game mode, and the time period for collecting ranked solo queue matches.

In [None]:
import os
import json
import time
import pandas as pd
from tqdm import tqdm
from riotwatcher import LolWatcher, ApiError
from datetime import datetime, timezone
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()
# Read the key from the environment (populated from .env by load_dotenv) instead of
# hard-coding it: a literal key in the notebook leaks the credential to anyone who can
# read the file and makes the "is it set?" check below always pass.
# NOTE: any key that was previously committed in this cell should be regenerated.
API_KEY = os.getenv("RIOT_API_KEY")
if not API_KEY:
    raise RuntimeError("Please set RIOT_API_KEY in your .env file.")

watcher = LolWatcher(API_KEY)

# Routing values: "europe" is what the match calls below pass, "euw1" what the league calls pass.
REGION = "europe"
PLATFORM = "euw1"
QUEUE_ID = 420  # Ranked Solo

# Collection window as UTC epoch seconds (2024-01-01 00:00:00 .. 2025-12-31 23:59:59).
START = int(datetime(2024, 1, 1, tzinfo=timezone.utc).timestamp())
END = int(datetime(2025, 12, 31, 23, 59, 59, tzinfo=timezone.utc).timestamp())

# Output folders for raw API responses and processed tables.
DATA_DIR = Path("data")
RAW_MATCH_DIR = DATA_DIR / "raw/matches"
RAW_TL_DIR = DATA_DIR / "raw/timelines"
PROC_DIR = DATA_DIR / "processed"
for d in [RAW_MATCH_DIR, RAW_TL_DIR, PROC_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# **Retrieve Top Player PUUIDs**

Fetches player unique identifiers (PUUIDs) from the challenger, grandmaster, or master ranked ladders for the European server. Limits the number of PUUIDs collected and ensures uniqueness, providing a high-skill player base for subsequent match data collection.

In [None]:
def get_seed_puuids(limit=200):
    """
    Return up to ``limit`` unique PUUIDs taken from the first non-empty ranked ladder.

    The challenger ladder is tried first, then grandmaster, then master; the first
    tier that returns a non-empty ``entries`` list is used and the others are skipped.
    Only entries that carry a non-empty ``puuid`` field are kept (this relies on the
    installed RiotWatcher / API version returning ``puuid`` inside ladder entries).

    Args:
        limit: Maximum number of PUUIDs to return. ``None`` returns all of them.

    Returns:
        A list of unique PUUID strings in ladder-response order, or ``[]`` when no
        tier could be fetched.
    """
    tiers = [
        ("challenger_by_queue", watcher.league.challenger_by_queue),
        ("grandmaster_by_queue", watcher.league.grandmaster_by_queue),
        ("masters_by_queue", watcher.league.masters_by_queue)
    ]

    entries = []
    for name, func in tiers:
        try:
            # Use the shared platform constant instead of repeating the literal.
            data = func(PLATFORM, "RANKED_SOLO_5x5")
            ents = data.get("entries", [])
            if ents:
                print(f"Found {len(ents)} entries in {name}")
                entries = ents
                break
        except Exception as e:
            # Deliberately best-effort: any failure on one tier falls through to the next.
            print(f"Failed {name}: {e}")
            continue

    if not entries:
        print("No ladder entries found — check API key or region.")
        return []

    # Filter and de-duplicate over the whole ladder first, THEN apply the limit.
    # Slicing first (as before) could return fewer than `limit` PUUIDs whenever
    # some of the first `limit` entries lacked a puuid or were duplicates.
    unique = dict.fromkeys(e["puuid"] for e in entries if e.get("puuid"))
    puuids = list(unique)[:limit]

    print(f"Collected {len(puuids)} unique PUUIDs directly from ladder")
    return puuids


# Seed the crawl with up to 100 ladder PUUIDs; the trailing bare expression
# displays how many were collected when run as a notebook cell.
seed_puuids = get_seed_puuids(100)
len(seed_puuids)


# **Collect Match IDs for Seed Players**

Loops through each player's PUUID to retrieve match IDs from ranked solo queue games within the targeted time period. Handles API rate limits and errors, consolidates unique match references, and saves them to a CSV for later downloading of full match data.

In [None]:
from pathlib import Path
import pandas as pd
import time
from tqdm import tqdm
from riotwatcher import ApiError

# NOTE: this rebinds DATA_DIR (it was Path("data") in the first cell) to the interim
# folder; get_match_ids() below writes match_ids.csv into it.
DATA_DIR = Path("data/interim")
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Same queue id and UTC time window as defined in the first cell.
# `datetime` / `timezone` come from the first cell's imports.
QUEUE_ID = 420
START = int(datetime(2024, 1, 1, tzinfo=timezone.utc).timestamp())
END   = int(datetime(2025, 12, 31, 23, 59, 59, tzinfo=timezone.utc).timestamp())

def get_match_ids(puuids, pages=2, per_page=100):
    """
    Fetch match IDs for each player PUUID and save them to ``match_ids.csv``.

    For every PUUID up to ``pages`` pages of ``per_page`` IDs are requested,
    filtered by ``QUEUE_ID`` and the ``START``/``END`` window. Paging for a player
    stops early on an empty or short page, or on an error.

    Args:
        puuids: Iterable of player PUUIDs.
        pages: Maximum number of pages requested per player.
        per_page: Number of match IDs requested per page.

    Returns:
        A DataFrame with columns ``puuid`` and ``match_id`` (duplicates removed).
        The same table is written to ``DATA_DIR / "match_ids.csv"``.
    """
    max_retries = 3  # retries of the same page after a 429 before giving up on the player
    recs = []

    for puuid in tqdm(puuids, desc="Fetching match IDs"):
        for p in range(pages):
            mids = None
            for attempt in range(max_retries + 1):
                try:
                    mids = watcher._match.matchlist_by_puuid(
                        REGION,
                        puuid,
                        start=p * per_page,
                        count=per_page,
                        queue=QUEUE_ID,
                        start_time=START,
                        end_time=END
                    )
                    break
                except ApiError as e:
                    status = getattr(e.response, "status_code", None)
                    if status == 429 and attempt < max_retries:
                        # Previously the page was abandoned after sleeping; retry it instead
                        # so a rate-limit hit does not silently drop the player's matches.
                        print("Rate limit hit, sleeping 2s...")
                        time.sleep(2)
                        continue
                    print(f"Error for {puuid}: {e}")
                    break
                except Exception as ex:
                    print(f"Error for {puuid}: {ex}")
                    break

            if not mids:
                # Error, or no (more) matches for this player.
                break
            recs += [{"puuid": puuid, "match_id": mid} for mid in mids]
            if len(mids) < per_page:
                # A short page means there is nothing left to page through.
                break
            time.sleep(0.1)

    # Explicit columns keep the frame (and the CSV header) well-formed even when
    # nothing was collected, so later `df["match_id"]` lookups do not fail.
    df = pd.DataFrame(recs, columns=["puuid", "match_id"]).drop_duplicates()
    df.to_csv(DATA_DIR / "match_ids.csv", index=False)
    print(f"Saved {len(df)} match records to {DATA_DIR / 'match_ids.csv'}")
    return df


# Collect up to 3 pages of match IDs per seed player; the bare expression shows
# the number of (puuid, match_id) records when run as a notebook cell.
match_ids_df = get_match_ids(seed_puuids, pages=3)
len(match_ids_df)


# **Download Match and Timeline Data with Retry**

Fetches full match details and timelines for each match ID, storing them as JSON files with caching to avoid redundant downloads. Implements automatic retries with exponential backoff to handle API rate limits and failures, ensuring robust data collection for analysis.

In [None]:
import json
import time
from pathlib import Path

from riotwatcher import ApiError
from tenacity import (
    retry,
    retry_if_exception,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)
from tqdm import tqdm

# Cache folders for raw API responses: one JSON file per match id in each folder.
RAW_MATCH_DIR = Path("data/raw/matches")
RAW_TL_DIR = Path("data/raw/timelines")
RAW_MATCH_DIR.mkdir(parents=True, exist_ok=True)
RAW_TL_DIR.mkdir(parents=True, exist_ok=True)

# HTTP statuses worth retrying: rate limiting and server-side failures.
_RETRYABLE_STATUS = {429, 500, 502, 503, 504}


def _is_retryable_api_error(exc):
    """Return True when ``exc`` is an ApiError that a later attempt could plausibly fix."""
    if not isinstance(exc, ApiError):
        return False
    status = getattr(getattr(exc, "response", None), "status_code", None)
    # No status available: keep the previous retry-on-any-ApiError behaviour.
    return status is None or status in _RETRYABLE_STATUS


# Shared retry policy: exponential backoff (1s..60s), at most 5 attempts, and the last
# error is re-raised so download_all() can still catch ApiError. Client errors such as
# 403/404 are no longer retried, since repeating them only burns time and rate limit.
_api_retry = retry(
    retry=retry_if_exception(_is_retryable_api_error),
    wait=wait_exponential(min=1, max=60),
    stop=stop_after_attempt(5),
    reraise=True,
)


@_api_retry
def fetch_match(mid):
    """Fetch the match payload for match id ``mid``; raises ApiError if it cannot be fetched."""
    return watcher._match.by_id(REGION, mid)


@_api_retry
def fetch_timeline(mid):
    """Fetch the timeline payload for match id ``mid``; raises ApiError if it cannot be fetched."""
    return watcher._match.timeline_by_match(REGION, mid)

def download_all(match_ids):
    """
    Download match and timeline JSON for every id, skipping files already on disk.

    Each payload is stored as ``<match_id>.json`` under ``RAW_MATCH_DIR`` /
    ``RAW_TL_DIR``. Fetch failures are reported and skipped; they never abort the run.

    Args:
        match_ids: Iterable of match ids.

    Returns:
        Tuple ``(ok_m, ok_tl, miss_tl)``: matches downloaded, timelines downloaded,
        and timelines that could not be fetched.
    """
    ok_m = ok_tl = miss_tl = 0
    for mid in tqdm(match_ids, desc="Downloading matches & timelines"):
        m_path = RAW_MATCH_DIR / f"{mid}.json"
        tl_path = RAW_TL_DIR / f"{mid}.json"
        requested = False  # did this iteration hit the API at all?

        # Match JSON
        if not m_path.exists():
            requested = True
            try:
                m_data = fetch_match(mid)
            except ApiError as e:
                print(f"Match fetch failed for {mid}: {getattr(e.response, 'status_code', '?')}")
                time.sleep(2)
            else:
                m_path.write_text(json.dumps(m_data))
                ok_m += 1

        # Timeline JSON
        if not tl_path.exists():
            requested = True
            try:
                tl_data = fetch_timeline(mid)
            except ApiError as e:
                # Previously counted silently; report it like match failures.
                print(f"Timeline fetch failed for {mid}: {getattr(e.response, 'status_code', '?')}")
                miss_tl += 1
            else:
                tl_path.write_text(json.dumps(tl_data))
                ok_tl += 1

        # Throttle only after real requests; fully cached ids used to cost 0.25s each.
        if requested:
            time.sleep(0.25)

    print(f"\nDownload complete — Matches: {ok_m}, Timelines: {ok_tl}, Missed: {miss_tl}")
    return ok_m, ok_tl, miss_tl


# ⚙️ Run in batches (to avoid hitting the rate limit hard):
# only the first 500 unique match ids are downloaded by this call.
mids = match_ids_df["match_id"].unique().tolist()
download_all(mids[:500])


# **Extract Player Stats from Matches**

Reads all raw match JSON files and converts detailed player statistics into a single flat table. Includes key stats like kills, deaths, damage, healing, and objectives to form the basis for player performance analysis.

In [None]:
import json
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# Input folder of raw match JSON and the CSV that flatten_matches() writes.
# NOTE: OUT is rebound by later cells, so flatten_matches() uses whatever OUT is at call time.
RAW_MATCH_DIR = Path("data/raw/matches")
OUT = Path("data/processed/player_stats.csv")
OUT.parent.mkdir(parents=True, exist_ok=True)

def flatten_matches():
    """
    Turn every raw match JSON in ``RAW_MATCH_DIR`` into one row per participant.

    Files without participants are skipped; a file that raises while being read or
    flattened is reported and skipped. The resulting table is written to ``OUT``
    and returned as a DataFrame.
    """
    # (output column, participant key) pairs copied with a default of 0, in output order.
    stat_fields = (
        ("kills", "kills"),
        ("deaths", "deaths"),
        ("assists", "assists"),
        ("largestkillingspree", "largestKillingSpree"),
        ("largestmultikill", "largestMultiKill"),
        ("longesttimespentliving", "longestTimeSpentLiving"),
        ("doublekills", "doubleKills"),
        ("triplekills", "tripleKills"),
        ("quadrakills", "quadraKills"),
        ("pentakills", "pentaKills"),
        ("totdmgdealt", "totalDamageDealt"),
        ("magicdmgdealt", "magicDamageDealt"),
        ("physicaldmgdealt", "physicalDamageDealt"),
        ("truedmgdealt", "trueDamageDealt"),
        ("largestcrit", "largestCriticalStrike"),
        ("totdmgtochamp", "totalDamageDealtToChampions"),
        ("magicdmgtochamp", "magicDamageDealtToChampions"),
        ("physdmgtochamp", "physicalDamageDealtToChampions"),
        ("truedmgtochamp", "trueDamageDealtToChampions"),
        ("totheal", "totalHeal"),
        ("totunitshealed", "totalUnitsHealed"),
        ("dmgtoturrets", "damageDealtToTurrets"),
        ("timecc", "timeCCingOthers"),
        ("totdmgtaken", "totalDamageTaken"),
        ("magicdmgtaken", "magicDamageTaken"),
        ("physdmgtaken", "physicalDamageTaken"),
        ("truedmgtaken", "trueDamageTaken"),
        ("wardsplaced", "wardsPlaced"),
        ("wardskilled", "wardsKilled"),
    )

    records = []
    match_files = list(RAW_MATCH_DIR.glob("*.json"))

    for match_file in tqdm(match_files, desc="Flattening matches"):
        try:
            payload = json.loads(match_file.read_text())
            players = payload.get("info", {}).get("participants", [])
            if not players:
                continue

            match_id = payload.get("metadata", {}).get("matchId")
            for player in players:
                record = {
                    "match_id": match_id,
                    "id": player.get("puuid"),
                    "win": int(player.get("win", 0)),
                }
                for column, key in stat_fields:
                    record[column] = player.get(key, 0)
                record["firstblood"] = int(player.get("firstBloodKill", False))
                records.append(record)

        except Exception as e:
            print(f"Error in {match_file.name}: {e}")
            continue

    df = pd.DataFrame(records)
    df.to_csv(OUT, index=False)
    print(f"\nFlattened {len(df)} player records → {OUT}")
    return df


# Build the player-level table; the bare tuple displays the row count and a preview
# when run as a notebook cell.
player_df = flatten_matches()
len(player_df), player_df.head()


# **Extract and Flatten Match Events**

Processes timeline JSON files to collect all in-game events, such as kills, objectives, and item purchases, into a simple table with timestamps. This event-level data captures the flow of key actions across matches, forming the foundation for event prediction.

In [None]:
import json, pandas as pd
from pathlib import Path
from tqdm import tqdm

# Input folder of raw timeline JSON and the CSV that process_events() writes.
RAW_TL_DIR = Path("data/raw/timelines")
OUT = Path("data/processed/events.csv")

def flatten_events(mid, tl):
    """
    Flatten the events of one timeline payload into a list of row dicts.

    Args:
        mid: Match id stored in every row.
        tl: Timeline dict; frames are read from ``tl["info"]["frames"]`` and each
            frame's events from ``frame["events"]``. Missing keys yield no rows.

    Returns:
        One dict per event with ``match_id``, ``t`` (timestamp integer-divided by
        1000, i.e. seconds when the payload is in milliseconds), ``type``,
        ``teamId``, ``killerId``, ``victimId``, ``monsterType``, ``buildingType``,
        ``laneType`` and ``itemId``. Fields absent from an event are ``None``.
    """
    rows = []
    frames = tl.get("info", {}).get("frames") or []
    for fr in frames:
        frame_ts = fr.get("timestamp") or 0
        for ev in fr.get("events") or []:
            # Fall back to the frame's own (unconverted) timestamp when the event has
            # none. The old fallback used the already-converted frame time and divided
            # it by 1000 a second time, placing such events at t≈0.
            ev_ts = ev.get("timestamp")
            if ev_ts is None:
                ev_ts = frame_ts
            rows.append({
                "match_id": mid,
                "t": int(ev_ts // 1000),
                "type": ev.get("type"),
                "teamId": ev.get("teamId"),
                "killerId": ev.get("killerId"),
                "victimId": ev.get("victimId"),
                "monsterType": ev.get("monsterType"),
                "buildingType": ev.get("buildingType"),
                "laneType": ev.get("laneType"),
                "itemId": ev.get("itemId")
            })
    return rows


def process_events():
    """
    Flatten every timeline JSON in ``RAW_TL_DIR`` into one event table.

    Timelines without a ``metadata.matchId`` are skipped. A file that cannot be read
    or parsed is reported and skipped instead of aborting the whole run, matching
    the per-file handling in flatten_matches(). The table is written to ``OUT``.

    Returns:
        DataFrame with one row per event (see flatten_events for the columns).
    """
    # Fixed column list so an empty result still yields a frame / CSV with headers.
    columns = [
        "match_id", "t", "type", "teamId", "killerId", "victimId",
        "monsterType", "buildingType", "laneType", "itemId",
    ]
    rows = []
    for p in tqdm(list(RAW_TL_DIR.glob("*.json")), desc="Flattening events"):
        try:
            tl = json.loads(p.read_text())
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and decoding errors.
            print(f"Error in {p.name}: {e}")
            continue
        mid = tl.get("metadata", {}).get("matchId")
        if not mid:
            continue
        rows += flatten_events(mid, tl)

    df = pd.DataFrame(rows, columns=columns)
    OUT.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT, index=False)
    print(f"Flattened {len(df)} events → {OUT}")
    return df


# Build the event table; the bare expression shows the number of events
# when run as a notebook cell.
events_df = process_events()
len(events_df)


# **Generate 10-Second Interval Team Timelines**

Processes raw timeline data to create team-level snapshots every 10 seconds, summarizing gold, experience, kills, player health, positioning, and clustering. Calculates metrics like team grouping, distance between teams, and presence near major objectives, providing a detailed dynamic game state for modeling.

In [None]:
import numpy as np
import math

# Inputs (raw timelines and matches) and the CSV that build_timelines() writes.
RAW_TL_DIR = Path("data/raw/timelines")
RAW_MATCH_DIR = Path("data/raw/matches")
OUT = Path("data/processed/timeline_10s.csv")

# Reference (x, y) points for the "champs_near_baron_*" / "champs_near_dragon_*" counts,
# compared against participant `position` values from the timeline frames.
BARON_POS = (5000, 10500)
DRAGON_POS = (9850, 4400)

def safe_get(d, *keys, default=0):
    """
    Walk ``keys`` through nested dicts starting at ``d``.

    Returns the value reached after the last key, or ``default`` as soon as the
    current node is not a dict or lacks the next key.
    """
    node = d
    for key in keys:
        if isinstance(node, dict) and key in node:
            node = node[key]
        else:
            return default
    return node

def dist(a, b):
    """Return the Euclidean distance between points ``a`` and ``b``."""
    separation = math.dist(a, b)
    return separation

def near(pos, pts, r=2500):
    """Count the points in ``pts`` whose distance to ``pos`` is strictly below ``r``."""
    within = 0
    for point in pts:
        if dist(pos, point) < r:
            within += 1
    return within

def cluster_density(pts):
    """
    Return the bounding-box spread (x range plus y range) of ``pts``.

    Smaller values mean the points sit closer together. With fewer than two
    points the sentinel 9999 is returned.
    """
    if len(pts) < 2:
        return 9999
    x_coords = []
    y_coords = []
    for point in pts:
        x_coords.append(point[0])
        y_coords.append(point[1])
    width = max(x_coords) - min(x_coords)
    height = max(y_coords) - min(y_coords)
    return width + height

def grouping_score(pts):
    """
    Count the unordered pairs of points in ``pts`` that are less than 1500 apart.

    Returns 0 when there are fewer than two points.
    """
    if len(pts) < 2:
        return 0
    close_pairs = 0
    for idx, first in enumerate(pts):
        for second in pts[idx + 1:]:
            if dist(first, second) < 1500:
                close_pairs += 1
    return close_pairs

# Participant ids 1-5 feed the "blue"/"t1" columns, ids 6-10 the "red"/"t2" columns.
_TEAM1_IDS = range(1, 6)
_TEAM2_IDS = range(6, 11)

# Trailing window (in the same unit as events_df.t) for the kills_30_* / deaths_30_* columns.
_KILL_WINDOW = 30

# Every feature column of the output, in output order; all are float64.
_TIMELINE_FEATURE_COLS = [
    "cluster_density",
    "distance_between_teams",
    "gold_blue", "gold_red", "xp_blue", "xp_red",
    "gold_diff", "xp_diff",
    "kills_blue", "kills_red",
    "kills_30_t1", "kills_30_t2", "deaths_30_t1", "deaths_30_t2",
    "group_t1", "group_t2",
    "alive_t1", "alive_t2", "alive_diff",
    "low_hp_t1", "low_hp_t2",
    "champs_near_baron_t1", "champs_near_baron_t2",
    "champs_near_dragon_t1", "champs_near_dragon_t2",
]


def _team_total(pf, ids, field):
    """Sum one participant-frame field over the given participant ids (missing -> 0)."""
    return sum(safe_get(pf, str(i), field, default=0) for i in ids)


def _team_health(pf, ids):
    """Return the current health of each participant id (missing -> 0)."""
    values = []
    for i in ids:
        # NOTE(review): match-v5 timeline frames appear to expose `championStats.health`;
        # the originally used `currentHealth` key is kept as a fallback — confirm
        # against a raw timeline file.
        hp = safe_get(pf, str(i), "championStats", "health", default=None)
        if hp is None:
            hp = safe_get(pf, str(i), "championStats", "currentHealth", default=0) or 0
        values.append(hp)
    return values


def _frame_snapshot(fr):
    """Return the frame-derived feature values for one timeline frame, or None if it has no participant data."""
    pf = fr.get("participantFrames") or {}
    if not pf:
        return None

    positions = {}
    for pid, p in pf.items():
        pos = p.get("position")
        if pos:
            positions[int(pid)] = (pos["x"], pos["y"])
    t1_pts = [positions[i] for i in _TEAM1_IDS if i in positions]
    t2_pts = [positions[i] for i in _TEAM2_IDS if i in positions]

    t1_hp = _team_health(pf, _TEAM1_IDS)
    t2_hp = _team_health(pf, _TEAM2_IDS)

    if t1_pts and t2_pts:
        # Distance between the two team centroids.
        team_gap = dist(tuple(np.mean(t1_pts, axis=0)), tuple(np.mean(t2_pts, axis=0)))
    else:
        team_gap = 8000  # fallback when either team has no known positions

    return {
        "gold_blue": _team_total(pf, _TEAM1_IDS, "totalGold"),
        "gold_red": _team_total(pf, _TEAM2_IDS, "totalGold"),
        "xp_blue": _team_total(pf, _TEAM1_IDS, "xp"),
        "xp_red": _team_total(pf, _TEAM2_IDS, "xp"),
        "alive_t1": sum(hp > 0 for hp in t1_hp),
        "alive_t2": sum(hp > 0 for hp in t2_hp),
        # NOTE(review): health 0 also satisfies "< 200", so dead or missing
        # participants are counted as low HP (unchanged from the original).
        "low_hp_t1": sum(hp < 200 for hp in t1_hp),
        "low_hp_t2": sum(hp < 200 for hp in t2_hp),
        "champs_near_baron_t1": near(BARON_POS, t1_pts),
        "champs_near_baron_t2": near(BARON_POS, t2_pts),
        "champs_near_dragon_t1": near(DRAGON_POS, t1_pts),
        "champs_near_dragon_t2": near(DRAGON_POS, t2_pts),
        "group_t1": grouping_score(t1_pts),
        "group_t2": grouping_score(t2_pts),
        "cluster_density": (cluster_density(t1_pts) + cluster_density(t2_pts)) / 2,
        "distance_between_teams": team_gap,
    }


def _count_events(event_times, ticks, window=None):
    """Count, per tick, the events with time <= tick (and >= tick - window when a window is given)."""
    ordered = np.sort(np.asarray(event_times, dtype="float64"))
    upper = np.searchsorted(ordered, ticks, side="right")
    if window is None:
        return upper.astype("float64")
    lower = np.searchsorted(ordered, ticks - window, side="left")
    return (upper - lower).astype("float64")


def timeline_10s(mid, tl, events_df, tick=10):
    """
    Build the per-tick team feature table for one match.

    Rows are generated every ``tick`` from 0 to the match's ``gameDuration``
    (read from ``RAW_MATCH_DIR/<mid>.json``). Frame-derived columns (gold, xp,
    alive/low-hp counts, positions, grouping) carry the values of the latest
    timeline frame at or before each row's ``t``; rows before the first usable
    frame stay 0. Kill columns are computed from the CHAMPION_KILL rows of
    ``events_df`` for this match.

    Fixes over the previous version: feature columns are created before use (the
    old ``df[col].astype`` on not-yet-existing columns raised KeyError on every
    call); kills whose ``killerId`` is 0 or missing are no longer credited to the
    red team; a missing match file returns ``None`` instead of raising.

    Args:
        mid: Match id.
        tl: Timeline dict with ``tl["info"]["frames"]``.
        events_df: Event table with columns ``match_id``, ``type``, ``t``,
            ``killerId``, ``victimId`` (as produced by process_events()).
        tick: Step between rows, in the unit of ``gameDuration``.

    Returns:
        DataFrame with ``match_id``, ``t`` and the float64 feature columns, or
        ``None`` when the match file is missing or its duration is not positive.
    """
    match_path = RAW_MATCH_DIR / f"{mid}.json"
    if not match_path.exists():
        # download_all() may have stored a timeline whose match download failed.
        return None
    match = json.loads(match_path.read_text())
    dur = int(match["info"].get("gameDuration", 0))
    if dur <= 0:
        return None

    ts = np.arange(0, dur + 1, tick)
    features = {col: np.zeros(len(ts), dtype="float64") for col in _TIMELINE_FEATURE_COLS}

    # Frame-derived features: collect one snapshot per usable frame, then map every
    # tick to the latest frame at or before it (forward fill).
    snapshots = []
    for fr in tl["info"]["frames"]:
        snap = _frame_snapshot(fr)
        if snap is not None:
            snapshots.append((int((fr.get("timestamp") or 0) / 1000), snap))
    if snapshots:
        snapshots.sort(key=lambda item: item[0])  # stable: later duplicates still win
        frame_t = np.array([ft for ft, _ in snapshots])
        latest = np.searchsorted(frame_t, ts, side="right") - 1
        covered = latest >= 0
        for col in snapshots[0][1]:
            per_frame = np.array([snap[col] for _, snap in snapshots], dtype="float64")
            features[col][covered] = per_frame[latest[covered]]

    # Kill-derived features, computed from sorted kill times instead of per-row iterrows.
    needed = {"match_id", "type", "t", "killerId", "victimId"}
    if needed.issubset(events_df.columns):
        kills = events_df[(events_df["match_id"] == mid) & (events_df["type"] == "CHAMPION_KILL")]
        kills = kills.dropna(subset=["t"])
        kill_t = pd.to_numeric(kills["t"], errors="coerce").to_numpy(dtype="float64")
        # Missing ids become 0, which belongs to neither team.
        killer = pd.to_numeric(kills["killerId"], errors="coerce").fillna(0).to_numpy(dtype="float64")
        victim = pd.to_numeric(kills["victimId"], errors="coerce").fillna(0).to_numpy(dtype="float64")

        blue_kill = (killer >= 1) & (killer <= 5)
        red_kill = killer > 5
        blue_death = (victim >= 1) & (victim <= 5)
        red_death = victim > 5

        features["kills_blue"] = _count_events(kill_t[blue_kill], ts)
        features["kills_red"] = _count_events(kill_t[red_kill], ts)
        features["kills_30_t1"] = _count_events(kill_t[blue_kill], ts, window=_KILL_WINDOW)
        features["kills_30_t2"] = _count_events(kill_t[red_kill], ts, window=_KILL_WINDOW)
        features["deaths_30_t1"] = _count_events(kill_t[blue_death], ts, window=_KILL_WINDOW)
        features["deaths_30_t2"] = _count_events(kill_t[red_death], ts, window=_KILL_WINDOW)

    features["gold_diff"] = features["gold_blue"] - features["gold_red"]
    features["xp_diff"] = features["xp_blue"] - features["xp_red"]
    features["alive_diff"] = features["alive_t1"] - features["alive_t2"]

    return pd.DataFrame({"match_id": mid, "t": ts, **features})


def build_timelines(events_df):
    """
    Build the per-tick feature table for every timeline in ``RAW_TL_DIR``.

    Each timeline is passed to timeline_10s(); matches for which it returns
    ``None`` are left out. Timelines without a ``metadata.matchId``, unreadable
    files, and timelines whose match file is missing are reported and skipped
    rather than aborting the run. The combined table is written to ``OUT``.

    Args:
        events_df: Event table forwarded to timeline_10s().

    Returns:
        The concatenated DataFrame; an empty frame with ``match_id`` and ``t``
        columns when no timeline could be built.
    """
    parts = []
    for p in tqdm(list(RAW_TL_DIR.glob("*.json")), desc="Building timelines"):
        try:
            tl = json.loads(p.read_text())
            mid = tl.get("metadata", {}).get("matchId")
            if not mid:
                print(f"Skipping {p.name}: no matchId in metadata")
                continue
            part = timeline_10s(mid, tl, events_df)
        except (OSError, ValueError) as e:
            # OSError: unreadable file or missing match JSON; ValueError: invalid JSON.
            print(f"Skipping {p.name}: {e}")
            continue
        if part is not None:
            parts.append(part)

    if parts:
        df = pd.concat(parts, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return a well-formed empty table instead.
        print("No timelines could be built — check that raw matches and timelines were downloaded.")
        df = pd.DataFrame(columns=["match_id", "t"])
    df.to_csv(OUT, index=False)
    print("Fixed timeline_10s saved")
    return df


# Build and save the per-tick team timelines from the in-memory events table;
# the bare expression previews the result when run as a notebook cell.
timeline_df = build_timelines(events_df)
timeline_df.head()


# **Label Future Objectives and Merge Team Stats**

Marks upcoming objectives and fights within specific time windows and combines these labels with team-level player statistics. This produces a final comprehensive dataset ready for building predictive models.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Reload the three processed tables from disk, so this cell works from the saved CSVs
# rather than from the in-memory frames of earlier cells.
DATA = Path("data/processed")

timeline = pd.read_csv(DATA / "timeline_10s.csv")
events   = pd.read_csv(DATA / "events.csv")
player   = pd.read_csv(DATA / "player_stats.csv")

def extract_objective_times(events):
    """
    Collect, per match, the times of baron kills, dragon kills and team fights.

    A team fight is recorded at the time of every champion kill that is followed
    (inclusive of itself) by at least three champion kills within 10 time units.

    Returns:
        Dict mapping match id to ``{"baron": [...], "dragon": [...], "teamfight": [...]}``.
    """
    obj = {}
    for match_id, match_events in events.groupby("match_id"):
        is_monster_kill = match_events.type == "ELITE_MONSTER_KILL"
        baron_times = match_events[is_monster_kill & (match_events.monsterType == "BARON_NASHOR")]["t"].tolist()
        dragon_times = match_events[is_monster_kill & (match_events.monsterType == "DRAGON")]["t"].tolist()

        kill_times = match_events[match_events.type == "CHAMPION_KILL"]["t"].sort_values().tolist()
        fight_times = [
            start
            for start in kill_times
            if len([other for other in kill_times if start <= other <= start + 10]) >= 3
        ]

        obj[match_id] = {
            "baron": baron_times,
            "dragon": dragon_times,
            "teamfight": fight_times,
        }
    return obj

# Per-match objective / team-fight times derived from the reloaded events table.
objective_times = extract_objective_times(events)

# Look-ahead horizons (same unit as the timeline's `t`) and the event kinds to label;
# one label column "<event>_<window>" is created for every combination below.
WINDOWS=[10,20,30]
EVENTS=["baron","dragon","teamfight"]

def label_future(ts, event_list, w):
    """Return 1 if any time in ``event_list`` lies in the interval (ts, ts + w], else 0."""
    horizon = ts + w
    for event_time in event_list:
        if ts < event_time <= horizon:
            return 1
    return 0

# Add one 0/1 label column per (event, window): 1 when such an event occurs in
# (t, t + window] for that row's match.
rows=[]
for mid,df in timeline.groupby("match_id"):
    df=df.copy()
    # Matches with no recorded events get empty lists, i.e. all-zero labels.
    obj=objective_times.get(mid,{"baron":[],"dragon":[],"teamfight":[]})

    for ev in EVENTS:
        for w in WINDOWS:
            # The lambda is applied immediately, so it sees the current ev / w.
            df[f"{ev}_{w}"]=df["t"].apply(lambda t: label_future(t,obj[ev],w))
    rows.append(df)

labeled_timeline = pd.concat(rows, ignore_index=True)

# Team assignment by row order within each match: the first five player rows are
# labelled "blue", the rest "red" (relies on the row order of player_stats.csv).
player["team"]=player.groupby("match_id").cumcount().apply(
    lambda x:"blue" if x<5 else "red"
)

# Whole-match team aggregates: summed K/D/A, mean damage / healing / warding per player.
team_stats = (
    player.groupby(["match_id","team"])
    .agg({
        "kills":"sum",
        "deaths":"sum",
        "assists":"sum",
        "totdmgdealt":"mean",
        "totdmgtochamp":"mean",
        "totheal":"mean",
        "wardsplaced":"mean",
        "wardskilled":"mean"
    })
    .reset_index()
)
# One row per match; columns are flattened to "<stat>_<team>", e.g. "kills_blue".
team_stats = team_stats.pivot(index="match_id", columns="team")
team_stats.columns = [f"{a}_{b}" for a,b in team_stats.columns]
team_stats.reset_index(inplace=True)

# The merge is on match_id only, so every tick of a match carries the same team stats.
# NOTE(review): these are aggregates over the whole match attached to every tick —
# confirm that is intended for a model that predicts upcoming events.
final = (
    labeled_timeline
    .merge(team_stats,on="match_id",how="left")
    .sort_values(["match_id","t"])
)

final.to_csv(DATA / "final_dataset.csv", index=False)
print(final.shape)
final.head()
