# 02 - Filtrer >= 2025 et inspecter une game détaillée
Charge les matches bruts, filtre après le 1er janvier 2025, puis récupère le détail d'une game (objectives/series) pour debugger la représentation des séries/maps.

In [1]:
from pathlib import Path
import sys

def _find_root():
    candidates = [Path.cwd()] + list(Path.cwd().parents[:4])
    for cand in candidates:
        if (cand / "src").exists():
            return cand
    return Path.cwd()

ROOT = _find_root()
if str(ROOT) not in sys.path:
    sys.path.append(str(ROOT))
print(f"Using project root: {ROOT}")


Using project root: /home/ju/Documents/Dev/Dota-Datas


In [2]:
from datetime import datetime, timezone
from pathlib import Path
import json
import pandas as pd
from dotenv import load_dotenv

from src.dota_data.api import (
    load_api_key,
    build_session,
    load_team_list,
    fetch_team_matches,
    filter_matches_since,
    unique_match_ids,
    fetch_match_detail,
    annotate_matches_with_team,
    write_json,
)

load_dotenv()
api_key = load_api_key(load_env_file=False)
session = build_session(api_key)
teams = load_team_list(ROOT / "data/teams_to_look.csv")

raw_path = Path("data/interim/team_matches_raw.json")
if raw_path.exists():
    team_matches = json.loads(raw_path.read_text())
else:
    team_matches = []
    for team in teams:
        team_matches.append({"team": team, "matches": fetch_team_matches(team["TeamID"], session=session)})
        raw_path.parent.mkdir(parents=True, exist_ok=True)
        write_json(team_matches, raw_path)
print(f"Matches chargés depuis {raw_path} : {len(team_matches)} entrées")


Matches chargés depuis data/interim/team_matches_raw.json : 24 entrées


In [3]:
flat_rows = []
for entry in team_matches:
    flat_rows.extend(annotate_matches_with_team(entry["matches"], entry["team"]["TeamID"], entry["team"]["TeamName"]))
matches_df = pd.DataFrame(flat_rows)
if not matches_df.empty:
    matches_df["start_dt"] = pd.to_datetime(matches_df["start_time"], unit="s")
matches_df.head()


Unnamed: 0,match_id,radiant_win,radiant_score,dire_score,radiant,duration,start_time,leagueid,league_name,cluster,opposing_team_id,opposing_team_name,opposing_team_logo,_source_team_id,_source_team_name,start_dt
0,8547022523,True,23,19,False,2312,1762520179,17419,SLAM IV,153,9303484,HEROIC,https://cdn.steamusercontent.com/ugc/247198417...,9467224,Aurora Gaming,2025-11-07 12:56:19
1,8546927307,True,31,4,False,1995,1762515793,17419,SLAM IV,153,9303484,HEROIC,https://cdn.steamusercontent.com/ugc/247198417...,9467224,Aurora Gaming,2025-11-07 11:43:13
2,8531221816,True,23,20,False,2310,1761580800,18863,FISSURE PLAYGROUND 2,274,9338413,MOUZ,https://cdn.steamusercontent.com/ugc/149367842...,9467224,Aurora Gaming,2025-10-27 16:00:00
3,8531103673,False,13,31,False,2555,1761576420,18863,FISSURE PLAYGROUND 2,273,9338413,MOUZ,https://cdn.steamusercontent.com/ugc/149367842...,9467224,Aurora Gaming,2025-10-27 14:47:00
4,8531000461,True,23,12,False,1749,1761572880,18863,FISSURE PLAYGROUND 2,272,9338413,MOUZ,https://cdn.steamusercontent.com/ugc/149367842...,9467224,Aurora Gaming,2025-10-27 13:48:00


In [4]:
cutoff_ts = int(datetime(2025, 1, 1, tzinfo=timezone.utc).timestamp())
recent_rows = filter_matches_since(flat_rows, cutoff_ts)
recent_ids = unique_match_ids(recent_rows)
print(f"Matches >= 2025-01-01: {len(recent_rows)} (ids uniques: {len(recent_ids)})")
pd.DataFrame(recent_rows)[["match_id", "start_time", "_source_team_name"]].head()


Matches >= 2025-01-01: 5391 (ids uniques: 3879)


Unnamed: 0,match_id,start_time,_source_team_name
0,8547022523,1762520179,Aurora Gaming
1,8546927307,1762515793,Aurora Gaming
2,8531221816,1761580800,Aurora Gaming
3,8531103673,1761576420,Aurora Gaming
4,8531000461,1761572880,Aurora Gaming


In [5]:
# Duplicats de match_id (même game renvoyée par plusieurs équipes)

dup_df = matches_df[matches_df.duplicated('match_id', keep=False)] if not matches_df.empty else pd.DataFrame()
dup_counts = dup_df['match_id'].value_counts() if not dup_df.empty else pd.Series(dtype=int)
sample_dup_ids = dup_counts.head(3).index.tolist()
print(f"match_id dupliqués: {len(dup_counts)} (affiche les 3 premiers)")
if sample_dup_ids:
    display(dup_df[dup_df['match_id'].isin(sample_dup_ids)].sort_values(['match_id', '_source_team_name', 'start_time']))
else:
    print('Pas de duplicats trouvés')


match_id dupliqués: 3187 (affiche les 3 premiers)


Unnamed: 0,match_id,radiant_win,radiant_score,dire_score,radiant,duration,start_time,leagueid,league_name,cluster,opposing_team_id,opposing_team_name,opposing_team_logo,_source_team_id,_source_team_name,start_dt
2,8531221816,True,23,20,False,2310,1761580800,18863,FISSURE PLAYGROUND 2,274,9338413,MOUZ,https://cdn.steamusercontent.com/ugc/149367842...,9467224,Aurora Gaming,2025-10-27 16:00:00
14981,8531221816,True,23,20,True,2310,1761580800,18863,FISSURE PLAYGROUND 2,274,9467224,Aurora Gaming,https://cdn.steamusercontent.com/ugc/130525837...,9338413,MOUZ,2025-10-27 16:00:00
1,8546927307,True,31,4,False,1995,1762515793,17419,SLAM IV,153,9303484,HEROIC,https://cdn.steamusercontent.com/ugc/247198417...,9467224,Aurora Gaming,2025-11-07 11:43:13
1408,8546927307,True,31,4,True,1995,1762515793,17419,SLAM IV,153,9467224,Aurora Gaming,https://cdn.steamusercontent.com/ugc/130525837...,9303484,HEROIC,2025-11-07 11:43:13
0,8547022523,True,23,19,False,2312,1762520179,17419,SLAM IV,153,9303484,HEROIC,https://cdn.steamusercontent.com/ugc/247198417...,9467224,Aurora Gaming,2025-11-07 12:56:19
1407,8547022523,True,23,19,True,2312,1762520179,17419,SLAM IV,153,9467224,Aurora Gaming,https://cdn.steamusercontent.com/ugc/130525837...,9303484,HEROIC,2025-11-07 12:56:19


In [6]:
sample_id = recent_ids[0] if recent_ids else None
print(f"Sample match id: {sample_id}")
sample_match = fetch_match_detail(sample_id, session=session) if sample_id else {}
if sample_match:
    write_json(sample_match, Path("data/interim/sample_match_detail.json"))
    print("Sample match sauvegardée dans data/interim/sample_match_detail.json")
sample_match


Sample match id: 8547022523
Sample match sauvegardée dans data/interim/sample_match_detail.json


{'version': 22,
 'match_id': 8547022523,
 'draft_timings': [{'order': 1,
   'pick': False,
   'active_team': 2,
   'hero_id': 102,
   'player_slot': None,
   'extra_time': 130,
   'total_time_taken': 0},
  {'order': 2,
   'pick': False,
   'active_team': 3,
   'hero_id': 129,
   'player_slot': None,
   'extra_time': 130,
   'total_time_taken': 0},
  {'order': 3,
   'pick': False,
   'active_team': 3,
   'hero_id': 13,
   'player_slot': None,
   'extra_time': 130,
   'total_time_taken': 0},
  {'order': 4,
   'pick': False,
   'active_team': 3,
   'hero_id': 136,
   'player_slot': None,
   'extra_time': 130,
   'total_time_taken': 0},
  {'order': 5,
   'pick': False,
   'active_team': 3,
   'hero_id': 98,
   'player_slot': None,
   'extra_time': 130,
   'total_time_taken': 0},
  {'order': 6,
   'pick': False,
   'active_team': 3,
   'hero_id': 72,
   'player_slot': None,
   'extra_time': 130,
   'total_time_taken': 0},
  {'order': 7,
   'pick': False,
   'active_team': 3,
   'hero_id': 4

In [7]:
# Focus sur les champs de série / map et objectifs Roshan/Aegis
if sample_match:
    series_keys = ["match_id", "series_id", "series_type", "match_seq_num", "start_time", "leagueid", "radiant_team_id", "dire_team_id", "radiant_name", "dire_name"]
    print({k: sample_match.get(k) for k in series_keys})
    obj_df = pd.DataFrame(sample_match.get("objectives") or [])
    if not obj_df.empty:
        rosh_types = ["CHAT_MESSAGE_ROSHAN_KILL", "CHAT_MESSAGE_AEGIS", "building_kill"]
        if "type" in obj_df.columns:
            display(obj_df[obj_df["type"].isin(rosh_types)][[c for c in ["time", "type", "team", "key", "slot", "player_slot"] if c in obj_df.columns]].head(20))
        display(obj_df)
    else:
        print("Pas d'objectives dans le sample")


{'match_id': 8547022523, 'series_id': 1032436, 'series_type': 1, 'match_seq_num': 7178999777, 'start_time': 1762520179, 'leagueid': 17419, 'radiant_team_id': 9303484, 'dire_team_id': 9467224, 'radiant_name': 'HEROIC', 'dire_name': 'Aurora Gaming'}


Unnamed: 0,time,type,team,key,slot,player_slot
7,617,building_kill,,npc_dota_badguys_tower1_bot,0.0,0.0
10,904,building_kill,,npc_dota_badguys_tower1_mid,1.0,1.0
11,964,building_kill,,npc_dota_badguys_tower1_top,4.0,4.0
12,1140,CHAT_MESSAGE_ROSHAN_KILL,2.0,,,
13,1141,CHAT_MESSAGE_AEGIS,,,1.0,1.0
14,1146,building_kill,,npc_dota_goodguys_tower1_bot,5.0,128.0
15,1209,building_kill,,npc_dota_goodguys_tower1_mid,5.0,128.0
17,1739,CHAT_MESSAGE_ROSHAN_KILL,2.0,,,
18,1739,CHAT_MESSAGE_AEGIS,,,0.0,0.0
21,1926,building_kill,,npc_dota_badguys_tower2_mid,0.0,0.0


Unnamed: 0,time,type,slot,key,player_slot,value,killer,team,unit
0,139,CHAT_MESSAGE_FIRSTBLOOD,5.0,2,128.0,,,,
1,145,CHAT_MESSAGE_COURIER_LOST,,,,25.0,-1.0,2.0,
2,231,CHAT_MESSAGE_COURIER_LOST,,,,35.0,3.0,3.0,
3,287,CHAT_MESSAGE_COURIER_LOST,,,,35.0,131.0,2.0,
4,310,CHAT_MESSAGE_COURIER_LOST,,,,35.0,0.0,3.0,
5,460,CHAT_MESSAGE_COURIER_LOST,,,,45.0,3.0,3.0,
6,539,CHAT_MESSAGE_COURIER_LOST,,,,45.0,3.0,3.0,
7,617,building_kill,0.0,npc_dota_badguys_tower1_bot,0.0,,,,npc_dota_hero_juggernaut
8,629,CHAT_MESSAGE_COURIER_LOST,,,,50.0,131.0,2.0,
9,640,CHAT_MESSAGE_COURIER_LOST,,,,55.0,131.0,2.0,


In [8]:
# Visualiser les différentes games d'une même série (autour du sample)

series_matches = []
if sample_match:
    target_series = sample_match.get('series_id')
    target_league = sample_match.get('leagueid')
    team_set = {sample_match.get('radiant_team_id'), sample_match.get('dire_team_id')}
    candidates = matches_df.copy() if not matches_df.empty else pd.DataFrame()
    if 'opposing_team_id' in candidates.columns:
        candidates = candidates[(candidates['_source_team_id'].isin(team_set)) & (candidates['opposing_team_id'].isin(team_set))]
    if 'leagueid' in candidates.columns and target_league is not None:
        candidates = candidates[candidates['leagueid'] == target_league]
    candidate_ids = list(dict.fromkeys(candidates['match_id'].tolist())) if not candidates.empty else []
    print(f"Candidats même affiche/league: {len(candidate_ids)} (limite 10 pour fetch)")
    for mid in candidate_ids[:10]:
        try:
            m = fetch_match_detail(mid, session=session)
        except Exception as exc:  # noqa: BLE001
            print(f"Fetch KO pour {mid}: {exc}")
            continue
        if target_series is None or m.get('series_id') == target_series:
            series_matches.append({
                'match_id': mid,
                'series_id': m.get('series_id'),
                'series_type': m.get('series_type'),
                'start_time': m.get('start_time'),
                'radiant': m.get('radiant_name'),
                'dire': m.get('dire_name'),
                'radiant_win': m.get('radiant_win'),
                'leagueid': m.get('leagueid'),
                'match_seq_num': m.get('match_seq_num'),
            })
    if series_matches:
        df_series = pd.DataFrame(series_matches)
        df_series['start_dt'] = pd.to_datetime(df_series['start_time'], unit='s')
        df_series = df_series.sort_values(['start_time', 'match_seq_num'])
        df_series['map_num'] = df_series.groupby('series_id').cumcount() + 1
        df_series['radiant_score_series'] = df_series.groupby('series_id')['radiant_win'].cumsum()
        df_series['dire_score_series'] = df_series.groupby('series_id')['radiant_win'].transform(lambda s: s.expanding().count()) - df_series['radiant_score_series']
        display(df_series)
    else:
        print('Aucune autre game trouvée pour cette série (ou pas de series_id sur le sample).')
else:
    print('Pas de sample_match chargé.')


Candidats même affiche/league: 3 (limite 10 pour fetch)


Unnamed: 0,match_id,series_id,series_type,start_time,radiant,dire,radiant_win,leagueid,match_seq_num,start_dt,map_num,radiant_score_series,dire_score_series
1,8546927307,1032436,1,1762515793,HEROIC,Aurora Gaming,True,17419,7178914893,2025-11-07 11:43:13,1,1,0.0
0,8547022523,1032436,1,1762520179,HEROIC,Aurora Gaming,True,17419,7178999777,2025-11-07 12:56:19,2,2,0.0
