# 03 - Full scrape (>= 2024-01-01) et export brut v2
Scrape toutes les games filtrées par date pour les équipes ciblées, récupère les détails complets et enregistre au format brut compatible (`json` + `pairedItem`).

In [1]:
from pathlib import Path
import sys

def _find_root():
    candidates = [Path.cwd()] + list(Path.cwd().parents[:4])
    for cand in candidates:
        if (cand / "src").exists():
            return cand
    return Path.cwd()

# Resolve the project root once and register it on sys.path (only if it is
# not already listed) so that the `src` package can be imported below.
ROOT = _find_root()
root_entry = str(ROOT)
if root_entry not in sys.path:
    sys.path.append(root_entry)
print(f"Using project root: {ROOT}")


Using project root: /home/ju/Documents/Dev/Dota-Datas


In [2]:
from datetime import datetime, timezone
from pathlib import Path
import json
import pandas as pd
from dotenv import load_dotenv

from src.dota_data.api import (
    load_api_key,
    build_session,
    load_team_list,
    fetch_team_matches,
    filter_matches_since,
    unique_match_ids,
    annotate_matches_with_team,
    fetch_matches_chunked,
    wrap_raw_match,
    write_json,
)

# Read the .env file into the process environment before asking for the key.
load_dotenv()
# NOTE(review): load_env_file=False presumably tells load_api_key not to read
# the .env file a second time — confirm against src.dota_data.api.
api_key = load_api_key(load_env_file=False)
session = build_session(api_key)

# The list of teams to scrape lives under the project root.
teams_csv = ROOT / "data/teams_to_look.csv"
teams = load_team_list(teams_csv)
print(f"Teams: {len(teams)}")


Teams: 0


In [3]:
# Load the team-level matches produced by step 01 when a cached file exists;
# otherwise fetch them team by team and cache the result.
# NOTE(review): this path is relative to the working directory, whereas the
# team list is read relative to ROOT — confirm that this is intended.
raw_path = Path("data/interim/team_matches_raw.json")
if raw_path.exists():
    team_matches = json.loads(raw_path.read_text(encoding="utf-8"))
else:
    team_matches = [
        {"team": team, "matches": fetch_team_matches(team["TeamID"], session=session)}
        for team in teams
    ]
    if team_matches:
        write_json(team_matches, raw_path)
    else:
        # Never persist an empty cache: every later run would load it instead
        # of fetching, silently producing zero matches.
        print(f"Aucune équipe à récupérer: cache non écrit ({raw_path})")
print(f"Matches team-level chargés: {len(team_matches)} entrées")


Matches team-level chargés: 173 entrées


In [4]:
# Flatten the per-team entries into one list of rows, each annotated with the
# id and name of the team it was fetched for.
flat_rows = []
for entry in team_matches:
    team_info = entry["team"]
    flat_rows.extend(
        annotate_matches_with_team(entry["matches"], team_info["TeamID"], team_info["TeamName"])
    )
matches_df = pd.DataFrame(flat_rows)

# Single source of truth for the cutoff: the log label below is rendered from
# this value so it can never drift from the filter that is actually applied.
CUTOFF_DATE = datetime(2024, 1, 1, tzinfo=timezone.utc)
cutoff_ts = int(CUTOFF_DATE.timestamp())
recent_rows = filter_matches_since(flat_rows, cutoff_ts)
recent_ids = unique_match_ids(recent_rows)
print(f"Matches >= {CUTOFF_DATE:%Y-%m-%d} (rows): {len(recent_rows)}, ids uniques: {len(recent_ids)}")
recent_ids[:5]


Matches >= 2025-01-01 (rows): 23051, ids uniques: 14158


[8572758153, 8572646777, 8572546517, 8571364449, 8571247569]

In [None]:
# Detailed match retrieval, performed in chunks (sleep=1.0 is passed to the fetcher).
CHUNK_SIZE = 100
OUT_DIR = Path("data/raw/chunks_v2")
PREFIX = "matches_v2"
RESUME = True

# Keyword options forwarded unchanged to fetch_matches_chunked.
fetch_options = dict(
    session=session,
    out_dir=OUT_DIR,
    chunk_size=CHUNK_SIZE,
    resume=RESUME,
    sleep=1.0,
    timeout=60,
    prefix=PREFIX,
    retry_failed=True,
)
summary = fetch_matches_chunked(recent_ids, **fetch_options)
print("Summary:", summary)


[chunk 108/142] fetching 100 matches (ids 8115067937..8238800918)
[chunk 108/142] saved 100 matches -> matches_v2_0107.json
[chunk 109/142] fetching 100 matches (ids 8236639459..8509488345)
[chunk 109/142] saved 100 matches -> matches_v2_0108.json
[chunk 110/142] fetching 100 matches (ids 8509402374..7569457300)
[chunk 110/142] saved 100 matches -> matches_v2_0109.json
[chunk 111/142] fetching 100 matches (ids 7569040773..7826937718)
[chunk 111/142] saved 100 matches -> matches_v2_0110.json
[chunk 112/142] fetching 100 matches (ids 7826869189..7639413823)
[chunk 112/142] saved 100 matches -> matches_v2_0111.json
[chunk 113/142] fetching 100 matches (ids 7636204224..8171532827)
[chunk 113/142] saved 100 matches -> matches_v2_0112.json
[chunk 114/142] fetching 100 matches (ids 8171498115..8403202259)
[chunk 114/142] saved 100 matches -> matches_v2_0113.json
[chunk 115/142] fetching 100 matches (ids 8402021057..8186803129)
[chunk 115/142] saved 100 matches -> matches_v2_0114.json
[chunk 1

In [3]:
def merge_chunk_files(chunk_dir, prefix, out_path):
    """Concatenate the JSON arrays stored in chunk files into one JSON array.

    Chunk files are read one at a time and their items are written straight to
    ``out_path``, so only a single chunk is held in memory at any moment.

    Args:
        chunk_dir: Directory containing ``{prefix}_*.json`` chunk files.
        prefix: File name prefix shared by the chunk files.
        out_path: Destination file; parent directories are created if missing.

    Returns:
        The number of items written to ``out_path``.
    """
    chunk_dir = Path(chunk_dir)
    out_path = Path(out_path)
    retry_file = chunk_dir / f"{prefix}_retry.json"
    # The glob pattern also matches the retry file, so filter it out here and
    # append it once at the end; otherwise its items would be written twice.
    chunk_files = [p for p in sorted(chunk_dir.glob(f"{prefix}_*.json")) if p != retry_file]
    if retry_file.exists():
        chunk_files.append(retry_file)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    count = 0
    with out_path.open("w", encoding="utf-8") as f:
        f.write("[")
        for cf in chunk_files:
            try:
                arr = json.loads(cf.read_text(encoding="utf-8"))
            except (OSError, ValueError) as exc:
                # Best effort: an unreadable or malformed chunk is reported and
                # skipped rather than aborting the whole merge.
                print(f"Skipping {cf} (error {exc})")
                continue
            for item in arr:
                if count:
                    # Separator before every item except the very first one.
                    f.write(",")
                json.dump(item, f)
                count += 1
        f.write("]")
    return count


OUT_DIR = Path("data/raw/chunks_v2")
PREFIX = "matches_v2"

# Stream-merge chunks to avoid high memory usage
out_path = Path("data/raw/data_v2.json")
count = merge_chunk_files(OUT_DIR, PREFIX, out_path)
print(f"Merged {count} matches into {out_path} ({out_path.stat().st_size/1024/1024:.2f} MB)")


Merged 14160 matches into data/raw/data_v2.json (4271.07 MB)


In [12]:
# Quick stats for the log.
# Inspect a sample taken from the first chunk file rather than relying on a
# variable left over from an earlier kernel session; this also avoids loading
# the full merged export into memory.
sample_files = sorted(OUT_DIR.glob(f"{PREFIX}_*.json"))
sample_payload = json.loads(sample_files[0].read_text(encoding="utf-8")) if sample_files else []
if sample_payload:
    first_match = sample_payload[0]['json']
    sample_keys = list(first_match.keys())
    print(f"Champs top-level sample: {sample_keys[:15]} ... (total {len(sample_keys)})")
    print(f"Objectives présents: {bool(first_match.get('objectives'))}")
else:
    print('Aucun match combiné à inspecter.')


Champs top-level sample: ['version', 'match_id', 'draft_timings', 'teamfights', 'objectives', 'chat', 'radiant_gold_adv', 'radiant_xp_adv', 'pauses', 'cosmetics', 'players', 'leagueid', 'start_time', 'duration', 'series_id'] ... (total 56)
Objectives présents: True
