# 03 - Scrape complet (>= 2025) et export brut v2
Récupère tous les matchs des équipes ciblées joués à partir du 2025-01-01, télécharge leurs détails complets et les enregistre au format brut compatible (`json` + `pairedItem`).

In [6]:
from pathlib import Path
import sys

def _find_root():
    candidates = [Path.cwd()] + list(Path.cwd().parents[:4])
    for cand in candidates:
        if (cand / "src").exists():
            return cand
    return Path.cwd()

# Resolve the project root once; the import cell below relies on it being on
# sys.path to import the ``src`` package.
ROOT = _find_root()
# Append only when absent so re-running this cell does not add duplicates.
if sys.path.count(str(ROOT)) == 0:
    sys.path.append(str(ROOT))
print(f"Using project root: {ROOT}")


Using project root: /home/ju/Documents/Dev/Dota-Datas


In [7]:
from datetime import datetime, timezone
from pathlib import Path
import json
import pandas as pd
from dotenv import load_dotenv

from src.dota_data.api import (
    load_api_key,
    build_session,
    load_team_list,
    fetch_team_matches,
    filter_matches_since,
    unique_match_ids,
    annotate_matches_with_team,
    fetch_matches_chunked,
    wrap_raw_match,
    write_json,
)

# Load variables from a .env file into the process environment first.
load_dotenv()
# NOTE(review): load_env_file=False presumably tells the helper not to load
# the .env file again, since load_dotenv() has just run — confirm against
# src.dota_data.api.
api_key = load_api_key(load_env_file=False)
# Session object reused by the fetch calls in later cells.
session = build_session(api_key)
# The team list is resolved against ROOT, not the working directory.
teams = load_team_list(ROOT / "data/teams_to_look.csv")
print(f"Teams: {len(teams)}")


Teams: 24


In [8]:
# Reuse the team-level matches cached by step 01 when the file is present;
# otherwise fetch them for every team and write the cache.
# NOTE(review): this path is relative to the working directory, whereas the
# team list is loaded relative to ROOT — confirm the kernel runs from ROOT.
raw_path = Path("data/interim/team_matches_raw.json")
if not raw_path.exists():
    # One entry per team: the team record plus its fetched match list.
    team_matches = [
        {"team": team, "matches": fetch_team_matches(team["TeamID"], session=session)}
        for team in teams
    ]
    write_json(team_matches, raw_path)
else:
    team_matches = json.loads(raw_path.read_text())
print(f"Matches team-level chargés: {len(team_matches)} entrées")


Matches team-level chargés: 24 entrées


In [9]:
# Flatten the per-team entries into a single list of rows; each team's match
# list goes through annotate_matches_with_team together with its id and name.
flat_rows = [
    row
    for entry in team_matches
    for row in annotate_matches_with_team(
        entry["matches"],
        entry["team"]["TeamID"],
        entry["team"]["TeamName"],
    )
]
matches_df = pd.DataFrame(flat_rows)

# Cutoff: 2025-01-01 00:00 UTC expressed as an integer Unix timestamp.
cutoff_ts = int(datetime(2025, 1, 1, tzinfo=timezone.utc).timestamp())
recent_rows = filter_matches_since(flat_rows, cutoff_ts)
recent_ids = unique_match_ids(recent_rows)
print(f"Matches >= 2025-01-01 (rows): {len(recent_rows)}, ids uniques: {len(recent_ids)}")
# Last expression of the cell: preview of the first five ids.
recent_ids[:5]


Matches >= 2025-01-01 (rows): 5391, ids uniques: 3879


[8547022523, 8546927307, 8531221816, 8531103673, 8531000461]

In [10]:
# Detailed fetch of every recent match id, delegated to fetch_matches_chunked.
# Output naming: files go to OUT_DIR and are named with PREFIX.
# NOTE(review): OUT_DIR is relative to the working directory, not to ROOT —
# confirm the kernel runs from the project root.
OUT_DIR = Path("data/raw/chunks_v2")
PREFIX = "matches_v2"
# Fetch tuning.
# NOTE(review): RESUME presumably lets a re-run skip chunks already on disk —
# confirm against src.dota_data.api.
CHUNK_SIZE = 100
RESUME = True

summary = fetch_matches_chunked(
    recent_ids,
    session=session,
    out_dir=OUT_DIR,
    prefix=PREFIX,
    chunk_size=CHUNK_SIZE,
    resume=RESUME,
    retry_failed=True,
    sleep=1.0,
    timeout=60,
)
print("Summary:", summary)


[chunk 1/39] fetching 100 matches (ids 8547022523..8359964297)
[chunk 1/39] saved 100 matches -> matches_v2_0000.json
[chunk 2/39] fetching 100 matches (ids 8359476895..8262319980)
[chunk 2/39] saved 100 matches -> matches_v2_0001.json
[chunk 3/39] fetching 100 matches (ids 8262234049..8534290019)
[chunk 3/39] saved 100 matches -> matches_v2_0002.json
[chunk 4/39] fetching 100 matches (ids 8534224202..8315699981)
[chunk 4/39] saved 100 matches -> matches_v2_0003.json
[chunk 5/39] fetching 100 matches (ids 8314111192..8183642521)
[chunk 5/39] saved 100 matches -> matches_v2_0004.json
[chunk 6/39] fetching 100 matches (ids 8181273873..8506088143)
[chunk 6/39] saved 100 matches -> matches_v2_0005.json
[chunk 7/39] fetching 100 matches (ids 8503284863..8239414567)
[chunk 7/39] saved 100 matches -> matches_v2_0006.json
[chunk 8/39] fetching 100 matches (ids 8239331133..8504223773)
[chunk 8/39] saved 100 matches -> matches_v2_0007.json
[chunk 9/39] fetching 100 matches (ids 8502548614..82619

In [11]:
# Combine every chunk file into one raw-compatible payload and save it.
retry_file = OUT_DIR / f"{PREFIX}_retry.json"
# The glob pattern f"{PREFIX}_*.json" also matches the retry file. Leave it
# out of the chunk list so its matches are not loaded twice: it is appended
# explicitly (once) further down.
chunk_files = sorted(
    path
    for path in OUT_DIR.glob(f"{PREFIX}_*.json")
    if path.name != retry_file.name
)
raw_payload = []
for cf in chunk_files:
    raw_payload.extend(json.loads(cf.read_text()))
# Matches stored in the retry file, when one exists, go after the regular chunks.
if retry_file.exists():
    raw_payload.extend(json.loads(retry_file.read_text()))
out_path = Path("data/raw/data_v2.json")
write_json(raw_payload, out_path)
print(f"Brut v2 combiné: {len(raw_payload)} matches -> {out_path} ({out_path.stat().st_size/1024/1024:.2f} MB)")


Brut v2 combiné: 3879 matches -> data/raw/data_v2.json (2299.64 MB)


In [12]:
# Quick sanity stats on the first combined match, for the log.
if not raw_payload:
    print('Aucun match combiné à inspecter.')
else:
    # Each payload item keeps the match data under its 'json' key.
    first_match = raw_payload[0]['json']
    sample_keys = list(first_match)
    print(f"Champs top-level sample: {sample_keys[:15]} ... (total {len(sample_keys)})")
    print(f"Objectives présents: {bool(first_match.get('objectives'))}")


Champs top-level sample: ['version', 'match_id', 'draft_timings', 'teamfights', 'objectives', 'chat', 'radiant_gold_adv', 'radiant_xp_adv', 'pauses', 'cosmetics', 'players', 'leagueid', 'start_time', 'duration', 'series_id'] ... (total 56)
Objectives présents: True
