# 04 - One-shot rescrape for a single team

Rescrape the matches of a single team played since a target date, with retries and an error log. Newly fetched matches are merged into the existing `data/raw/data_v2.json`, using the same raw format; the file is rewritten with the merged result.

In [1]:
import sys
import json
from pathlib import Path
from datetime import datetime, timezone
from collections import OrderedDict
import time

def _find_root():
    cand = Path.cwd()
    for c in [cand, *cand.parents]:
        if (c / "src").exists() and (c / "data").exists():
            return c
    return cand

# Resolve the project root once and make it importable, so that the
# `src.` package imports below work regardless of where the notebook runs.
ROOT = _find_root()
if not any(entry == str(ROOT) for entry in sys.path):
    sys.path.append(str(ROOT))
print(f"Using ROOT={ROOT}")

from src.dota_data.api import (
    load_api_key,
    build_session,
    fetch_team_matches,
    filter_matches_since,
    unique_match_ids,
    fetch_match_details,
    wrap_raw_match,
)

# Parameters
TEAM_ID = 9572001  # change as needed
# Cutoff date (UTC); MIN_START_TIME is the same instant as an integer POSIX timestamp.
MIN_DATE = datetime(year=2024, month=1, day=1, tzinfo=timezone.utc)
MIN_START_TIME = int(MIN_DATE.timestamp())
# Raw dataset that gets updated, and the per-team error log for failed fetches.
RAW_PATH = ROOT.joinpath("data", "raw", "data_v2.json")
ERROR_LOG = ROOT.joinpath("data", "interim", f"rescrap_errors_team_{TEAM_ID}.json")

# Create the output directories up front so later writes cannot fail on a missing folder.
ERROR_LOG.parent.mkdir(parents=True, exist_ok=True)
RAW_PATH.parent.mkdir(parents=True, exist_ok=True)

Using ROOT=/home/ju/Documents/Dev/Dota-Datas


In [2]:
# Load existing raw data.
# The file is read as UTF-8 explicitly (the encoding this notebook uses when it
# writes the file back) instead of relying on the platform default encoding.
if RAW_PATH.exists():
    raw_data = json.loads(RAW_PATH.read_text(encoding="utf-8"))
    print(f"Loaded existing raw entries: {len(raw_data)}")
else:
    raw_data = []
    print("No existing raw file, starting fresh")

# Index the existing entries by match_id, preserving file order.
# Entries are expected to look like {"json": {"match_id": <int>, ...}}; anything
# else is left out of the index rather than crashing the loop.
existing_ids = OrderedDict()
for item in raw_data:
    payload = item.get("json") if isinstance(item, dict) else None
    mid = payload.get("match_id") if isinstance(payload, dict) else None
    if isinstance(mid, int):
        existing_ids[mid] = item
print(f"Existing unique match_ids: {len(existing_ids)}")

# The merge step rewrites the raw file from this index, so any entry that is
# not indexed (missing/non-integer match_id, or a duplicate match_id) would be
# lost on rewrite. Make that visible instead of dropping it silently.
not_indexed = len(raw_data) - len(existing_ids)
if not_indexed:
    print(
        f"WARNING: {not_indexed} raw entries are not indexed "
        "(missing/non-integer match_id or duplicate) and will not be kept "
        "if the raw file is rewritten from this index"
    )

Loaded existing raw entries: 14170
Existing unique match_ids: 14170


In [3]:
# Build session
api_key = load_api_key()
session = build_session(api_key)
print("Session ready")

# Fetch the team's match list and keep only the matches that pass the date filter.
matches = filter_matches_since(
    fetch_team_matches(TEAM_ID, session=session),
    MIN_START_TIME,
)
print(f"Team matches since {MIN_DATE.date()}: {len(matches)}")

# Only ids that are not already present in the raw file need their details fetched.
new_match_ids = [
    candidate
    for candidate in unique_match_ids(matches)
    if candidate not in existing_ids
]
print(f"New match_ids to fetch (not already in raw): {len(new_match_ids)}")

Session ready
Team matches since 2024-01-01: 321
New match_ids to fetch (not already in raw): 12


In [4]:
# Fetch details with retries.
# Each attempt re-requests only the ids that failed on the previous attempt.
MAX_RETRIES = 3
SLEEP_BETWEEN = 1.0  # forwarded to fetch_match_details as its `sleep` argument
RETRY_PAUSE = 2  # seconds to wait before starting the next attempt
details = []
last_error = {}  # match_id -> most recent exception reported for that id
remaining = list(new_match_ids)
for attempt in range(1, MAX_RETRIES + 1):
    if not remaining:
        break
    print(f"Attempt {attempt}: fetching {len(remaining)} matches")
    # fetch_match_details returns (fetched_details, failures); failures are (match_id, exception) pairs.
    batch, err = fetch_match_details(remaining, session=session, sleep=SLEEP_BETWEEN)
    details.extend(batch)
    for mid, exc in err:
        last_error[mid] = exc
    remaining = [mid for mid, exc in err]
    # Pause only when another attempt will actually follow.
    if remaining and attempt < MAX_RETRIES:
        time.sleep(RETRY_PAUSE)

# Only ids that never succeeded count as errors. Ids that failed once and then
# succeeded on a retry are not reported, and each failing id appears exactly once
# (with its most recent exception) instead of once per attempt.
errors = [(mid, last_error[mid]) for mid in remaining]

if remaining:
    print(f"Still failing after retries: {len(remaining)}")
else:
    print("All match details fetched")

if errors:
    ERROR_LOG.write_text(
        json.dumps([(mid, str(exc)) for mid, exc in errors], indent=2),
        encoding="utf-8",
    )
    print(f"Logged errors to {ERROR_LOG}")

Attempt 1: fetching 12 matches
All match details fetched


In [6]:
# Wrap and merge, then rewrite the raw file.
# Note: the merged data is held fully in memory; only the serialization is done
# entry by entry so that no single giant JSON string is built.
wrapped = [wrap_raw_match(d) for d in details if isinstance(d, dict) and d.get("match_id")]
print(f"Wrapped new matches: {len(wrapped)}")

out_path = RAW_PATH
out_path.parent.mkdir(parents=True, exist_ok=True)
count_existing = len(existing_ids)
count_new = len(wrapped)
print(f"Existing entries: {count_existing}")
print(f"New entries: {count_new}")

# Start from the existing entries, then add (or overwrite) with the new ones.
combined = existing_ids.copy()
for item in wrapped:
    mid = item["json"].get("match_id")
    combined[mid] = item

# Write to a temporary sibling file first and only then swap it into place.
# Opening the real file with 'w' would truncate it immediately, so an error or
# interruption during the write would destroy the existing raw dataset.
tmp_path = out_path.with_name(out_path.name + ".tmp")
try:
    with tmp_path.open('w', encoding='utf-8') as f:
        f.write('[')
        for position, item in enumerate(combined.values()):
            if position:
                f.write(',')
            f.write(json.dumps(item))
        f.write(']')
    # Same directory as the target, so this is a rename rather than a copy.
    tmp_path.replace(out_path)
finally:
    # After a successful replace the temp file no longer exists; after a
    # failure, remove the partial file so it is not mistaken for real data.
    if tmp_path.exists():
        tmp_path.unlink()
print(f"Wrote updated raw to {out_path} (entries={len(combined)})")


Wrapped new matches: 12
Existing entries: 14158
New entries: 12
Wrote updated raw to /home/ju/Documents/Dev/Dota-Datas/data/raw/data_v2.json (entries=14170)


### Summary
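- Number of entries already in the raw file before the merge
- Number of team matches found since the target date (2024-01-01 in this run)
- Number of new match details fetched and merged into the raw file
- Fetch errors, if any, logged to `data/interim/`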