In [12]:
import requests
import time

match_ids = set()
target_ids = 400      # how many unique IDs we want to *try* downloading later
max_calls = 10        # max # of API calls to /proMatches

for i in range(max_calls):
    print(f"Calling /proMatches ({i+1}/{max_calls})...")
    r = requests.get("https://api.opendota.com/api/proMatches", timeout=10)
    if r.status_code != 200:
        print("  Got status", r.status_code, "- skipping this call")
        continue

    data = r.json()
    before = len(match_ids)
    for m in data:
        mid = m.get("match_id")
        if mid:
            match_ids.add(mid)
    after = len(match_ids)

    print(f"  Unique IDs so far: {after} (+{after - before} new)")

    if len(match_ids) >= target_ids:
        break

    time.sleep(1)  # be polite to the API

match_ids = list(match_ids)
print("\nTotal unique match IDs collected:", len(match_ids))
print("First 10 IDs:", match_ids[:10])


Calling /proMatches (1/10)...
  Unique IDs so far: 100 (+100 new)
Calling /proMatches (2/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (3/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (4/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (5/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (6/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (7/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (8/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (9/10)...
  Unique IDs so far: 100 (+0 new)
Calling /proMatches (10/10)...
  Unique IDs so far: 100 (+0 new)

Total unique match IDs collected: 100
First 10 IDs: [8584435201, 8583921670, 8583598095, 8583288848, 8584158739, 8584676384, 8584771618, 8584162854, 8583851046, 8584067113]


In [14]:
import requests
import pandas as pd
import time

rows = []

success = 0
target_success = 60   # stop after 60 good matches (you can change this)
skipped_status = 0
skipped_players = 0
errors = 0

for idx, mid in enumerate(match_ids):
    if success >= target_success:
        break

    print(f"Fetching match {idx+1}/{len(match_ids)} (ID: {mid})...")

    try:
        r = requests.get(
            f"https://api.opendota.com/api/matches/{mid}",
            timeout=5
        )

        if r.status_code != 200:
            skipped_status += 1
            print(f"  Skipped (status {r.status_code})")
            continue

        match = r.json()

        players = match.get("players")
        if not players or len(players) != 10:
            skipped_players += 1
            print("  Skipped (invalid players list)")
            continue

        # ---- BASIC FEATURES (you can add more later) ----
        row = {
            "match_id": mid,
            "duration": match.get("duration"),
            "radiant_win": int(bool(match.get("radiant_win"))),
            "radiant_score": match.get("radiant_score"),
            "dire_score": match.get("dire_score"),
        }

        radiant_gpm = 0
        dire_gpm = 0
        radiant_xpm = 0
        dire_xpm = 0

        for p in players:
            if p.get("isRadiant"):
                radiant_gpm += p.get("gold_per_min", 0)
                radiant_xpm += p.get("xp_per_min", 0)
            else:
                dire_gpm += p.get("gold_per_min", 0)
                dire_xpm += p.get("xp_per_min", 0)

        row["radiant_gpm_total"] = radiant_gpm
        row["dire_gpm_total"] = dire_gpm
        row["radiant_xpm_total"] = radiant_xpm
        row["dire_xpm_total"] = dire_xpm

        rows.append(row)
        success += 1
        print(f"  ✅ Added (success count = {success})")

    except requests.exceptions.Timeout:
        errors += 1
        print("  Timeout — skipping")
        continue
    except Exception as e:
        errors += 1
        print("  Error:", e)
        continue

    time.sleep(0.3) # small delay so we don't hammer the API

print("\n=== Summary ===")
print("Successful matches:", success)
print("Skipped (status != 200):", skipped_status)
print("Skipped (bad players):", skipped_players)
print("Errors/Timeouts:", errors)

df = pd.DataFrame(rows)
print("\nDataFrame shape:", df.shape)
df.head()


Fetching match 1/100 (ID: 8584435201)...
  ✅ Added (success count = 1)
Fetching match 2/100 (ID: 8583921670)...
  ✅ Added (success count = 2)
Fetching match 3/100 (ID: 8583598095)...
  ✅ Added (success count = 3)
Fetching match 4/100 (ID: 8583288848)...
  ✅ Added (success count = 4)
Fetching match 5/100 (ID: 8584158739)...
  ✅ Added (success count = 5)
Fetching match 6/100 (ID: 8584676384)...
  ✅ Added (success count = 6)
Fetching match 7/100 (ID: 8584771618)...
  ✅ Added (success count = 7)
Fetching match 8/100 (ID: 8584162854)...
  ✅ Added (success count = 8)
Fetching match 9/100 (ID: 8583851046)...
  ✅ Added (success count = 9)
Fetching match 10/100 (ID: 8584067113)...
  ✅ Added (success count = 10)
Fetching match 11/100 (ID: 8583867438)...
  ✅ Added (success count = 11)
Fetching match 12/100 (ID: 8582948406)...
  ✅ Added (success count = 12)
Fetching match 13/100 (ID: 8583689273)...
  ✅ Added (success count = 13)
Fetching match 14/100 (ID: 8583777852)...
  ✅ Added (success count = 

Unnamed: 0,match_id,duration,radiant_win,radiant_score,dire_score,radiant_gpm_total,dire_gpm_total,radiant_xpm_total,dire_xpm_total
0,8584435201,1418,1,52,24,2872,2083,3172,1946
1,8583921670,2006,1,57,15,2582,1761,3637,2236
2,8583598095,1657,1,37,19,2458,1924,3107,2143
3,8583288848,2242,1,44,34,2910,2577,4635,3968
4,8584158739,2982,1,40,48,2828,2585,4178,3776


In [None]:
df.to_csv("dota_pro_sample_100.csv", index=False)
