In [1]:
import requests
import time

# ===== API KEY CONFIGURATION =====
# Get your free API key from: https://www.opendota.com/
# The key is read from the OPENDOTA_API_KEY environment variable so the secret
# is never stored in (or committed with) the notebook. Leave the variable unset
# if you don't have an API key (the slower, unauthenticated settings are used).
import os

API_KEY = os.environ.get("OPENDOTA_API_KEY") or None
# ==================================

match_ids = set()          # Unique match IDs collected so far
target_ids = 20000         # Target: 20,000 match IDs to ensure 10M+ float points (~500-1000 floats per match)
max_calls = 200            # Max # of API calls to /proMatches (100 matches per call = 20,000 total)
less_than_match_id = None  # Pagination cursor; None starts with the most recent matches

# Pacing depends on whether an API key is configured (a key allows faster calls)
if API_KEY:
    base_delay = 1  # Seconds between successful calls
    max_retries = 2  # Retries per call after an HTTP 429
    max_consecutive_failures = 5  # Failed calls in a row tolerated before stopping
else:
    base_delay = 5  # Slower without API key
    max_retries = 3
    max_consecutive_failures = 2

consecutive_failures = 0  # Track consecutive failures

for i in range(max_calls):
    print(f"Calling /proMatches ({i+1}/{max_calls})...")

    # Build the request; less_than_match_id pages backwards to older matches
    url = "https://api.opendota.com/api/proMatches"
    params = {}
    if less_than_match_id:
        params["less_than_match_id"] = less_than_match_id
    if API_KEY:
        params["api_key"] = API_KEY

    # Retry loop: only HTTP 429 (rate limited) responses are retried
    retry_count = 0
    success = False
    data = None

    while retry_count <= max_retries and not success:
        try:
            r = requests.get(url, params=params, timeout=10)
        except requests.exceptions.RequestException as exc:
            # A timeout or connection error must not abort the whole cell;
            # treat it like any other failed call.
            print(f"  Request failed ({exc}) - skipping this call")
            break

        if r.status_code == 429:
            # Never wait less than this floor, whatever the server suggests
            min_wait = 10 if API_KEY else 60
            retry_after = r.headers.get('Retry-After')
            try:
                header_wait = int(retry_after) if retry_after else None
            except ValueError:
                # Retry-After may also be an HTTP-date; fall back to backoff
                header_wait = None

            if header_wait is not None:
                wait_time = max(header_wait, min_wait)
                print(f"  Rate limited! Waiting {wait_time} seconds (from Retry-After header)...")
            else:
                # Exponential backoff: 10s, 20s, 40s with a key; 60s, 120s, 240s without
                wait_time = min_wait * (2 ** retry_count)
                print(f"  Rate limited! Waiting {wait_time} seconds (exponential backoff, attempt {retry_count + 1}/{max_retries + 1})...")

            time.sleep(wait_time)
            retry_count += 1
            continue

        if r.status_code != 200:
            print(f"  Got status {r.status_code} - skipping this call")
            break  # Non-429 errors are not retried

        try:
            data = r.json()
        except ValueError:
            print("  Response was not valid JSON - skipping this call")
            break

        # Success!
        success = True
        consecutive_failures = 0  # Reset failure counter on success

    # If we didn't get a successful response, skip this iteration
    if not success:
        consecutive_failures += 1
        print(f"  Failed after retries, skipping... (consecutive failures: {consecutive_failures})")

        # If too many consecutive failures, stop to avoid wasting time
        if consecutive_failures >= max_consecutive_failures:
            print(f"\n⚠️  Stopping: {max_consecutive_failures} consecutive failures. API is heavily rate-limited.")
            print(f"   Collected {len(match_ids)} match IDs so far.")
            print("   RECOMMENDATIONS:")
            print("   1. Wait 10-15 minutes before trying again (rate limit window may reset)")
            print("   2. Consider getting an OpenDota API key for higher rate limits")
            print("   3. Reduce max_calls and target_ids to collect data more slowly")
            print("   4. Run this cell multiple times over several hours/days")
            break

        # Add extra delay after a failure to give API time to recover (shorter with API key)
        wait_after_failure = base_delay * 2 if not API_KEY else base_delay
        print(f"  Waiting {wait_after_failure} seconds before next attempt...")
        time.sleep(wait_after_failure)
        continue

    # An empty page means there is nothing older left to fetch
    if not data:
        print("  No more matches available")
        break

    before = len(match_ids)
    oldest_match_id = None

    for m in data:
        mid = m.get("match_id")
        if mid:
            match_ids.add(mid)
            # Track the oldest (smallest) match_id for pagination
            if oldest_match_id is None or mid < oldest_match_id:
                oldest_match_id = mid

    after = len(match_ids)
    print(f"  Unique IDs so far: {after} (+{after - before} new)")

    if len(match_ids) >= target_ids:
        break

    # Update pagination parameter to get older matches
    if oldest_match_id:
        less_than_match_id = oldest_match_id
    else:
        break  # Page contained no usable match_id, so the cursor cannot advance

    time.sleep(base_delay)  # be polite to the API

# Later cells index into match_ids, so convert the set to a list
match_ids = list(match_ids)
print("\nTotal unique match IDs collected:", len(match_ids))
print("First 10 IDs:", match_ids[:10])


Calling /proMatches (1/200)...
  Unique IDs so far: 100 (+100 new)
Calling /proMatches (2/200)...
  Unique IDs so far: 200 (+100 new)
Calling /proMatches (3/200)...
  Unique IDs so far: 300 (+100 new)
Calling /proMatches (4/200)...
  Unique IDs so far: 400 (+100 new)
Calling /proMatches (5/200)...
  Unique IDs so far: 500 (+100 new)
Calling /proMatches (6/200)...
  Unique IDs so far: 600 (+100 new)
Calling /proMatches (7/200)...
  Unique IDs so far: 700 (+100 new)
Calling /proMatches (8/200)...
  Unique IDs so far: 800 (+100 new)
Calling /proMatches (9/200)...
  Unique IDs so far: 900 (+100 new)
Calling /proMatches (10/200)...
  Unique IDs so far: 1000 (+100 new)
Calling /proMatches (11/200)...
  Unique IDs so far: 1100 (+100 new)
Calling /proMatches (12/200)...
  Unique IDs so far: 1200 (+100 new)
Calling /proMatches (13/200)...
  Unique IDs so far: 1300 (+100 new)
Calling /proMatches (14/200)...
  Unique IDs so far: 1400 (+100 new)
Calling /proMatches (15/200)...
  Unique IDs so far:

In [2]:
import pandas as pd
import numpy as np

class DataPreprocessing:
    """Class to extract and organize Dota 2 match data from OpenDota API responses.

    Every ``get_*`` method takes one match dict (the parsed JSON of a single
    match) and appends the rows it extracts to one of the DataFrame attributes
    ``matches``, ``players``, ``objectives``, ``advantages``, ``events``,
    ``abilities`` or ``wards``.

    NOTE(review): each append builds a new DataFrame with ``pd.concat``, so the
    total cost grows with the size of the table on every call; buffer rows and
    concatenate once if this becomes a bottleneck.
    """

    _MATCH_FIELDS = ('match_id', 'match_seq_num', 'patch', 'region', 'start_time', 'duration',
                     'game_mode', 'skill', 'first_blood_time', 'barracks_status_dire',
                     'barracks_status_radiant', 'tower_status_dire', 'tower_status_radiant',
                     'dire_score', 'radiant_score', 'radiant_win')
    _OBJECTIVE_FIELDS = ('time', 'type', 'unit', 'key', 'slot', 'player_slot')
    _PLAYER_FIELDS = ('player_slot', 'account_id', 'hero_id', 'kills', 'deaths',
                      'assists', 'last_hits', 'denies', 'gold_per_min', 'xp_per_min',
                      'gold_spent', 'hero_damage', 'hero_healing', 'tower_damage',
                      'level', 'party_size', 'item_0', 'item_1', 'item_2', 'item_3',
                      'item_4', 'item_5', 'camps_stacked', 'creeps_stacked', 'obs_placed', 'sen_placed',
                      'purchase_tpscroll', 'rune_pickups', 'roshans_killed', 'towers_killed', 'win')
    # (match key, value stored in the 'gold_or_xp' column)
    _ADVANTAGE_SERIES = (('radiant_gold_adv', 0), ('radiant_xp_adv', 1))
    # (player log key, value stored in the 'event' column), in output order
    _EVENT_LOGS = (('buyback_log', 'buyback'), ('kills_log', 'kill'),
                   ('runes_log', 'rune'), ('purchase_log', 'purchase'))
    # (player log key, value stored in the 'type' column): 0 = observer, 1 = sentry
    _WARD_LOGS = (('obs_log', 0), ('sen_log', 1))
    _MAX_SKILL_UPGRADES = 25

    def __init__(self):
        # Initialize tables as empty dataframes
        self.matches = pd.DataFrame()
        self.players = pd.DataFrame()
        self.objectives = pd.DataFrame()
        self.advantages = pd.DataFrame()
        self.events = pd.DataFrame()
        self.abilities = pd.DataFrame()
        self.wards = pd.DataFrame()

    def _append(self, table, data):
        """Append *data* (anything ``pd.DataFrame`` accepts) to the table attribute named *table*.

        Does nothing when *data* is empty.
        """
        if not data:
            return
        new_df = pd.DataFrame(data)
        current = getattr(self, table)
        if current.empty:
            # Assign directly instead of concatenating with an empty frame:
            # pandas deprecated ignoring empty frames when resolving concat dtypes.
            setattr(self, table, new_df)
        else:
            setattr(self, table, pd.concat([current, new_df], ignore_index=True))

    @staticmethod
    def _player_key(match, player):
        """Return the identifying columns shared by all per-player rows."""
        return {
            'match_id': match['match_id'],
            'account_id': player.get('account_id'),
            'player_slot': player.get('player_slot'),
            'hero_id': player.get('hero_id'),
        }

    def get_match(self, match):
        """Get general information from the match and append to self.matches."""
        # Missing keys are stored as None so every match yields exactly one row
        self._append('matches', {key: [match.get(key)] for key in self._MATCH_FIELDS})

    def get_match_objectives(self, match):
        """Get game objectives like Roshan and towers and append to self.objectives dataframe."""
        rows = [
            {'match_id': match['match_id'],
             **{field: item.get(field, np.nan) for field in self._OBJECTIVE_FIELDS}}
            for item in match.get('objectives') or ()
        ]
        self._append('objectives', rows)

    def get_match_advantages(self, match):
        """Get radiant gold and xp advantage for each minute and append to self.advantages dataframe.

        The list index is stored as ``minute``; ``gold_or_xp`` is 0 for gold
        and 1 for XP.
        """
        rows = [
            {'match_id': match['match_id'], 'minute': minute,
             'gold_or_xp': code, 'value': int(value)}
            for series, code in self._ADVANTAGE_SERIES
            for minute, value in enumerate(match.get(series) or ())
        ]
        self._append('advantages', rows)

    def get_players_events(self, match):
        """Get events for each player (kills, runes, bb and purchases) and append to self.events."""
        rows = []
        for player in match.get('players') or ():
            for log_name, event_name in self._EVENT_LOGS:
                for entry in player.get(log_name) or ():
                    rows.append({
                        **self._player_key(match, player),
                        'time': entry.get('time'),
                        # Buybacks carry no key; the column is always NaN for them
                        'key': np.nan if event_name == 'buyback' else entry.get('key'),
                        'event': event_name,
                    })
        self._append('events', rows)

    def get_ability_upgrades(self, match):
        """Get skill upgrades for each player. Columns goes from 1 to 25 for each possible skill upgrade.

        Upgrades beyond the 25th are dropped; unused columns are NaN.
        """
        rows = []
        for player in match.get('players') or ():
            upgrades = player.get('ability_upgrades_arr')
            if not upgrades:
                continue
            row = self._player_key(match, player)
            for n in range(1, self._MAX_SKILL_UPGRADES + 1):
                row[f'skill_upgrade_{n}'] = np.nan
            # zip stops after 25 entries, truncating longer upgrade lists
            for n, ability in zip(range(1, self._MAX_SKILL_UPGRADES + 1), upgrades):
                row[f'skill_upgrade_{n}'] = ability
            rows.append(row)
        self._append('abilities', rows)

    def get_wards(self, match):
        """Get time, position, slot and hero for each ward placed and append to self.wards dataframe."""
        rows = []
        for player in match.get('players') or ():
            for log_name, ward_type in self._WARD_LOGS:
                for item in player.get(log_name) or ():
                    rows.append({
                        **self._player_key(match, player),
                        'time': item.get('time'),
                        'x': item.get('x'),
                        'y': item.get('y'),
                        'type': ward_type,
                    })
        self._append('wards', rows)

    def get_players(self, match):
        """Get match information for each player and append to self.players dataframe."""
        rows = [
            {'match_id': match['match_id'],
             **{field: item.get(field, np.nan) for field in self._PLAYER_FIELDS}}
            for item in match.get('players') or ()
        ]
        self._append('players', rows)

    def get_all_current_match_tables(self, match_details):
        """Get all tables from a current match."""
        self.get_match(match_details)
        self.get_players(match_details)
        self.get_match_objectives(match_details)
        self.get_match_advantages(match_details)
        self.get_ability_upgrades(match_details)
        self.get_players_events(match_details)
        self.get_wards(match_details)

# Initialize the data preprocessor: a single shared DataPreprocessing instance
# (all of its tables start out as empty DataFrames)
preprocessor = DataPreprocessing()


In [3]:
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock

# ===== API KEY CONFIGURATION =====
# Get your free API key from: https://www.opendota.com/
# The key is read from the OPENDOTA_API_KEY environment variable so the secret
# is never stored in (or committed with) the notebook. Leave the variable unset
# if you don't have an API key (will use lower rate limits).
import os

API_KEY = os.environ.get("OPENDOTA_API_KEY") or None
# ==================================

# Shared counters: plain ints are not thread-safe on their own, so every
# read-modify-write must happen while holding counter_lock.
success = 0
target_success = 15000   # Target: 15,000 matches to ensure 10M+ float points (~500-1000 floats per match)
skipped_status = 0
skipped_players = 0
errors = 0
counter_lock = Lock()

# Pacing depends on whether an API key is configured (a key allows faster calls)
max_retries = 2  # Retries per match after an HTTP 429 (same with or without key)
if API_KEY:
    base_delay = 0.1  # Very fast with API key
    max_workers = 10  # More concurrent requests with API key
else:
    base_delay = 0.3  # Slower without API key
    max_workers = 5  # Fewer concurrent requests without API key

def fetch_match(mid, idx, total):
    """Fetch a single match with retry logic.

    Only HTTP 429 (rate limited) responses are retried, up to ``max_retries``
    times; the wait comes from the ``Retry-After`` header when it is a number
    of seconds, otherwise from exponential backoff based on ``base_delay``.

    Every failure is recorded in exactly one module-level counter (updated
    under ``counter_lock``):
      * ``errors``          - network error/timeout or a non-JSON body
      * ``skipped_status``  - non-200 status, or still rate limited after all retries
      * ``skipped_players`` - response without exactly 10 players

    Args:
        mid: ID of the match to download.
        idx: Position of the match in the work list (unused; kept for callers).
        total: Size of the work list (unused; kept for callers).

    Returns:
        The parsed match dict, or ``None`` when the match was skipped or failed.
    """
    global skipped_status, skipped_players, errors

    params = {'api_key': API_KEY} if API_KEY else {}
    match = None

    for attempt in range(max_retries + 1):
        try:
            r = requests.get(
                f"https://api.opendota.com/api/matches/{mid}",
                params=params,
                timeout=5
            )
        except requests.exceptions.RequestException:
            # Covers timeouts as well as connection errors
            with counter_lock:
                errors += 1
            return None

        if r.status_code == 429:
            retry_after = r.headers.get('Retry-After')
            try:
                header_wait = int(retry_after) if retry_after else None
            except ValueError:
                # Retry-After may also be an HTTP-date; fall back to backoff
                header_wait = None

            if header_wait is not None:
                # Minimum 5s with API key, 10s without
                wait_time = max(header_wait, 5 if API_KEY else 10)
            else:
                # Exponential backoff - shorter with API key
                wait_time = (2 ** attempt) * base_delay * (2 if API_KEY else 4)

            time.sleep(wait_time)
            continue

        if r.status_code != 200:
            with counter_lock:
                skipped_status += 1
            return None  # Skip this match

        try:
            match = r.json()
        except ValueError:
            with counter_lock:
                errors += 1
            return None
        break
    else:
        # Rate limited on every attempt: count it so the match is not silently lost
        with counter_lock:
            skipped_status += 1
        return None

    # Validate players
    players = match.get("players") if isinstance(match, dict) else None
    if not players or len(players) != 10:
        with counter_lock:
            skipped_players += 1
        return None

    # Return match data for processing
    return match

# Download matches concurrently; the DataFrames are filled on the main thread
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit enough tasks (submit more than target to account for failures)
    tasks_to_submit = min(len(match_ids), target_success * 2)
    future_to_match = {
        executor.submit(fetch_match, mid, idx, len(match_ids)): (idx, mid)
        for idx, mid in enumerate(match_ids[:tasks_to_submit])
    }

    # Process completed tasks
    for future in as_completed(future_to_match):
        with counter_lock:
            current_success = success
        if current_success >= target_success:
            # Cancel everything still queued; otherwise leaving the `with`
            # block would wait for all remaining downloads to run.
            for pending in future_to_match:
                pending.cancel()
            break

        # Drop our reference so the parsed match can be freed once processed
        idx, mid = future_to_match.pop(future)

        try:
            match = future.result()
        except Exception as e:
            with counter_lock:
                errors += 1
            print(f"  Exception for match {idx+1}: {e}")
            continue

        if match is None:
            continue  # fetch_match already recorded why it was skipped

        # Process the match data
        try:
            preprocessor.get_all_current_match_tables(match)
        except Exception as e:
            with counter_lock:
                errors += 1
            print(f"  Error processing match {idx+1}: {e}")
            continue

        with counter_lock:
            success += 1
            current_success = success

        # Print progress every 100 matches or at milestones
        if current_success % 100 == 0 or current_success in [1, 10, 50, 100, 500, 1000, 5000, 10000]:
            progress_pct = (current_success / target_success) * 100
            print(f"  ✅ Match {idx+1} (ID: {mid}) - Progress: {current_success:,}/{target_success:,} ({progress_pct:.1f}%)")
        elif current_success <= 10:
            print(f"  ✅ Match {idx+1} (ID: {mid}) - Success count: {current_success}")

print("\n=== Summary ===")
print("Successful matches:", success)
print("Skipped (status != 200):", skipped_status)
print("Skipped (bad players):", skipped_players)
print("Errors/Timeouts:", errors)

# Every table produced by the preprocessor, in reporting order
tables = [
    ('matches', preprocessor.matches),
    ('players', preprocessor.players),
    ('objectives', preprocessor.objectives),
    ('advantages', preprocessor.advantages),
    ('events', preprocessor.events),
    ('abilities', preprocessor.abilities),
    ('wards', preprocessor.wards)
]

print("\n=== DataFrames Created ===")
for df_name, df in tables:
    print(f"{df_name.capitalize()}: {len(df)} rows")

# Calculate total float points: rows x numeric columns, summed over all tables
float_target = 10_000_000
total_floats = 0
for df_name, df in tables:
    if len(df) > 0:
        # Count numeric columns (float/int types)
        numeric_cols = df.select_dtypes(include=['float64', 'int64', 'float32', 'int32']).shape[1]
        floats_in_df = len(df) * numeric_cols
        total_floats += floats_in_df
        print(f"  {df_name}: ~{floats_in_df:,} floats ({len(df)} rows × {numeric_cols} numeric cols)")

print(f"\n{'='*50}")
print(f"📊 TOTAL FLOAT POINTS: {total_floats:,}")
print(f"   Target: {float_target:,}")
if total_floats >= float_target:
    print(f"   ✅ REQUIREMENT MET! ({total_floats/float_target:.2f}x target)")
elif total_floats > 0:
    # Extrapolate from the average number of floats per collected match
    missing_floats = float_target - total_floats
    more_matches = missing_floats / total_floats * len(preprocessor.matches)
    print(f"   ⚠️  Need {missing_floats:,} more floats ({more_matches:.0f} more matches estimated)")
else:
    # Nothing collected: no per-match average to extrapolate from
    print(f"   ⚠️  Need {float_target:,} more floats (no data collected, cannot estimate matches)")
print(f"{'='*50}")

# Display the matches dataframe as the main output
print("\n=== Matches DataFrame ===")
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
preprocessor.matches


  ‚úÖ Match 2 (ID: 8461484041) - Progress: 1/15,000 (0.0%)
  ‚úÖ Match 3 (ID: 8466464796) - Success count: 2
  ‚úÖ Match 10 (ID: 8427536461) - Success count: 3
  ‚úÖ Match 8 (ID: 8333951043) - Success count: 4
  ‚úÖ Match 1 (ID: 8529379331) - Success count: 5
  ‚úÖ Match 11 (ID: 8267890776) - Success count: 6
  ‚úÖ Match 6 (ID: 8253997110) - Success count: 7
  ‚úÖ Match 4 (ID: 8506048541) - Success count: 8
  ‚úÖ Match 7 (ID: 8452440129) - Success count: 9
  ‚úÖ Match 13 (ID: 8285847659) - Progress: 10/15,000 (0.1%)
  ‚úÖ Match 54 (ID: 8390574398) - Progress: 50/15,000 (0.3%)
  ‚úÖ Match 98 (ID: 8313504296) - Progress: 100/15,000 (0.7%)
  ‚úÖ Match 203 (ID: 8541045969) - Progress: 200/15,000 (1.3%)
  ‚úÖ Match 298 (ID: 8407877561) - Progress: 300/15,000 (2.0%)
  ‚úÖ Match 397 (ID: 8563853963) - Progress: 400/15,000 (2.7%)
  ‚úÖ Match 501 (ID: 8551271705) - Progress: 500/15,000 (3.3%)
  ‚úÖ Match 601 (ID: 8459915275) - Progress: 600/15,000 (4.0%)
  ‚úÖ Match 707 (ID: 8492421869) - Progr

Unnamed: 0,match_id,match_seq_num,patch,region,start_time,duration,game_mode,skill,first_blood_time,barracks_status_dire,barracks_status_radiant,tower_status_dire,tower_status_radiant,dire_score,radiant_score,radiant_win
0,8461484041,7107233430,58,8,1757855221,931,2,,159,63,63,1974,2047,2,30,True
1,8466464796,7111457840,58,8,1758126389,2236,2,,14,63,63,1824,1974,21,46,True
2,8427536461,7078743852,58,3,1755843728,1751,2,,154,48,63,384,1983,20,35,True
3,8333951043,7000954708,58,3,1749884865,2228,2,,110,63,51,1830,902,51,39,False
4,8529379331,7164154889,58,3,1761483937,1916,2,,0,49,63,1792,1847,14,22,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,8281489015,6958028177,57,3,1746448775,2005,2,,110,63,63,390,1972,34,39,True
14996,8346893969,7011577920,58,3,1750695617,1974,2,,231,63,0,1982,1536,23,11,False
14997,8382938724,7041332102,58,3,1753070696,1975,2,,208,63,3,1956,518,42,32,False
14998,8307572410,6979383660,58,8,1748183574,1904,2,,99,3,63,4,2039,12,49,True


In [None]:
import os

# Create Data_Individual folder if it doesn't exist
output_dir = "Data_Individual"
os.makedirs(output_dir, exist_ok=True)

# Save every preprocessor table to <output_dir>/dota_pro_<table>.csv
for table_name in ("matches", "players", "objectives", "advantages",
                   "events", "abilities", "wards"):
    table = getattr(preprocessor, table_name)
    table.to_csv(f"{output_dir}/dota_pro_{table_name}.csv", index=False)

print("✅ All individual dataframes saved to Data_Individual folder!")


All dataframes saved to CSV files!


In [None]:
# Combine all dataframes into one comprehensive CSV file (one row per match)
import os

# How each player statistic is aggregated per team; also fixes the column order
TEAM_STATS = {
    'kills': 'sum',
    'deaths': 'sum',
    'assists': 'sum',
    'last_hits': 'sum',
    'denies': 'sum',
    'gold_per_min': 'sum',
    'xp_per_min': 'sum',
    'gold_spent': 'sum',
    'hero_damage': 'sum',
    'hero_healing': 'sum',
    'tower_damage': 'sum',
    'level': 'mean',
    'obs_placed': 'sum',
    'sen_placed': 'sum',
    'roshans_killed': 'sum',
    'towers_killed': 'sum'
}
# Column names for the integer codes stored in the advantages / wards tables
ADVANTAGE_COLUMNS = {0: 'final_gold_advantage', 1: 'final_xp_advantage'}
WARD_COLUMNS = {0: 'observer_wards_placed', 1: 'sentry_wards_placed'}


def _count_pivot(df, column, make_name):
    """Return one row per match_id with a count column for each value of *column*.

    *make_name* maps a value of *column* to the name of its count column.
    """
    counts = df.groupby(['match_id', column]).size().reset_index(name='count')
    pivot = counts.pivot(index='match_id', columns=column, values='count').fillna(0)
    pivot.columns = [make_name(value) for value in pivot.columns]
    return pivot.reset_index()


# Start with matches as the base
combined_df = preprocessor.matches.copy()

# Aggregate player statistics by team
if len(preprocessor.players) > 0:
    # Separate radiant and dire players
    players_df = preprocessor.players.copy()
    players_df['is_radiant'] = players_df['player_slot'] < 5

    # Aggregate by match and team
    player_agg = players_df.groupby(['match_id', 'is_radiant']).agg(TEAM_STATS).reset_index()

    # Pivot to get radiant and dire columns
    player_pivot = player_agg.pivot(index='match_id', columns='is_radiant',
                                    values=list(TEAM_STATS))

    # Flatten column names: (stat, is_radiant) -> stat_radiant / stat_dire
    player_pivot.columns = [f"{col[0]}_{'radiant' if col[1] else 'dire'}" for col in player_pivot.columns]
    player_pivot = player_pivot.reset_index()

    # Merge with combined_df
    combined_df = combined_df.merge(player_pivot, on='match_id', how='left')

# Aggregate objectives (count by type)
if len(preprocessor.objectives) > 0:
    obj_pivot = _count_pivot(preprocessor.objectives, 'type',
                             lambda value: f"objective_type_{value}")
    combined_df = combined_df.merge(obj_pivot, on='match_id', how='left')

# Aggregate advantages (get final advantage values)
if len(preprocessor.advantages) > 0:
    # Last row per (match, series) in table order
    final_adv = preprocessor.advantages.groupby(['match_id', 'gold_or_xp']).last().reset_index()
    final_adv_pivot = final_adv.pivot(index='match_id', columns='gold_or_xp', values='value')
    # Name columns by their code, so a missing series cannot shift or break the labels
    final_adv_pivot.columns = [ADVANTAGE_COLUMNS.get(code, f"final_advantage_{code}")
                               for code in final_adv_pivot.columns]
    final_adv_pivot = final_adv_pivot.reset_index()
    combined_df = combined_df.merge(final_adv_pivot, on='match_id', how='left')

# Aggregate events (count by event type)
if len(preprocessor.events) > 0:
    event_pivot = _count_pivot(preprocessor.events, 'event',
                               lambda value: f"event_{value}_count")
    combined_df = combined_df.merge(event_pivot, on='match_id', how='left')

# Aggregate wards (count by type)
if len(preprocessor.wards) > 0:
    # Name columns by ward type code, so a missing type cannot shift or break the labels
    ward_pivot = _count_pivot(preprocessor.wards, 'type',
                              lambda value: WARD_COLUMNS.get(value, f"ward_type_{value}_placed"))
    combined_df = combined_df.merge(ward_pivot, on='match_id', how='left')

# Aggregate abilities (count skill upgrades per match)
if len(preprocessor.abilities) > 0:
    # Count non-null skill upgrades per player, then sum per match
    abilities_df = preprocessor.abilities.copy()
    skill_cols = [col for col in abilities_df.columns if col.startswith('skill_upgrade_')]
    abilities_df['total_skill_upgrades'] = abilities_df[skill_cols].notna().sum(axis=1)
    abilities_agg = abilities_df.groupby('match_id')['total_skill_upgrades'].sum().reset_index()
    abilities_agg.columns = ['match_id', 'total_skill_upgrades']
    combined_df = combined_df.merge(abilities_agg, on='match_id', how='left')

# Create Data_Combined folder if it doesn't exist
os.makedirs("Data_Combined", exist_ok=True)

# Save combined dataframe to Data_Combined folder
combined_df.to_csv("Data_Combined/dota_pro_combined.csv", index=False)
print(f"\n✅ Combined CSV saved to Data_Combined folder! Shape: {combined_df.shape}")
print(f"Columns: {len(combined_df.columns)}")
print(f"\nFirst few columns: {list(combined_df.columns[:10])}...")



‚úÖ Combined CSV saved! Shape: (15000, 65)
Columns: 65

First few columns: ['match_id', 'match_seq_num', 'patch', 'region', 'start_time', 'duration', 'game_mode', 'skill', 'first_blood_time', 'barracks_status_dire']...
