In [8]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Initialize the driver
options = webdriver.ChromeOptions()
options.add_argument('--start-maximized')  # Ensure the page loads fully
driver = webdriver.Chrome(options=options)

try:
    # Navigate to the heroes page
    heroes_url = "https://www.dota2.com/heroes"
    driver.get(heroes_url)

    # Initialize a set to store unique hero links
    hero_links = set()

    # Keep scrolling until all hero links are found
    while True:
        # Find all hero link elements
        current_hero_elements = driver.find_elements(By.CSS_SELECTOR, "a[href*='/hero/']")
        current_links = {elem.get_attribute("href") for elem in current_hero_elements}
        
        # Add the current links to the set
        previous_count = len(hero_links)
        hero_links.update(current_links)

        # If no new links are added, assume we've scrolled to the bottom
        if len(hero_links) == previous_count:
            break

        # Scroll down by a large increment
        driver.execute_script("window.scrollBy(0, 1000);")
        time.sleep(1)  # Give time for dynamic loading

    # Convert set to a sorted list for consistent output
    hero_links = sorted(hero_links)

    # Print all hero links
    print(f"Collected {len(hero_links)} hero links:")
    #for link in hero_links:
        #print(link)

except Exception as e:
    print(f"An error occurred: {e}")

finally:
    driver.quit()


Collected 126 hero links:


In [21]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# Initialize the driver (adjust the executable_path to your ChromeDriver location)
driver = webdriver.Chrome()

# Dictionary to store all hero information
heroes_info = {}

def wait_with_retry(driver, timeout, locator):
    retries = 5
    for _ in range(retries):
        try:
            return WebDriverWait(driver, timeout).until(EC.presence_of_element_located(locator))
        except TimeoutException:
            print("Retrying...")
            driver.refresh()  # Refresh the page if timeout occurs
    raise TimeoutException("Element not found after retries")


try:
    # Iterate over each hero link
    for hero_url in hero_links: 
        driver.get(hero_url)
        WebDriverWait(driver, 15).until(
        lambda d: d.execute_script("return document.readyState") == "complete") # wait untill the page loads
        hero_data = {}
        
        # Scroll to the hero's general details
        try:
            WebDriverWait(driver, 20).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            WebDriverWait(driver, 20).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
            
            # Get hero attribute (Strength/Agility/Intelligence/Universal)
            attribute_element = wait_with_retry(driver, 100, (By.CLASS_NAME, "_3HGWJjSyOjmlUGJTIlMHc_"))
            driver.execute_script("arguments[0].scrollIntoView(true);", attribute_element)
            WebDriverWait(driver, 10).until(EC.visibility_of(attribute_element))
            hero_data['Attribute'] = attribute_element.text.strip()
            
            # Get hero name
            hero_name = wait_with_retry(driver, 100, (By.CLASS_NAME, "_2IcIujaWiO5h68dVvpO_tQ"))
            hero_name_text = hero_name.text.strip()
            driver.execute_script("arguments[0].scrollIntoView(true);", hero_name)
            WebDriverWait(driver, 10).until(EC.visibility_of(hero_name))
            #print(f"hero name: {hero_name_text}")

            
            
            # Get attack type (Melee/Ranged)
            try:
                attack_type_element = wait_with_retry(driver, 100, (By.CLASS_NAME, "_3ce-DKDrVB7q5LsGbJdZ3X")) 
                driver.execute_script("arguments[0].scrollIntoView(true);", attack_type_element)
                WebDriverWait(driver, 10).until(EC.visibility_of(attack_type_element))  
                hero_data['Attack Type'] = attack_type_element.text.strip()
            except Exception:
                hero_data['Attack Type'] = "Melee"  # Default to Melee if element not found
                print(f"Failed to get attack type {hero_url}: {e}")

            # Get hero hardness
            try:
                hardness_container = wait_with_retry(driver, 100, (By.CLASS_NAME, "_1k-L3kCa-AwQVlB-EScp_y"))
                driver.execute_script("arguments[0].scrollIntoView(true);", hardness_container)
                WebDriverWait(driver, 10).until(EC.visibility_of(hardness_container))  
                hardness_subclasses = hardness_container.find_elements(By.XPATH, "./div")
                base_class = hardness_subclasses[0].get_attribute("class")
                hardness = sum(1 for subclass in hardness_subclasses if subclass.get_attribute("class") == base_class)
                hero_data['Hardness Level'] = hardness
            except Exception:
                hero_data['Hardness Level'] = "N/A"
                print(f"Failed to get hardness level {hero_url}: {e}")
    
            
            # Scroll to and get role distribution
            try:
                roles_container = wait_with_retry(driver, 100, (By.CLASS_NAME, "_2muZpdcl1nAAKmMRxKMMAI"))
                driver.execute_script("arguments[0].scrollIntoView(true);", roles_container)
                
                # Wait for roles container to become visible
                WebDriverWait(driver, 10).until(
                    EC.visibility_of(roles_container)
                )

                role_elements = driver.find_elements(By.CLASS_NAME, "_3zWGygZT2aKUiOyokg4h1v")
                roles = {}
                for role_element in role_elements:
                    role_name = role_element.find_element(By.CLASS_NAME, "_3Fbk3tlFp8wcznxtXIx19W").text
                    percentage_style = role_element.find_element(By.CLASS_NAME, "f7kjDBQOuPqiwaCTUPzLJ").get_attribute("style")
                    percentage = percentage_style.split("width: ")[1].replace(";", "").strip()
                    roles[role_name] = percentage
                hero_data['Roles'] = roles
            except Exception as e:
                hero_data['Roles'] = "N/A"
                print(f"Failed to get roles for {hero_url}: {e}")


            # Count active skills
            try:
                active_skill_count = 0
                skill_type = wait_with_retry(driver, 100, (By.CLASS_NAME, "_1gAlzFjUFFNhRj02gbrPLW"))
                driver.execute_script("arguments[0].scrollIntoView(true);", skill_type)
                WebDriverWait(driver, 10).until(EC.visibility_of(skill_type))  
                if "Passive" not in skill_type.text:
                    active_skill_count += 1
                skill_parent_elements = driver.find_elements(By.CLASS_NAME, "_1vjw5Sik8Zewkj5_iOhCUb")  # Replace this class
                for skill_parent in skill_parent_elements:
                    sub_elements = skill_parent.find_elements(By.XPATH, "./div")
                    for sub in sub_elements:
                        if sub.get_attribute("class") == "_3Chop4A9yz7Af_BwR1r_NW _3SHwx10zrkJ8dzvQnkI2h8": 
                            driver.execute_script("arguments[0].click();", sub)
                            skill_type = driver.find_element(By.CLASS_NAME, "_1gAlzFjUFFNhRj02gbrPLW")  # Replace for skill type
                            if "Passive" not in skill_type.text:
                                active_skill_count += 1
                hero_data['Active Skills'] = active_skill_count
            except Exception as e:
                print(f"Fail to get active skills: {e} on {hero_url}")

            
            heroes_info[hero_name_text] = hero_data

        except Exception as e:
            print(f"Error1: {e} on {hero_url}")

        time.sleep(1)

    # Print the first hero data as an example
    if heroes_info:
        first_hero = next(iter(heroes_info))
        print(f"\nHero: {first_hero}")
        for key, value in heroes_info[first_hero].items():
            print(f"  {key}: {value}")

finally:
    driver.quit()


Hero: ABADDON
  Attribute: UNIVERSAL
  Attack Type: MELEE
  Hardness Level: 1
  Roles: {'Carry': '33.3%', 'Support': '66.6%', 'Nuker': '0%', 'Disabler': '0%', 'Jungler': '0%', 'Durable': '66.6%', 'Escape': '0%', 'Pusher': '0%', 'Initiator': '0%'}
  Active Skills: 3


In [23]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import os 
import requests

# Setup Selenium WebDriver
driver = webdriver.Chrome()  # Make sure you have ChromeDriver installed
image_dir = "data/hero_images"
os.makedirs(image_dir, exist_ok=True)

# Dictionary to store hero image URLs
hero_images = {}

# Scrape hero images
for hero_url in hero_links:
    try:
        driver.get(hero_url)
        time.sleep(2)  # Wait for the page to load completely
        soup = BeautifulSoup(driver.page_source, "html.parser")
        
        # Find the image tag for the hero
        image_tag = soup.find("img", class_="CR-BbB851VmrcN5s9HpGZ")  # Adjust the class name as needed
        if image_tag and "src" in image_tag.attrs:
            img_url = image_tag["src"]
            if img_url.startswith("//"):
                img_url = "https:" + img_url  # Add protocol if missing
            hero_name = hero_url.split("/")[-1]  # Extract hero name from the URL
            hero_images[hero_name] = img_url  # Store hero name and image URL
            #print(f"Found image for {hero_name}: {img_url}")
        else:
            print(f"No image found for {hero_url}")
    except Exception as e:
        print(f"Error processing {hero_url}: {e}")

# Download hero images
counter = 0
for hero_name, img_url in hero_images.items():
    try:
        img_data = requests.get(img_url).content
        with open(os.path.join(image_dir, f"{hero_name}.png"), "wb") as f:
            f.write(img_data)
        counter += 1
        #print(f"Downloaded {hero_name}.png")
    except Exception as e:
        print(f"Error downloading {img_url} for {hero_name}: {e}")

print("Image download complete! ", counter, " images downloaded")

# Close the Selenium driver
driver.quit()


Image download complete!  126  images downloaded


In [30]:
import requests
url = "https://www.dota2.com/datafeed/herolist?language=english"
response = requests.get(url)
""""
print(response.status_code)
print(response.json().get("result", {}))
"""

# Convert to dictionary where id is the key and name is the value
heroes_dict = {hero['id']: hero['name_loc'] for hero in response.json().get("result", {})['data']['heroes']}


In [31]:
# Transform heroes_info based on heroes_dict by matching IDs with names
transformed_heroes_info = {}

# Iterate through heroes_info and map hero_name to its id using heroes_dict
for hero_name, hero_data in heroes_info.items():
    # Find hero_id using the reverse lookup in heroes_dict
    hero_id = next((k for k, v in heroes_dict.items() if v.lower() == hero_name.lower()), None)
    if hero_id is not None:
        # Add the hero name to the dictionary values
        hero_data['Name'] = hero_name
        # Use hero_id as the key in the transformed dictionary
        transformed_heroes_info[hero_id] = hero_data

# Print the transformed dictionary to verify
#for hero_id, data in transformed_heroes_info.items():
#   print(f"\nHero ID: {hero_id}")
#    for key, value in data.items():
#        print(f"  {key}: {value}")




In [32]:
len(transformed_heroes_info)

126

In [4]:
role_totals = {
    'Carry': 0.0,
    'Support': 0.0,
    'Nuker': 0.0,
    'Disabler': 0.0,
    'Jungler': 0.0,
    'Durable': 0.0,
    'Escape': 0.0,
    'Pusher': 0.0,
    'Initiator': 0.0
}

# Iterate through the heroes_info dictionary
for hero, data in heroes_info.items():
    roles = data.get('Roles', {})
    if roles != "N/A":  # Skip heroes with no role data
        for role, percentage in roles.items():
            # Convert percentage to a float and add to the total
            role_totals[role] += float(percentage.replace('%', ''))

# Print the totals for each role
for role, total in role_totals.items():
    print(f"{role}_total = {total}%")

Carry_total = 4428.900000000002%
Support_total = 2897.100000000001%
Nuker_total = 4761.900000000004%
Disabler_total = 4595.400000000004%
Jungler_total = 0.0%
Durable_total = 2797.2%
Escape_total = 3130.2000000000003%
Pusher_total = 1864.7999999999993%
Initiator_total = 3363.2999999999993%


## Roles in Dota 2 Simplified for Analysis

There are three basic roles in these types of games: **Damage**, **Support**, and **Tank**.

In Dota 2, the official website lists nine roles, but for simplicity, we will neglect some of them and combine others:

### Roles to Neglect
1. **Jungler**: Neglected since it is 0% for all heroes, and since there is no related metric to evaluate this role.
3. **Escape**: Neglected since we don't get any information about it without watching the game.

### Roles to Combine
1. **Carry** and **Nuker**: Combined since they both contribute to the hero damage of the team.
2. **Support**, **Disabler** and **Pusher**: Combined because their basic importance is to support the team.
3. **Durable** and **Initiator**: Combined since they both contribute to the tankiness of the team.

### Final Simplified Roles
1. **Damage**  (combination of 'Carry' and 'Nuker')
2. **Support** (combination of 'Support' and 'Disabler')  
3. **Tank** (combination of 'Durable' and 'Initiator')





In [33]:
import pandas as pd

# Function to safely convert percentage strings to float
def sanitize_percentage(value):
    try:
        # Remove '%' and any surrounding whitespace, then convert to float
        return float(value.replace('%', '').strip())
    except ValueError:
        # Return None for invalid values and log an error
        print(f"Invalid percentage value: '{value}'")
        return None

# Transform the dictionary into a flat structure
rows = []
for hero_id, hero_data in transformed_heroes_info.items():
    row = {
        'Hero ID': hero_id,
        'Name': hero_data['Name'],
        'Attribute': hero_data['Attribute'],
        'Attack Type': hero_data['Attack Type'],
        'Hardness Level': hero_data['Hardness Level'],
        'Active Skills': hero_data['Active Skills'],
    }
    # Add roles as individual columns
    total_val = 0.0
    roles_adjusted = {
            "Damage": 0,
            "Support": 0,
            "Tank": 0
        }
    for role, percentage in hero_data['Roles'].items():
        percentage_value = sanitize_percentage(percentage)
        #print(percentage_value)
        if role == "Carry" or role == "Nuker":
            roles_adjusted["Damage"] += percentage_value 
            total_val += percentage_value 
            #print(total_val)
        elif role == "Support" or role == "Disabler" or role == "Pusher":
            roles_adjusted["Support"] += percentage_value 
            total_val += percentage_value 
            #print(total_val)
        elif role == "Durable" or role == "Initiator":
            roles_adjusted["Tank"] += percentage_value 
            total_val += percentage_value 
            #print(total_val)
    #print(total_val)
    #print(hero_data['Name'])
    for key, val in roles_adjusted.items():
        roles_adjusted[key] = val / total_val

    combined_details = {**row, **roles_adjusted}  # Merge the two dictionaries
    rows.append(combined_details)

# Create a DataFrame
df = pd.DataFrame(rows)

# Save the DataFrame to a CSV file
output_dir = "data/"
os.makedirs(output_dir, exist_ok=True)
output_csv = os.path.join(output_dir, "heroes_info.csv")
df.to_csv(output_csv, index=False)

# Print the DataFrame to verify
print(df)

# Inform the user about the location of the saved file
print(f"\nCSV file saved to: {output_csv}")

     Hero ID                Name     Attribute Attack Type  Hardness Level  \
0        102             ABADDON     UNIVERSAL       MELEE               1   
1         73           ALCHEMIST      STRENGTH       MELEE               1   
2         68  ANCIENT APPARITION  INTELLIGENCE      RANGED               2   
3          1           ANTI-MAGE       AGILITY       MELEE               1   
4        113          ARC WARDEN       AGILITY      RANGED               3   
..       ...                 ...           ...         ...             ...   
121       21          WINDRANGER     UNIVERSAL      RANGED               2   
122      112       WINTER WYVERN     UNIVERSAL      RANGED               2   
123       30        WITCH DOCTOR  INTELLIGENCE      RANGED               1   
124       42         WRAITH KING      STRENGTH       MELEE               1   
125       22                ZEUS  INTELLIGENCE      RANGED               1   

     Active Skills    Damage   Support      Tank  
0           

### FETCHING 150 MACTHES PER HERO
- We do this in order create a average performance value for all heroes


In [48]:
from constants import API_KEY

In [None]:
import requests
import time
import json

# Initialize constants
ENDPOINT = "https://api.steampowered.com/IDOTA2Match_570/GetMatchHistoryBySequenceNum/v1/"
INITIAL_SEQ_NUM = 6794200794  # Starting sequence number, random value which makes sure all heroes are available (not that old sequence number)
MAX_HEROES = 126  # Total number of heroes
MAX_MATCHES_PER_HERO = 150  # Number of matches needed per hero
EARLIER_OFFSET = 100000  # Offset to go back in sequence numbers when no matches are found
DELAY_BETWEEN_REQUESTS = 2  # Delay between API calls in seconds

# Initialize hero match dictionary using transformed_heroes_info keys
hero_matches = {hero_id: [] for hero_id in transformed_heroes_info.keys()}

def fetch_matches(seq_num):
    """Fetch matches starting from a specific sequence number."""
    params = {
        "key": API_KEY,
        "start_at_match_seq_num": seq_num,
        "matches_requested": 100  # Fetch 100 matches at a time
    }
    response = requests.get(ENDPOINT, params=params)
    if response.status_code == 200:
        matches = response.json().get('result', {}).get('matches', [])
        valid_matches = []
        for match in matches:
            # Skip short matches
            if match.get('duration', 0) > 900: # we don't want matches with duration less than or equal to 15 minutes.
                valid_matches.append(match)
        return valid_matches        
    elif response.status_code == 429:
        #print("Rate limit hit. Waiting before retrying...")
        time.sleep(10)  # Wait before retrying
        return []
    else:
        #print(f"Error fetching matches: {response.status_code}")
        return []

def process_matches(matches, hero_matches, matches_to_fetch):
    """Process fetched matches and populate hero_matches dictionary."""
    for match in matches:
        match_seq_num = match.get('match_seq_num')
        players = match.get('players', [])
        for player in players:
            hero_id = player.get('hero_id')
            if hero_id and hero_id in hero_matches:
                if len(hero_matches[hero_id]) < MAX_MATCHES_PER_HERO:
                    hero_matches[hero_id].append(match_seq_num)
                    if match_seq_num not in matches_to_fetch:
                        matches_to_fetch.append(match_seq_num)

def all_heroes_collected(hero_matches):
    """Check if all heroes have the required number of matches."""
    return all(len(matches) >= MAX_MATCHES_PER_HERO for matches in hero_matches.values())

matches_to_fetch = []
# Start fetching matches
current_seq_num = INITIAL_SEQ_NUM
while not all_heroes_collected(hero_matches):
    #print(f"Fetching matches starting at sequence number: {current_seq_num}")
    matches = fetch_matches(current_seq_num)
    
    if not matches:  # No matches found
        #print("No more matches to fetch. Moving to an earlier sequence number.")
        current_seq_num -= EARLIER_OFFSET  # Go back in history
        continue

    # Process matches
    process_matches(matches, hero_matches, matches_to_fetch)

    # Update sequence number to fetch the next batch
    current_seq_num = matches[-1].get('match_seq_num') + 1 if matches else current_seq_num

    # Add delay to avoid hitting API rate limits
    #print(f"Waiting {DELAY_BETWEEN_REQUESTS} seconds before the next request...")
    time.sleep(DELAY_BETWEEN_REQUESTS)

# Print results
for hero_id, match_seq_nums in hero_matches.items():
    print(f"Hero ID {hero_id}: {len(match_seq_nums)} matches collected")

# Save hero_matches dictionary to a JSON file
output_dir = "data/raw_data"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "hero_matches.json")

with open(output_file, "w") as f:
    json.dump(hero_matches, f, indent=4)

print(f"Hero matches have been saved to {output_file}")



Fetching matches starting at sequence number: 6794200794
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794200932
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201061
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201190
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201310
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201434
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201573
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201701
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201849
Waiting 2 seconds before the next request...
Fetching matches starting at sequence number: 6794201985
Waiting 2 seconds before 

In [63]:
# Initialize an empty list to store all sequence numbers
my_list = []

# Iterate over hero_matches dictionary
for hero_id, seq_nums in hero_matches.items():
    # Extend my_list with the sequence numbers for the current hero
    my_list.extend(seq_nums)

# Remove duplicates if needed
my_list = list(set(my_list))

# Print the collected sequence numbers
print(f"Collected {len(my_list)} unique sequence numbers.")

print(len(matches_to_fetch))


Collected 4690 unique sequence numbers.
4690


In [65]:
import requests
import json
import os

# Constants
BASE_URL = "https://api.steampowered.com/IDOTA2Match_570/GetMatchHistoryBySequenceNum/v1"
RAW_DATA_DIR = os.path.join("data/raw_data/heroes_matches_json")  # Directory to store JSON files
BATCH_SIZE = 100          # Matches per request
MATCHES_PER_FILE = 200     # Matches per JSON file

# Ensure raw_data directory exists
os.makedirs(RAW_DATA_DIR, exist_ok=True)

def fetch_match_by_sequence(sequence_num, api_key):
    """Fetch match data for a specific sequence number."""
    params = {
        "key": api_key,
        "start_at_match_seq_num": sequence_num,
        "matches_requested": 1  # Fetch one match
    }
    response = requests.get(BASE_URL, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching match {sequence_num}: {response.status_code}")
        return None

def save_batch_to_file(data_batch, file_index):
    """Save a batch of match data to a single JSON file."""
    file_name = os.path.join(RAW_DATA_DIR, f"heroes_matches_batch_{file_index}.json")
    with open(file_name, 'w') as f:
        json.dump(data_batch, f, indent=4)
    print(f"Saved {len(data_batch)} matches to {file_name}")

def main():
    current_batch = []
    file_index = 1

    for i, seq_num in enumerate(matches_to_fetch):
        #print(f"Fetching match for sequence number: {seq_num}")
        match_data = fetch_match_by_sequence(seq_num, API_KEY)
        
        if match_data and "result" in match_data and "matches" in match_data["result"]:
            current_batch.append(match_data["result"]["matches"][0])

        # Save the batch if it reaches the limit
        if len(current_batch) >= MATCHES_PER_FILE:
            save_batch_to_file(current_batch, file_index)
            current_batch = []  # Reset the batch
            file_index += 1

    # Save any remaining matches
    if current_batch:
        save_batch_to_file(current_batch, file_index)

if __name__ == "__main__":
    main()

Fetching match for sequence number: 6794200794
Fetching match for sequence number: 6794200795
Fetching match for sequence number: 6794200797
Fetching match for sequence number: 6794200799
Fetching match for sequence number: 6794200800
Fetching match for sequence number: 6794200801
Fetching match for sequence number: 6794200805
Fetching match for sequence number: 6794200806
Fetching match for sequence number: 6794200807
Fetching match for sequence number: 6794200808
Fetching match for sequence number: 6794200809
Fetching match for sequence number: 6794200810
Fetching match for sequence number: 6794200811
Fetching match for sequence number: 6794200812
Fetching match for sequence number: 6794200813
Fetching match for sequence number: 6794200814
Fetching match for sequence number: 6794200815
Fetching match for sequence number: 6794200816
Fetching match for sequence number: 6794200817
Fetching match for sequence number: 6794200819
Fetching match for sequence number: 6794200821
Fetching matc

In [None]:
import os
import json
import pandas as pd
import csv

carry_weights = {"W_k": 0.6, "W_a": 0.2, "W_d": 0.2}
support_weights = {"W_k": 0.3, "W_a": 0.6, "W_d": 0.1}
tank_weights = {"W_k": 0.4, "W_a": 0.5, "W_d": 0.1}


# Function to calculate role-specific performance
def calculate_role_performance(kills, assists, deaths, team_score, opponent_score, weights):
    return (
        (kills / team_score) * weights["W_k"] +
        (assists / team_score) * weights["W_a"] -
        (deaths / opponent_score) * weights["W_d"]
    )

hero_roles = {}
csv_file = "data/heroes_info.csv"
with open(csv_file, "r") as infile:
    reader = csv.DictReader(infile)
    for row in reader:
        roles = {
            "Damage": float(row["Damage"]),
            "Support": float(row["Support"]),
            "Tank": float(row["Tank"])
        }
        hero_roles[row["Hero ID"]] = roles
        
acc = []
# Constants
RAW_DATA_DIR = "data/raw_data/heroes_matches_json"  # Directory containing JSON files
CSV_OUTPUT_DIR = "data/parsed_data/heroes_matches_csv"    # Directory for output CSV files
#ACCOUNT_ID = 342924465         # Replace with your actual account_id

os.makedirs(CSV_OUTPUT_DIR, exist_ok=True)

def parse_json_to_csv(json_file, csv_file):
    """Parse matches from JSON and save relevant data to a CSV."""
    data = []  # Store the final player-level data
    
    # Read the JSON file
    with open(json_file, 'r') as f:
        matches = json.load(f)
    
    # Validate JSON format
    if not isinstance(matches, list):
        print(f"Invalid JSON format in {json_file}")
        return

    # Iterate through each match
    for match in matches:
        # Skip short matches
        if match.get("duration", 0) <= 900: # we don't want matches with duration less than or equal to 15 minutes.
            continue


        # Match-level details
        match_details = {
            "start_time": match.get("start_time"),
            "duration": match.get("duration"),
            "radiant_score": match.get("radiant_score"),
            "dire_score": match.get("dire_score"),
            "game_mode": match.get("game_mode"),
        }


        # Check if players exist
        players = match.get("players", [])
        if not players:
            continue
        match_seq_num = match.get("match_seq_num")
        # Find your player data
        team_num = 0 # just to initialiaze
        
        radiant_player_details = {}
        dire_player_details = {}
        counter_rad = 0    
        counter_dire = 0
        for player in players:
            team_num = player.get("team_number")
            kills = player.get("kills", 0)
            hero_id = player.get("hero_id", 0)
            deaths = player.get("deaths", 0)
            assists = player.get("assists", 0)
            radiant_score = match.get("radiant_score", 1)  # Prevent division by zero
            dire_score = match.get("dire_score", 1)
            team_score = match["radiant_score"] if team_num == 0 else match["dire_score"]
            opponent_score = match["dire_score"] if team_num == 0 else match["radiant_score"]
            level = player.get("level", 1)
            net_worth = player.get("net_worth", 0)
            tower_damage = player.get("tower_damage", 0)
            hero_damage = player.get("hero_damage", 0)

            # Calculate role-specific performances
            carry_performance = calculate_role_performance(kills, assists, deaths, team_score, opponent_score, carry_weights)
            support_performance = calculate_role_performance(kills, assists, deaths, team_score, opponent_score, support_weights)
            tank_performance = calculate_role_performance(kills, assists, deaths, team_score, opponent_score, tank_weights)

            # Calculate normalized performance
            normalized_performance = round((
                hero_roles[str(hero_id)]["Damage"] * carry_performance +
                hero_roles[str(hero_id)]["Support"] * support_performance +
                hero_roles[str(hero_id)]["Tank"] * tank_performance), 
                6
            )

            # Calculate performance and player_win
            performance = round(
                ((kills + assists) / (radiant_score if team_num == 0 else dire_score)) - 
                (deaths / (dire_score if team_num == 0 else radiant_score)), 
                4
            )

            player_win = int(
                (match.get("radiant_win") and player.get("team_number") == 0) or
                (not match.get("radiant_win") and player.get("team_number") == 1)
            )
            if team_num == 0: # these players are allies, we add their hero_ids one by one right after my_player_details(applied in csv file) A1, A2 ... A4
                counter_rad += 1
                # Add to allies with keys like A1, A2, ..., A4
                radiant_player_details[f"A{counter_rad}_hero_id"] = player.get("hero_id")
                radiant_player_details[f"A{counter_rad}_kills"] = player.get("kills")
                radiant_player_details[f"A{counter_rad}_deaths"] = player.get("deaths")
                radiant_player_details[f"A{counter_rad}_assists"] = player.get("assists")
                radiant_player_details[f"A{counter_rad}_gold_per_min"] = player.get("gold_per_min")
                radiant_player_details[f"A{counter_rad}_xp_per_min"] = player.get("xp_per_min")
                radiant_player_details[f"A{counter_rad}_win"] = player_win
                radiant_player_details[f"A{counter_rad}_performance"] = performance
                radiant_player_details[f"A{counter_rad}_normalized_performance"] = normalized_performance
                radiant_player_details[f"A{counter_rad}_level"] = level
                radiant_player_details[f"A{counter_rad}_net_worth"] = net_worth
                radiant_player_details[f"A{counter_rad}_team_score"] = team_score
                radiant_player_details[f"A{counter_rad}_opponent_score"] = opponent_score
                radiant_player_details[f"A{counter_rad}_tower_damage"] = tower_damage
                radiant_player_details[f"A{counter_rad}_hero_damage"] = hero_damage


            elif team_num == 1: #these players are enemies, we add their hero_ids one by one right after my hero_id(applied in csv file) E1, E2 ... E5
                counter_dire += 1
                # Add to enemies with keys like E1, E2, ..., E5
                dire_player_details[f"E{counter_dire}_hero_id"] = player.get("hero_id")
                dire_player_details[f"E{counter_dire}_kills"] = player.get("kills")
                dire_player_details[f"E{counter_dire}_deaths"] = player.get("deaths")
                dire_player_details[f"E{counter_dire}_assists"] = player.get("assists")
                dire_player_details[f"E{counter_dire}_gold_per_min"] = player.get("gold_per_min")
                dire_player_details[f"E{counter_dire}_xp_per_min"] = player.get("xp_per_min")
                dire_player_details[f"E{counter_dire}_win"] = player_win
                dire_player_details[f"E{counter_dire}_performance"] = performance
                dire_player_details[f"E{counter_dire}_normalized_performance"] = normalized_performance
                dire_player_details[f"E{counter_dire}_level"] = level
                dire_player_details[f"E{counter_dire}_net_worth"] = net_worth
                dire_player_details[f"E{counter_dire}_team_score"] = team_score
                dire_player_details[f"E{counter_dire}_opponent_score"] = opponent_score
                dire_player_details[f"E{counter_dire}_tower_damage"] = tower_damage
                dire_player_details[f"E{counter_dire}_hero_damage"] = hero_damage
        
        combined_details = {**match_details, **radiant_player_details, ** dire_player_details}
        data.append(combined_details)
        
    # Create a DataFrame and save as CSV
    if data:
        df = pd.DataFrame(data)
        #print(df.shape)
        df.to_csv(csv_file, index=False)
        print(f"Saved matches with duration longer than 15 minutes, from {json_file} to {csv_file}")
    else:
        print(f"No data found in {json_file}")

def main():
    # Iterate through all JSON files in the raw data directory
    for json_file in os.listdir(RAW_DATA_DIR):
        if json_file.endswith(".json"):
            json_path = os.path.join(RAW_DATA_DIR, json_file)
            csv_path = os.path.join(CSV_OUTPUT_DIR, json_file.replace(".json", ".csv"))
            parse_json_to_csv(json_path, csv_path)

if __name__ == "__main__":
    main()


(200, 155)
Saved matches from data/raw_data/heroes_matches_json/heroes_matches_batch_20.json to data/parsed_data/heroes_matches_csv/heroes_matches_batch_20.csv
(200, 155)
Saved matches from data/raw_data/heroes_matches_json/heroes_matches_batch_16.json to data/parsed_data/heroes_matches_csv/heroes_matches_batch_16.csv
(200, 155)
Saved matches from data/raw_data/heroes_matches_json/heroes_matches_batch_1.json to data/parsed_data/heroes_matches_csv/heroes_matches_batch_1.csv
(200, 155)
Saved matches from data/raw_data/heroes_matches_json/heroes_matches_batch_17.json to data/parsed_data/heroes_matches_csv/heroes_matches_batch_17.csv
(200, 155)
Saved matches from data/raw_data/heroes_matches_json/heroes_matches_batch_21.json to data/parsed_data/heroes_matches_csv/heroes_matches_batch_21.csv
(200, 155)
Saved matches from data/raw_data/heroes_matches_json/heroes_matches_batch_10.json to data/parsed_data/heroes_matches_csv/heroes_matches_batch_10.csv
(200, 155)
Saved matches from data/raw_dat

In [27]:
import csv
import os
import glob

# Constants
INPUT_DIR = "data/parsed_data/heroes_matches_csv"  # Directory containing the CSV files
OUTPUT_CSV = "data/parsed_data/hero_average_performance.csv"  # Output CSV file
MAX_MATCHES_PER_HERO = 75  # Limit of matches per hero

# Initialize a dictionary to store hero performances
hero_performance = {}

# Iterate through all CSV files in the directory
csv_files = glob.glob(os.path.join(INPUT_DIR, "*.csv"))
for csv_file in csv_files:
    print(f"Processing file: {csv_file}")
    with open(csv_file, "r") as infile:
        reader = csv.DictReader(infile)
        
        # Process each row in the current CSV file
        for row in reader:
            for i in range(1, 6):  # Loop through E1, E2, ..., E5
                hero_id_key = f"E{i}_hero_id"
                performance_key = f"E{i}_normalized_performance"

                hero_id_key2 = f"A{i}_hero_id"
                performance_key2 = f"A{i}_normalized_performance"
                
                # Get the hero ID and performance
                hero_id = int(row[hero_id_key])
                performance = float(row[performance_key])

                hero_id2 = int(row[hero_id_key2])
                performance2 = float(row[performance_key2])
                
                # Initialize the list for the hero if not present
                if hero_id not in hero_performance:
                    hero_performance[hero_id] = []
                
                if hero_id2 not in hero_performance:
                    hero_performance[hero_id2] = []

                # Add the performance if the limit is not reached
                if len(hero_performance[hero_id]) < MAX_MATCHES_PER_HERO:
                    hero_performance[hero_id].append(performance)

                # Add the performance if the limit is not reached
                if len(hero_performance[hero_id2]) < MAX_MATCHES_PER_HERO:
                    hero_performance[hero_id2].append(performance2)

# Print the number of matches for each hero_id
heroes_with_not_enough_matches = {} # to check if there is a hero with less than 150 matches
for hero_id, performances in hero_performance.items():
    if len(performances) != MAX_MATCHES_PER_HERO:
        print(f"Hero ID {hero_id}: {len(performances)} matches")
        heroes_with_not_enough_matches[hero_id] = len(performances)


# Calculate the average performance for each hero
hero_average_normalized_performance = {
    hero_id: round(sum(performances) / len(performances), 5)
    for hero_id, performances in hero_performance.items()
    if performances  # Exclude heroes with no performance data
}

# Write the hero average performances to the output CSV
with open(OUTPUT_CSV, "w", newline="") as outfile:
    writer = csv.writer(outfile)
    # Write header
    writer.writerow(["Hero ID", "average_normalized_performance"])
    # Write hero performances
    for hero_id, avg_performance in hero_average_normalized_performance.items():
        writer.writerow([hero_id, avg_performance])

print(f"Hero average performances have been written to {OUTPUT_CSV}")


Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_24.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_1.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_18.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_19.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_2.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_3.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_7.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_22.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_23.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_6.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_4.csv
Processing file: data/parsed_data/heroes_matches_csv/heroes_matches_batch_21.csv
Processing file: data/parsed_data/

In [28]:
# Print the number of matches for each hero_id
heroes_with_not_enough_matches = {}
for hero_id, performances in hero_performance.items():
    if len(performances) != MAX_MATCHES_PER_HERO:
        print(f"Hero ID {hero_id}: {len(performances)} matches")
        heroes_with_not_enough_matches[hero_id] = len(performances)

In [29]:
# Create a dictionary to store the average performance for each hero
average_performance = {}

for hero_id, performances in hero_performance.items():
    if performances:  # Ensure there are performances for the hero
        avg_performance = sum(performances) / len(performances)
        average_performance[hero_id] = avg_performance

# Sort the heroes by their average performance in descending order
sorted_heroes = sorted(average_performance.items(), key=lambda x: x[1], reverse=True)

# Print the top 10 heroes with the best performances
print("Top 10 Heroes with Best Performances:")
for rank, (hero_id, avg_perf) in enumerate(sorted_heroes[:10], start=1):
    print(f"{rank}. Hero ID {hero_id}: Average Performance = {round(avg_perf, 5)}")


Top 10 Heroes with Best Performances:
1. Hero ID 37: Average Performance = 0.29489
2. Hero ID 55: Average Performance = 0.28208
3. Hero ID 87: Average Performance = 0.27725
4. Hero ID 137: Average Performance = 0.26885
5. Hero ID 40: Average Performance = 0.26858
6. Hero ID 91: Average Performance = 0.26624
7. Hero ID 85: Average Performance = 0.26469
8. Hero ID 33: Average Performance = 0.26323
9. Hero ID 71: Average Performance = 0.26225
10. Hero ID 51: Average Performance = 0.26182


## Creation of csv containing all relevant information about all heroes

In [7]:
import pandas as pd

# Paths to the CSV files
heroes_info_path = "data/heroes_info.csv"  # Path to heroes info CSV
heroes_performances_path = "data/hero_average_performance.csv"  # Path to heroes performances CSV

# Load the data into dataframes
heroes_info_df = pd.read_csv(heroes_info_path)
heroes_performances_df = pd.read_csv(heroes_performances_path)

# Merge the dataframes on 'Hero ID'
merged_df = pd.merge(heroes_performances_df, heroes_info_df, on="Hero ID", how="inner")

# Display the merged dataframe
print(merged_df.head())

# Save the merged dataframe to a CSV file if needed
output_path = "data/merged_heroes_data.csv"
merged_df.to_csv(output_path, index=False)

print(f"Merged data saved to {output_path}")

   Hero ID  average_performance         Name     Attribute Attack Type  \
0       34              0.37319       TINKER  INTELLIGENCE      RANGED   
1       84              0.38533    OGRE MAGI      STRENGTH       MELEE   
2      111              0.38417       ORACLE  INTELLIGENCE      RANGED   
3       42              0.36067  WRAITH KING      STRENGTH       MELEE   
4       72              0.33020   GYROCOPTER       AGILITY      RANGED   

   Hardness Level  Active Skills    Damage   Support      Tank  
0               2              4  0.666667  0.333333  0.000000  
1               1              3  0.250000  0.500000  0.250000  
2               3              4  0.375000  0.625000  0.000000  
3               1              3  0.222222  0.333333  0.444444  
4               1              4  0.800000  0.200000  0.000000  
Merged data saved to data/merged_heroes_data.csv


## Checking the performance metric

In [14]:
import pandas as pd

# Example hero data
hero_data = {
    "Hero ID": [1, 2, 3, 4, 5, 6, 7],
    "Name": ["Sniper", "Chaos Knight", "Crystal Maiden", "Pudge", "Invoker", "Anti-Mage", "Bristleback"],
    "Kills": [15, 8, 3, 12, 10, 20, 6],
    "Deaths": [5, 12, 8, 10, 6, 3, 8],
    "Assists": [8, 14, 20, 15, 12, 5, 18],
    "Team Score": [50, 45, 50, 60, 55, 50, 65],
    "Opponent Score": [48, 52, 46, 58, 50, 40, 62],
    "Carry": [1.0, 0.7, 0.0, 0.3, 0.5, 0.9, 0.4],
    "Support": [0.0, 0.0, 0.9, 0.3, 0.3, 0.0, 0.1],
    "Tank": [0.0, 0.3, 0.1, 0.4, 0.2, 0.1, 0.5],
}


# Convert to DataFrame
df = pd.DataFrame(hero_data)

# Role weights (normalized to sum to 1)
carry_weights = {"W_k": 0.6, "W_a": 0.2, "W_d": 0.2}
support_weights = {"W_k": 0.3, "W_a": 0.6, "W_d": 0.1}
tank_weights = {"W_k": 0.4, "W_a": 0.5, "W_d": 0.1}

# Calculate performance for each role
df["Carry Performance"] = (
    (df["Kills"] / df["Team Score"]) * carry_weights["W_k"] +
    (df["Assists"] / df["Team Score"]) * carry_weights["W_a"] -
    (df["Deaths"] / df["Opponent Score"]) * carry_weights["W_d"]
)
df["Support Performance"] = (
    (df["Kills"] / df["Team Score"]) * support_weights["W_k"] +
    (df["Assists"] / df["Team Score"]) * support_weights["W_a"] -
    (df["Deaths"] / df["Opponent Score"]) * support_weights["W_d"]
)
df["Tank Performance"] = (
    (df["Kills"] / df["Team Score"]) * tank_weights["W_k"] +
    (df["Assists"] / df["Team Score"]) * tank_weights["W_a"] -
    (df["Deaths"] / df["Opponent Score"]) * tank_weights["W_d"]
)

df["general performance"] = (
    (df["Kills"] / df["Team Score"])+
    (df["Assists"] / df["Team Score"]) -
    (df["Deaths"] / df["Opponent Score"])
)
# Calculate final performance
df["Normalized Performance"] = (
    df["Carry"] * df["Carry Performance"] +
    df["Support"] * df["Support Performance"] +
    df["Tank"] * df["Tank Performance"]
)

# Display results
print(df[["Hero ID", "Name", "Normalized Performance", "general performance"]])


   Hero ID            Name  Normalized Performance  general performance
0        1          Sniper                0.191167             0.355833
1        2    Chaos Knight                0.146991             0.258120
2        3  Crystal Maiden                0.237209             0.286087
3        4           Pudge                0.173586             0.277586
4        5         Invoker                0.150364             0.280000
5        6       Anti-Mage                0.240750             0.425000
6        7     Bristleback                0.133320             0.240199
