In [1]:
#import 

import pandas as pd
import numpy as np
import requests
import json
import bs4
import time
import os
from pathlib import Path

# You will need your own API key from opendota 
# and a config.py file that sets the variable api_key to your key
from config import api_key

In [2]:
# display full dataframe
pd.set_option('display.max_columns', None)

#### Pulling season info

Basic match data is found on ld2l.gg/seasons/##/matches. My outline here will use season 37 for prototyping. 

Using BeautifulSoup (BS) (https://www.crummy.com/software/BeautifulSoup/bs4/doc/) the ld2l matches page is parsed to get the ld2l match id. 
The ld2l match id does not match the dota match id

A cache file is created, unless it already exists, to avoid re-parsing saved info.

Note that seasons on the website do not relate to the ticket or season that would be listed in Dota/OpenDota api.
This method also reduces adding in matches that ticket holders use for scrims or other reasons that aren't official games.
One limitation is that this method can't pull unticketed data, even if entered in completly on the ld2l website.



In [3]:
#set ld2l season webpage

season = 42

url = f'https://ld2l.gg/seasons/{season}/matches'


soup = bs4.BeautifulSoup(requests.get(url).text, 'html.parser')
matches = []

# This is a directory string to help manage the data of different seasons
save_dir = f'match_data_{season}'



In [4]:
# appends links to matches list

for a in soup.find_all('a', href=True):
    # Finds ld2l match links and appends them to a list
    if 'match' in a['href'] and 'season' not in a['href']:
        matches.append('https://ld2l.gg' + a['href'])

#sort matches by ID
matches.sort(key=lambda x: int(x.split('/')[-1]))

print(len(matches))

96


In [5]:
# create a folder to store match data if it doesn't exist
save_dir_path = Path(save_dir)
save_dir_path.mkdir(parents=True, exist_ok=True)

In [6]:
# matches text file location to variable
matches_file = f'{save_dir}/matches_{season}.txt'
print(matches_file)

match_data_42/matches_42.txt


In [7]:
# create a matches text file to store match IDs if it doesn't exist via touch command

matches_file_path = Path(matches_file)
matches_file_path.touch(exist_ok=True)

In [8]:
# write a file to save week number via touch command
if not os.path.exists(f'{save_dir}/week.txt'):
    os.system(f'touch {save_dir}/week.txt')

In [9]:
print(f'Found {len(matches)} matches in season {season} \nfile location: {matches_file}')

Found 96 matches in season 42 
file location: match_data_42/matches_42.txt


#### Converting to OpenDota links

After gathering the match data, each match page is opened via BS. From here the Match ID is extracted from the 
OpenDota link and a correctly formatted OpenDota API link is added to a list.

This section skips over matches that have been parsed already by checking the matches.txt file created in the last secion

In [10]:
# below code writes match IDs to file if they are not already in the file

with open(matches_file) as f:
    data = f.read()

for match in matches:
    #check if match is already in file matches.txt to prevent re-scraping and angry butterygreg
    if match not in data:
        #write match to file
        with open(matches_file, 'a') as f:
            f.write(match + '\n')


In [11]:
# add openDota link for each match

#read matches.txt into list
with open(matches_file) as f:
    data = f.read().split('\n')
    #remove empty string from list
    if data[-1] == '':
        data.pop()

# parse each match link via bs to get openDota link

opendota = []


for match in data:
    soup = bs4.BeautifulSoup(requests.get(match).text, 'html.parser')
    for a in soup.find_all('a', href=True):
        if 'opendota' in a['href'] and 'matches/0' not in a['href']:
            match_link = a['href']
            opendota.append(match_link)
            break


# convert opendota links to API links

opendota_api = []

for match in opendota:
    match_id = match.split('/')[-1]
    api_link = f'https://api.opendota.com/api/matches/{match_id}?api_key={api_key}'
    opendota_api.append(api_link)

In [12]:
print(open)

<function open at 0x000001627FA93A60>


#### Pulling OpenDota Jsons

In [13]:
# hold list of file names
file_names = []

for files in os.listdir(save_dir):
    if files.endswith('.json'):
        file_names.append(files)

for match in opendota_api:

    # get match id
    match_id = match.split('/')[-1].split('?')[0]

    file_path = f'{save_dir}/match_{match_id}.json'

    #check if file already exists
    if not os.path.isfile(f'{file_path}'):
        # get json of match and save to json file
        # request parse of match data
        # TODO: create a function to handle this
                      
        match_json = requests.get(match).json()
        with open(f'{save_dir}/match_{match_id}.json', 'w') as f:
            json.dump(match_json, f)
            file_names.append(f'match_{match_id}.json')

#### DataFrame formatting and basic cleaning

Below a blank dataframe is created with the selected features from the players section in the read json files. 
As with earlier sections, if a cached match_data.csv exists, new items will be concatenated instead of a new creations, saving time and resources.

#### If the data is not been parsed on opendota by requesting, the below will not function properly

In [14]:
# create an empty dataframe to hold all match data if it doesn't exist

# csv file location to variable

csv_file = f'{save_dir}/match_data_{season}.csv'

if not os.path.exists(csv_file):
    match_data = pd.DataFrame(columns=['match_id', 'date', 'account_id', 'personaname', 'teamID', 'rank_tier', 'kills', 'assists',
       'deaths', 'kills_per_min', 'kda', 'denies', 'gold', 'gold_per_min', 'gold_spent', 'hero_damage', 'damage_taken',
       'hero_healing', 'hero_id', 'item_0', 'item_1', 'item_2', 'item_3',
       'item_4', 'item_5', 'item_neutral', 'last_hits', 'level',
       'net_worth', 'tower_damage', 'xp_per_min', 'radiant_win',
       'duration', 'patch', 'isRadiant', 'win', 'lose',
       'total_gold', 'total_xp', 'obs_placed', 'sen_placed', 'rune_pickups', 'camps_stacked', 'stuns', 'creeps_stacked',
       'firstblood_claimed', 'pings', 'teamfight_participation', 'roshans_killed'])
    match_data.to_csv(csv_file, index=False)
else:
    match_data = pd.read_csv(csv_file, index_col=None)

In [15]:
match_data

Unnamed: 0,match_id,date,account_id,personaname,teamID,rank_tier,kills,assists,deaths,kills_per_min,kda,denies,gold,gold_per_min,gold_spent,hero_damage,damage_taken,hero_healing,hero_id,item_0,item_1,item_2,item_3,item_4,item_5,item_neutral,last_hits,level,net_worth,tower_damage,xp_per_min,radiant_win,duration,patch,isRadiant,win,lose,total_gold,total_xp,obs_placed,sen_placed,rune_pickups,camps_stacked,stuns,creeps_stacked,firstblood_claimed,pings,teamfight_participation,roshans_killed
0,7527115816,2024-01-07,1724857,Kenny Lavender,9302527,53,4,5,5,0.120482,1.50,14,406,471,14270,23440,12669,0,Sniper,63,116,174,75,598,236,2096,208,18,13486,35,571,False,1992,54,True,0,1,15637,18957,2,2,2,3,0.133262,4,0,29.0,0.750000,0
1,7527115816,2024-01-07,1111506,LINE,9302527,21,2,4,9,0.060241,0.60,4,212,355,11280,6290,22794,0,Anti-Mage,145,36,22,1122,569,63,574,181,16,10212,1246,426,False,1992,54,True,0,1,11786,14143,0,0,0,0,4.566504,0,0,1.0,0.500000,0
2,7527115816,2024-01-07,87164528,-GoldenDragon,9302527,34,1,9,10,0.030120,0.91,2,747,299,9455,11425,22145,0,Underlord,0,90,29,190,36,0,376,89,15,8747,568,374,False,1992,54,True,0,1,9926,12416,0,0,5,0,36.633762,0,0,1.0,0.833333,0
3,7527115816,2024-01-07,41736499,pehches,9302527,23,1,8,8,0.030120,1.00,2,1100,251,6540,9365,11915,93,Lich,229,180,188,218,69,0,375,28,13,8100,40,317,False,1992,54,True,0,1,8333,10524,8,13,2,0,18.033594,0,1,4.0,0.666667,0
4,7527115816,2024-01-07,341676433,roxyroro,9302527,53,3,4,6,0.090361,1.00,1,888,313,9590,13739,17834,0,Rubick,36,180,232,108,0,20,680,82,14,9368,171,355,False,1992,54,True,0,1,10391,11786,4,8,3,1,33.066803,2,0,1.0,0.583333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
755,7608154878,2024-02-25,1202456305,Vita Sociale,9298317,41,1,16,9,0.021552,1.70,0,3396,284,10090,12233,20621,125,Vengeful Spirit,180,229,21,0,254,36,675,59,21,12446,239,530,False,2784,54,False,1,0,13177,24592,11,20,2,0,51.933750,0,0,28.0,0.485714,0
756,7608154878,2024-02-25,254589017,CorruptCrimson,9298317,64,18,11,4,0.387931,5.80,4,12058,843,27765,59968,35338,0,Phantom Lancer,250,147,174,114,63,108,1156,558,28,37933,13904,1076,False,2784,54,False,1,0,39115,49926,1,1,5,0,0.000000,0,1,13.0,0.828571,0
757,7608154878,2024-02-25,344074048,Mj1ne,9298317,22,3,10,9,0.064655,1.30,1,3232,391,15380,10460,46423,2472,Omniknight,267,210,40,50,252,73,825,159,23,17522,2739,632,False,2784,54,False,1,0,18142,29324,0,0,4,1,0.000000,2,0,9.0,0.371429,0
758,7608154878,2024-02-25,137718020,ChiefSage Coco,9298317,44,10,5,6,0.215517,2.14,12,2335,526,21590,31012,29521,0,Outworld Destroyer,1,263,247,63,77,1806,309,256,24,22415,5073,728,False,2784,54,False,1,0,24406,33779,2,0,8,0,112.433350,0,0,2.0,0.428571,0


In [16]:
# heroes are stored as IDs instead of names. A new api call is needed to get the hero names. This will be stored in a dataframe

# check if hero names csv file exists
hero_names_file = Path('hero_names.csv')

if not hero_names_file.exists():
    heroes = requests.get(f'https://api.opendota.com/api/heroes?{api_key}').json()
    heroes_df = pd.DataFrame(heroes)
    heroes_df.to_csv(hero_names_file, index=False)
else:
    heroes_df = pd.read_csv(hero_names_file)

heroes_df.head()

Unnamed: 0,id,name,localized_name,primary_attr,attack_type,roles,legs
0,1,npc_dota_hero_antimage,Anti-Mage,agi,Melee,"['Carry', 'Escape', 'Nuker']",2
1,2,npc_dota_hero_axe,Axe,str,Melee,"['Initiator', 'Durable', 'Disabler', 'Jungler'...",2
2,3,npc_dota_hero_bane,Bane,int,Ranged,"['Support', 'Disabler', 'Nuker', 'Durable']",4
3,4,npc_dota_hero_bloodseeker,Bloodseeker,agi,Melee,"['Carry', 'Disabler', 'Jungler', 'Nuker', 'Ini...",2
4,5,npc_dota_hero_crystal_maiden,Crystal Maiden,int,Ranged,"['Support', 'Disabler', 'Nuker', 'Jungler']",2


In [17]:
games_with_errors = []

for  i, file in enumerate(file_names):

    #uncomment to see whcih files are loaded
    # print(f'{save_dir}/{file}')
    
    # read first json file as a dictionary
    with open(f'{save_dir}/{file}') as f:
        data = json.load(f)

    # get match id
    match_id = data['match_id']
    # if match id is already in matches_df, skip
    if match_id in match_data['match_id'].values:
        pass
    else:
        try:
            # check to see if match is valid

            rad_team_id = data['radiant_team_id']
            dire_team_id = data['dire_team_id']
            
            # read player from data into a dataframe

            df = pd.DataFrame(data['players'])

            #convert start_time from unix time to datetime using
            df['start_time'] = pd.to_datetime(df['start_time'], unit='s')
            df['date'] = df['start_time'].dt.date

            df['match_id'] = match_id

            #drop start_time
            df.drop('start_time', axis=1, inplace=True)

            # if isRadiant is true, set teamID to radiant team ID, else set to dire team ID

            df['teamID'] = df['isRadiant'].apply(lambda x: rad_team_id if x == True else dire_team_id)

            # damage_taken is a nested dictionary. We want the sum of the values in the dictionary. if damage_taken is empty, set to 0
            #df['damage_taken'] = df['damage_taken'].apply(lambda x: sum(x.values()) if x else 0)

            df['damage_taken'] = df['damage_taken'].apply(lambda x: sum(x.values()) if isinstance(x, dict) else 0)


            print(match_data.columns.tolist())
            print(df.columns.tolist())
            new_order = match_data.columns.tolist()

            df = df[new_order]

            #replace hero_id with hero name from heroes_df
            df['hero_id'] = df['hero_id'].map(heroes_df.set_index('id')['localized_name'])

            # append to main df via concat

            match_data = pd.concat([match_data, df], axis=0)

            # replace NaN with 0
            match_data.fillna(0, inplace=True)

            # save to csv every loop
            match_data.to_csv(csv_file, index=False)
        except Exception as e:
            print(end='\n\n')
            print(f'Error with match {match_id}')
            print(e, end='\n\n')
            games_with_errors.append(match_id)
            # remove match json file
            os.remove(f'{save_dir}/{file}')
            continue

7527115816
7527119500
7527121232
7527123622
7527124776
7527126476
7527166388
7527172879
7527182855
7527188964
7527206975
7538061655
7538098929
7538102352
7538106429
7538119346
7538119601
7538142780
7538147725
7538149847
7538151810
7538156988
7548898607
7548901412
7548902368
7548903495
7548905002
7548905794
7548941056
7548947740
7548948924
7548952464
7548963069
7548967240
7559363430
7559366561
7559366774
7559372440
7559376970
7559379959
7559407266
7559412484
7559418480
7559419009
7559425101
7559426789
7569959851
7569960493
7569960859
7569970858
7569996878
7570002228
7570007503
7570025266
7570028624
7594984084
7594985492
7594989855
7594993724
7594996940
7595021056
7595030536
7595038114
7595047319
7595050220
7608087042
7608088774
7608090905
7608094354
7608099485
7608100719
7608129062
7608136657
7608143642
7608145547
7608154878
7569961252
['match_id', 'date', 'account_id', 'personaname', 'teamID', 'rank_tier', 'kills', 'assists', 'deaths', 'kills_per_min', 'kda', 'denies', 'gold', 'gold_pe

#### Preview

Below will be a dataframe preview.

In [18]:
match_data

Unnamed: 0,match_id,date,account_id,personaname,teamID,rank_tier,kills,assists,deaths,kills_per_min,kda,denies,gold,gold_per_min,gold_spent,hero_damage,damage_taken,hero_healing,hero_id,item_0,item_1,item_2,item_3,item_4,item_5,item_neutral,last_hits,level,net_worth,tower_damage,xp_per_min,radiant_win,duration,patch,isRadiant,win,lose,total_gold,total_xp,obs_placed,sen_placed,rune_pickups,camps_stacked,stuns,creeps_stacked,firstblood_claimed,pings,teamfight_participation,roshans_killed
0,7527115816,2024-01-07,1724857,Kenny Lavender,9302527,53,4,5,5,0.120482,1.50,14,406,471,14270,23440,12669,0,Sniper,63,116,174,75,598,236,2096,208,18,13486,35,571,False,1992,54,True,0,1,15637,18957,2,2,2,3,0.133262,4,0,29.0,0.750000,0
1,7527115816,2024-01-07,1111506,LINE,9302527,21,2,4,9,0.060241,0.60,4,212,355,11280,6290,22794,0,Anti-Mage,145,36,22,1122,569,63,574,181,16,10212,1246,426,False,1992,54,True,0,1,11786,14143,0,0,0,0,4.566504,0,0,1.0,0.500000,0
2,7527115816,2024-01-07,87164528,-GoldenDragon,9302527,34,1,9,10,0.030120,0.91,2,747,299,9455,11425,22145,0,Underlord,0,90,29,190,36,0,376,89,15,8747,568,374,False,1992,54,True,0,1,9926,12416,0,0,5,0,36.633762,0,0,1.0,0.833333,0
3,7527115816,2024-01-07,41736499,pehches,9302527,23,1,8,8,0.030120,1.00,2,1100,251,6540,9365,11915,93,Lich,229,180,188,218,69,0,375,28,13,8100,40,317,False,1992,54,True,0,1,8333,10524,8,13,2,0,18.033594,0,1,4.0,0.666667,0
4,7527115816,2024-01-07,341676433,roxyroro,9302527,53,3,4,6,0.090361,1.00,1,888,313,9590,13739,17834,0,Rubick,36,180,232,108,0,20,680,82,14,9368,171,355,False,1992,54,True,0,1,10391,11786,4,8,3,1,33.066803,2,0,1.0,0.583333,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5,7569961252,2024-02-04,128623225,BlackOtter,9309818,15,3,7,6,0.105325,1.43,10,2203,382,7520,17098,23027,0,Invoker,267,73,36,29,40,534,331,116,16,9393,0,509,True,1709,54,False,0,1,10880,14498,2,5,8,0,74.501280,0,0,0.0,0.769231,0
6,7569961252,2024-02-04,50397057,off carry = me midas mael rad BF,9309818,42,2,5,9,0.070217,0.70,9,1379,393,9545,12469,27066,1642,Dawnbreaker,178,73,598,36,252,50,378,164,14,10064,162,393,True,1709,54,False,0,1,11193,11193,0,0,2,1,7.299414,3,0,2.0,0.538462,0
7,7569961252,2024-02-04,66836986,max,9309818,54,3,8,10,0.105325,1.00,6,1461,300,6190,25403,18314,1695,Warlock,36,180,188,218,254,0,589,67,12,6861,0,318,True,1709,54,False,0,1,8545,9057,3,10,2,1,8.800001,3,0,11.0,0.846154,0
8,7569961252,2024-02-04,42300571,Coplice,9309818,61,2,8,14,0.070217,0.67,0,494,283,6660,13280,22652,0,Nyx Assassin,201,244,36,42,180,0,297,41,13,7994,0,353,True,1709,54,False,0,1,8060,10054,2,5,1,0,58.900776,0,0,8.0,0.769231,0


In [19]:
print(games_with_errors)

# request parse of match with missing data.

for match in games_with_errors:
    response_url = f'https://api.opendota.com/api/request/{match_id}?api_key={api_key}'
    response = requests.post(response_url)
    print(response)


[]
