In [48]:
#import 

import pandas as pd
import numpy as np
import requests
import json
import bs4
import time
import os
from config import api_key

In [49]:
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [50]:
# Scrape the LD2L season page and collect the URL of every individual match page
# into `matches`, sorted by the numeric id at the end of each URL.
url = 'https://ld2l.gg/seasons/37/matches'

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as the season page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = bs4.BeautifulSoup(response.text, 'html.parser')

matches = []
for a in soup.find_all('a', href=True):
    href = a['href']

    # Keep links that mention 'match' but not 'season' (the season page links to itself).
    if 'match' not in href or 'season' in href:
        continue

    # The sort below calls int() on the last path segment, so drop anything
    # that does not end in a numeric id instead of crashing on it.
    if not href.rstrip('/').split('/')[-1].isdecimal():
        continue

    link = 'https://ld2l.gg' + href
    # The same match can be linked more than once on the page; keep one copy.
    if link not in matches:
        matches.append(link)

# sort matches by ID
matches.sort(key=lambda x: int(x.rstrip('/').split('/')[-1]))



In [51]:
# Make sure the log of already-scraped match URLs exists before it is read.
# An existing file is left untouched; a missing one is created empty.
scraped_log = 'matches.txt'
if os.path.exists(scraped_log) is False:
    open(scraped_log, 'w').close()

In [52]:
# Visit each LD2L match page that has not been scraped before and collect the
# OpenDota API URL for the game it links to.
#
# matches.txt is the log of match pages already visited; it prevents re-scraping
# the site on every run.

od_matches = []

# Read the log once into a set of exact URLs. A substring test against the raw
# file text would treat '.../matches/12' as already scraped as soon as
# '.../matches/123' had been logged.
with open('matches.txt') as f:
    scraped = {line.strip() for line in f if line.strip()}

for match in matches:
    if match in scraped:
        continue

    try:
        response = requests.get(match, timeout=30)
        response.raise_for_status()
    except requests.RequestException as exc:
        # Not logged as scraped, so the next run will try this page again.
        print(f"Skipping {match}: {exc}")
        continue

    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    for a in soup.find_all('a', href=True):
        href = a['href']
        if 'opendota' not in href:
            continue
        # Only the first OpenDota link on the page is considered.
        # 'matches/0' is presumably a placeholder for a match with no recorded
        # game -- TODO confirm. Such a link yields no API URL.
        if 'matches/0' not in href:
            # get match id from end of url
            match_id = href.split('/')[-1]
            od_matches.append(f"https://api.opendota.com/api/matches/{match_id}?api_key={api_key}")
        break

    # Log the page only after it was fetched and parsed successfully.
    # NOTE(review): pages without a usable OpenDota link are logged too and will
    # never be revisited -- confirm that is intended for matches played later.
    with open('matches.txt', 'a') as f:
        f.write(match + '\n')
    scraped.add(match)



In [53]:
# Build `file_names`: the match JSON files already on disk plus any that are
# downloaded from OpenDota in this run.

# Only files written by this notebook ('match_<id>.json') are picked up; any
# other .json file in the working directory would break the parsing step.
# Sorted so the processing order does not depend on directory listing order.
file_names = sorted(
    name for name in os.listdir()
    if name.startswith('match_') and name.endswith('.json')
)

for match in od_matches:

    # get match id (last path segment, without the ?api_key=... query string)
    match_id = match.split('/')[-1].split('?')[0]
    json_path = 'match_' + match_id + '.json'

    # skip placeholder ids and matches that were already downloaded
    if match_id == '0' or os.path.isfile(json_path):
        continue

    try:
        response = requests.get(match, timeout=30)
        response.raise_for_status()
        match_json = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        print(f"Could not download match {match_id}: {exc}")
        continue

    # Never cache an error payload: once the file exists it is not downloaded
    # again, and the parsing step requires the 'match_id' key.
    if not isinstance(match_json, dict) or 'match_id' not in match_json:
        print(f"Unexpected response for match {match_id}; not saved")
        continue

    # get json of match and save to json file
    with open(json_path, 'w') as f:
        json.dump(match_json, f)
    file_names.append(json_path)

In [54]:
# Load the accumulated per-player match table from match_data.csv, or create an
# empty table (and the file) with the expected columns on the first run.

match_columns = ['match_id', 'date', 'week', 'account_id', 'personaname', 'teamID', 'rank_tier', 'kills', 'assists',
       'deaths', 'kills_per_min', 'kda', 'denies', 'gold', 'gold_per_min', 'gold_spent', 'hero_damage', 'damage_taken',
       'hero_healing', 'hero_id', 'item_0', 'item_1', 'item_2', 'item_3',
       'item_4', 'item_5', 'item_neutral', 'last_hits', 'level',
       'net_worth', 'tower_damage', 'xp_per_min', 'radiant_win',
       'duration', 'patch', 'isRadiant', 'win', 'lose',
       'total_gold', 'total_xp', 'obs_placed', 'sen_placed', 'rune_pickups',
       'firstblood_claimed', 'pings', 'teamfight_participation', 'roshans_killed']

if not os.path.exists('match_data.csv'):
    match_data = pd.DataFrame(columns=match_columns)
    # index=False keeps the file consistent with how it is re-saved later;
    # writing the index would add an unnamed first column that read_csv
    # returns as 'Unnamed: 0' on the next run.
    match_data.to_csv('match_data.csv', index=False)
else:
    match_data = pd.read_csv('match_data.csv', index_col=None)
    # Drop index columns left behind by files that were saved with the index.
    stray_columns = [c for c in match_data.columns if str(c).startswith('Unnamed:')]
    match_data = match_data.drop(columns=stray_columns)

In [55]:
# Turn every downloaded match JSON that is not yet in `match_data` into one row
# per player, append the rows to `match_data`, and save the table to
# match_data.csv.

new_order = ['match_id', 'date', 'week', 'account_id', 'personaname', 'teamID', 'rank_tier', 'kills', 'assists',
       'deaths', 'kills_per_min', 'kda', 'denies', 'gold', 'gold_per_min', 'gold_spent', 'hero_damage', 'damage_taken',
       'hero_healing', 'hero_id', 'item_0', 'item_1', 'item_2', 'item_3',
       'item_4', 'item_5', 'item_neutral', 'last_hits', 'level',
       'net_worth', 'tower_damage', 'xp_per_min', 'radiant_win',
       'duration', 'patch', 'isRadiant', 'win', 'lose',
       'total_gold', 'total_xp', 'obs_placed', 'sen_placed', 'rune_pickups',
       'firstblood_claimed', 'pings', 'teamfight_participation', 'roshans_killed']

# Match ids already in the table, plus the ones added during this run.
known_match_ids = set(match_data['match_id'].tolist())

# Frames are collected here and concatenated once after the loop; concatenating
# and re-writing the CSV on every iteration is quadratic in the number of matches.
new_frames = []

for file in file_names:

    # read the json file as a dictionary
    with open(file) as f:
        data = json.load(f)

    # get match id; skip matches that are already in the table
    match_id = data['match_id']
    if match_id in known_match_ids:
        continue

    # .get() tolerates payloads without team ids; teamID is then left empty.
    rad_team_id = data.get('radiant_team_id')
    dire_team_id = data.get('dire_team_id')

    # read players from data into a dataframe (one row per player)
    df = pd.DataFrame(data['players'])

    # damage_taken is a nested dictionary and is replaced with the sum of its
    # values; anything that is not a dictionary becomes NaN instead of crashing.
    df['damage_taken'] = df['damage_taken'].apply(
        lambda x: sum(x.values()) if isinstance(x, dict) else np.nan
    )

    # convert start_time from unix time to datetime
    start_time = pd.to_datetime(df['start_time'], unit='s')
    df['date'] = start_time.dt.date

    # Games are played weekly; the ISO week number is shifted by 2 to get the
    # season week. isocalendar() returns an unsigned dtype, so cast to a signed
    # one first or ISO week 1 would wrap around to a huge number.
    # NOTE(review): the offset of 2 is tied to a season starting 2023-01-22 and
    # start_time is converted without a timezone -- confirm for other seasons.
    df['week'] = start_time.dt.isocalendar().week.astype('Int64') - 2

    # if isRadiant is true, set teamID to radiant team ID, else set to dire team ID
    df['teamID'] = np.where(df['isRadiant'].eq(True), rad_team_id, dire_team_id)

    # Select and order the output columns. reindex() fills a column that is
    # missing from the payload with NaN rather than raising KeyError; columns
    # not listed (including start_time) are dropped.
    df = df.reindex(columns=new_order)

    new_frames.append(df)
    known_match_ids.add(match_id)

if new_frames:
    # Leave an empty starting table out of the concat so it cannot affect dtypes.
    frames = new_frames if match_data.empty else [match_data, *new_frames]
    match_data = pd.concat(frames, axis=0, ignore_index=True)

    # The JSON files stay on disk, so a failed run can simply be re-run.
    match_data.to_csv('match_data.csv', index=False)