In [119]:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
from datetime import datetime
import numpy as np
import aiohttp
import asyncio
from fake_useragent import UserAgent
import re
import json
from tqdm.notebook import tqdm

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


In [120]:
# Column schema of the scraped dataset: seven per-team fields for team 1,
# the same seven for team 2, then the winner flag and the match date.
# NOTE(review): the last column is spelled 'data' but is filled with a date
# string by parse_match; the name is kept so the output schema is unchanged.
data = {
    f'{field}_team_{side}': {}
    for side in (1, 2)
    for field in ('name', 'rank', 'ratio', 'points', 'prize', 'players', 'heroes')
}
data['won_team_1'] = {}
data['data'] = {}

# Empty frame that the scraping coroutines extend.
df = pd.DataFrame(data)

In [112]:
async def parse_match(url, session):
    """Scrape one match page and append a row per game to the global ``df``.

    Team-level stats (name, rank, win ratio, points, prize money) are read
    from the ``widget-team`` blocks. Every ``section`` that carries both the
    ``boxed-section`` and ``antispoiler-hide`` classes is treated as one game
    and contributes one row holding the picks and the winner flag.

    Parameters
    ----------
    url : str
        Address of the match page.
    session : aiohttp.ClientSession
        Open session used to issue the request.

    Returns
    -------
    None
        Results are delivered by rebinding the module-level ``df``.

    Notes
    -----
    Best-effort: any exception is caught and reported with ``print`` so that
    one malformed page does not abort the crawl. Rows parsed before the
    failure are still added to ``df``.
    """
    global df
    rows = []

    try:
        headers = {
            'user-agent': UserAgent()['google_chrome']
        }

        async with session.get(url=url, headers=headers) as response:
            response_text = await response.text()

        soup = bs(response_text, 'lxml')

        # BeautifulSoup calls an attribute filter with None for tags that have
        # no such attribute, so guard before splitting the class string.
        games = soup.find_all(
            'section',
            class_=lambda css: css is not None
            and 'boxed-section' in css.split()
            and 'antispoiler-hide' in css.split(),
        )

        teams_info = {}
        for i, team in enumerate(soup.find_all('div', class_='widget-team'), start=1):
            team_name = team.find('div', class_='txt-content').text
            infos = team.find('ul', class_='widget-icon-info').find_all('li')

            # Trim padding so the placeholder comparisons below cannot be
            # defeated by surrounding whitespace.
            rank = infos[0].text.split(':')[1].strip().strip('.').strip()
            ratio = re.findall(r'\((.+?)\)', infos[1].text)[0].strip('%')
            points = infos[2].text.split(':')[1].replace(',', '').strip()
            prize = re.findall(r'\:(.+?)\€', infos[3].text)[0].replace(',', '')

            # Placeholders shown for teams without a value are stored as 0.
            if rank in ('-', '---'):
                rank = 0
            if points == '---':
                points = 0

            teams_info[f'team_name_{i}'] = team_name
            teams_info[f'rank_{i}'] = rank
            teams_info[f'ratio_{i}'] = ratio
            teams_info[f'points_{i}'] = points
            teams_info[f'prize_{i}'] = prize

        if games:
            # The timestamp belongs to the page, not to a single game, so it is
            # looked up once. It is only required when there is a row to build.
            match_ts = soup.find('span', class_='tztime').get('data-time')
            match_date = datetime.utcfromtimestamp(int(match_ts)).strftime('%Y-%m-%d')

        for game in games:
            who_won = game.find_all('div', class_='content-match-sub-team-titles')
            # The winner icon inside the first title block marks a team-1 win.
            won_left = 1 if who_won[0].find('i', class_='icon-winner') else 0

            players = []
            heroes = []
            for pick_list in game.find_all('ul', class_='content-match-sub-picks'):
                for pick in pick_list.find_all('li'):
                    heroes.append(pick.find('img').get('title'))
                    players.append(pick.find('span').text)

            # The first five picks are attributed to team 1 and the rest to
            # team 2 (assumes five picks per team -- TODO confirm).
            rows.append({'name_team_1': teams_info['team_name_1'],
                         'rank_team_1': teams_info['rank_1'],
                         'ratio_team_1': teams_info['ratio_1'],
                         'points_team_1': teams_info['points_1'],
                         'prize_team_1': teams_info['prize_1'],
                         'players_team_1': players[:5],
                         'heroes_team_1': heroes[:5],
                         'name_team_2': teams_info['team_name_2'],
                         'rank_team_2': teams_info['rank_2'],
                         'ratio_team_2': teams_info['ratio_2'],
                         'points_team_2': teams_info['points_2'],
                         'prize_team_2': teams_info['prize_2'],
                         'players_team_2': players[5:],
                         'heroes_team_2': heroes[5:],
                         'won_team_1': won_left,
                         'data': match_date})

    except Exception as exc:
        # Report which page failed and why instead of a constant string.
        print(f'parse_match failed for {url}: {exc!r}')

    if rows:
        # DataFrame.append was removed in pandas 2.0; a single concat per page
        # also avoids re-copying the frame once per game. There is no await
        # between reading and rebinding ``df``, so concurrent tasks cannot
        # interleave here.
        df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

In [113]:
async def parse_finished_matches(url):
    """Fetch one listing page of finished matches and scrape every match on it.

    The endpoint is expected to answer with JSON whose ``'data'`` entry holds
    an HTML fragment; each ``tr`` with class ``finished`` in that fragment is
    one match, and its ``table-cell-container`` anchor links to the match page.

    Parameters
    ----------
    url : str
        Address of the listing page.

    Returns
    -------
    None
        Rows are collected by ``parse_match``.

    Notes
    -----
    Reads the module-level ``headers`` dict, which must be defined before this
    coroutine is awaited. Match pages are processed one after another on a
    session that is private to this listing page.
    """
    async with aiohttp.ClientSession() as session:
        # The context manager guarantees the response is released even if
        # decoding the body fails.
        async with session.get(url=url, headers=headers) as response:
            payload = await response.json()

        soup = bs(payload['data'], "html.parser")

        for row in soup.find_all('tr', class_='finished'):
            anchor = row.find('a', class_='table-cell-container')
            link = anchor.get('href') if anchor is not None else None
            if not link:
                # A row without a usable link is skipped so it cannot abort
                # the rest of the page.
                continue
            await parse_match(link, session)


In [114]:
# Listing pages 1..1000 of finished matches, grouped into 50 consecutive
# batches of 20 pages each (1-20, 21-40, ..., 981-1000).
pack_of_urls = [
    [
        f'https://www.joindota.com/ajax/list_load?name=matches_finished&page={i}&a1=&devmode=1&language=en'
        for i in range(first_page, first_page + 20)
    ]
    for first_page in range(1, 1001, 20)
]

# Request headers for the listing-page requests; parse_finished_matches reads
# this dict as a module-level name.
headers = {}
headers['user-agent'] = UserAgent()['google_chrome']

In [121]:
for urls in tqdm(pack_of_urls):
    tasks = []
    for url in urls:
        task = parse_finished_matches(url)
        tasks.append(task)
    await asyncio.gather(*tasks)
    

  0%|          | 0/1 [00:00<?, ?it/s]

In [127]:
# Remove games without usable draft data. The order matters: the second
# filter indexes element 0 of each list, so rows with an empty list have to
# be gone before it runs.
df = df.drop(
    index=df[df['heroes_team_1'].apply(lambda picks: picks == [])].index
)
df = df.drop(
    index=df[df['heroes_team_1'].apply(lambda picks: picks[0] == '')].index
)

In [130]:
# Store 0 wherever a rank column still holds the '---' placeholder.
for _rank_col in ('rank_team_1', 'rank_team_2'):
    df[_rank_col] = df[_rank_col].apply(
        lambda rank: 0 if rank == '---' else rank
    )

In [131]:
# Cast the scraped per-team statistics to integers, in the order
# rank, points, ratio, prize -- team 1 before team 2 for each statistic.
for _stat in ('rank', 'points', 'ratio', 'prize'):
    for _side in (1, 2):
        _column = f'{_stat}_team_{_side}'
        df[_column] = df[_column].astype(int)

In [33]:
# Persist the cleaned dataset (the index is written as the first column).
df.to_csv(path_or_buf='dota_matches_1k.csv')