In [1]:
from tqdm import tqdm
import numpy as np
import pandas as pd

In [2]:
from lxml import html
import requests
from lxml.cssselect import CSSSelector

# Считывание информации

## Top-20 команд по версии hltv.org

In [3]:
# Base URL of the site; relative hrefs scraped in later cells are appended to it.
site_address = 'http://www.hltv.org'
# Download the team-rankings landing page and parse the response body into an lxml HTML tree.
teams_rankings = requests.get(site_address + '/ranking/teams')
teams_rankings_html = html.fromstring(teams_rankings.content)

In [None]:
# Anchors selected from the first "tab_group1" div of the rankings page.
# NOTE(review): going by the variable name these are the per-year tabs — confirm against the page markup.
years = teams_rankings_html.xpath('//div[@class="tab_group1"][1]//a')

In [4]:
# Relative hrefs of the individual ranking pages, filled in by the crawl below.
teams_rankings_page_hrefs = []

# Drill down through the "tab_group1" tab rows: every link in `years` is fetched,
# the links selected with `[2]` on that page are fetched in turn, and the links
# selected with `[3]` on those pages are collected.
# NOTE(review): by the variable names these are the year -> month -> day tabs — confirm.
for year in years:
    year_href = year.get('href')
    year_page = requests.get(site_address + year_href)
    year_page_html = html.fromstring(year_page.content)
    year_monthes = year_page_html.xpath('//div[@class="tab_group1"][2]//a')
    
    for month in year_monthes:
        month_href = month.get('href')
        month_page = requests.get(site_address + month_href)
        month_page_html = html.fromstring(month_page.content)
        month_days = month_page_html.xpath('//div[@class="tab_group1"][3]//a')
        # Only the href attribute is kept; a later cell prefixes the site address.
        teams_rankings_page_hrefs += [day.get('href') for day in month_days]

In [5]:
# Turn the scraped relative hrefs into absolute URLs.
# The list is rebound to the same name, so the cell must be idempotent: entries
# that already start with the site address are left untouched, otherwise
# re-running the cell would prepend the host a second time.
teams_rankings_page_hrefs = [
    page_href if page_href.startswith(site_address) else site_address + page_href
    for page_href in teams_rankings_page_hrefs
]

In [6]:
# Scrape one hard-coded ranking page to inspect its structure.
href = 'http://www.hltv.org/ranking/teams/2015/october/1/'
ranking_page = requests.get(href)
ranking_page_html = html.fromstring(ranking_page.content)

# Anchors pointing at each ranked team's page, and the container of each team's lineup.
teams_statistic_hrefs = ranking_page_html.xpath('//div[@class="ranking-teamStatistics"]//a')
teams_lineups_html = ranking_page_html.xpath('//div[@class="ranking-lineup"]')

# Team hrefs in page order.
teams = [team_anchor.get('href') for team_anchor in teams_statistic_hrefs]

# One inner list of player hrefs per lineup container.
lineups = [
    [player_anchor.get('href')
     for player_anchor in lineup_html.xpath('.//div[@class="ranking-playerNick"]//a')]
    for lineup_html in teams_lineups_html
]

In [7]:
# Display the href of every team anchor found on the ranking page.
[href.get('href') for href in teams_statistic_hrefs]

['/?pageid=362&teamid=4991',
 '/?pageid=179&teamid=5996',
 '/?pageid=362&teamid=5991',
 '/?pageid=362&teamid=5378',
 '/?pageid=362&teamid=4608',
 '/?pageid=362&teamid=4411',
 '/?pageid=362&teamid=5752',
 '/?pageid=362&teamid=5995',
 '/?pageid=362&teamid=5422',
 '/?pageid=362&teamid=4494',
 '/?pageid=362&teamid=5974',
 '/?pageid=362&teamid=6290',
 '/?pageid=362&teamid=5988',
 '/?pageid=362&teamid=5973',
 '/?pageid=179&teamid=5284',
 '/?pageid=179&teamid=6226',
 '/?pageid=362&teamid=5310',
 '/?pageid=362&teamid=6211',
 '/?pageid=179&teamid=6375',
 '/?pageid=179&teamid=6372']

In [8]:
# Display the player hrefs, grouped into one list per team lineup.
lineups

[['/player/41-pronax',
  '/player/885-olofmeister',
  '/player/3055-flusha',
  '/player/3849-jw',
  '/player/7528-krimz'],
 ['/player/429-karrigan',
  '/player/2469-cajunb',
  '/player/4954-xyp9x',
  '/player/7398-dupreeh',
  '/player/7592-device'],
 ['/player/4959-kioshima',
  '/player/7167-kennys',
  '/player/7168-nbk',
  '/player/7322-apex',
  '/player/7429-happy'],
 ['/player/161-taz',
  '/player/165-neo',
  '/player/317-pashabiceps',
  '/player/2553-snax',
  '/player/5386-byali'],
 ['/player/483-edward',
  '/player/484-zeus',
  '/player/2757-guardian',
  '/player/3347-seized',
  '/player/7594-flamie'],
 ['/player/29-f0rest',
  '/player/39-get-right',
  '/player/695-allu',
  '/player/884-xizt',
  '/player/7148-friberg'],
 ['/player/203-n0thing',
  '/?pageid=173&playerid=1916',
  '/player/7440-skadoodle',
  '/player/7808-freakazoid',
  '/player/8349-shroud'],
 ['/player/629-fox',
  '/player/1045-maikelele',
  '/player/1146-dennis',
  '/player/7390-scream',
  '/player/8183-rain'],
 [

## Match stats

### matches pages

In [4]:
# Query string of the match listing; the page offset is appended to it.
matches_listing_address = '?pageid=188&offset='
# 570 listing pages, with offsets advancing in steps of 50: 0, 50, ..., 28450.
offsets = [50 * page_index for page_index in range(570)]

In [33]:
# Walk every listing page and gather the first anchor of each match entry.
matches_pages_hrefs = []
for offset in tqdm(offsets):
    page = requests.get('{}{}{}'.format(site_address, matches_listing_address, offset))
    page_html = html.fromstring(page.content)
    matches_pages_hrefs.extend(
        page_html.xpath('//div[@class="centerNoHeadline"]//div[@class="covMainBoxContent"]//a[1]')
    )

# Reduce the collected anchor elements to their href attributes.
matches_pages_links = [anchor.get('href') for anchor in matches_pages_hrefs]

100%|████████████████████████████████████████| 570/570 [13:34<00:00,  1.32s/it]


In [157]:
# Deliberately disabled: the condition is always false, so nothing is written.
# Flip it by hand to dump the collected match links to disk, one per line.
if False:
    with open('matches_pages_links.txt', 'w') as file:
        file.writelines("{}\n".format(line) for line in matches_pages_links)

In [4]:
# Load the match links from the text file (one link per line), replacing any
# in-memory `matches_pages_links` — presumably so the listing crawl above does
# not have to be repeated.
with open('matches_pages_links.txt', 'r') as file:
    matches_pages_links = file.read().splitlines()

### match stats

In [5]:
def get_match_teams(match_page_html):
    """Return the hrefs of the two teams that played a match.

    Parameters
    ----------
    match_page_html : parsed match page exposing ``xpath``.

    Returns
    -------
    list
        ``[['team one', href], ['team two', href]]`` built from the first two
        anchors matched on the page.

    Raises
    ------
    IndexError
        If fewer than two anchors are found.
    """
    anchors = match_page_html.xpath(
        '//div[@class="covGroupBoxContent"]//div[@class="covSmallHeadline"][1]//a'
    )
    hrefs = [anchor.get('href') for anchor in anchors]
    first_team, second_team = hrefs[0], hrefs[1]
    return [['team one', first_team], ['team two', second_team]]

In [6]:
def href_if_link_else_text(xpath_node):
    """Return the href of the node's first descendant anchor, or its text.

    If ``xpath_node`` contains at least one ``<a>`` descendant, the ``href``
    attribute of the first one is returned; otherwise the node's full text
    content is returned.
    """
    anchors = xpath_node.xpath('.//a')
    if not anchors:
        return xpath_node.text_content()
    return anchors[0].get('href')

def three_stats_to_two(stats):
    """Split ``[name, player, value]`` triples into two key/value pairs each.

    Every triple yields ``[name + ' player', player]`` followed by
    ``[name, value]``, in input order.

    Parameters
    ----------
    stats : iterable of sequences with at least three items; item 0 must be a
        string because ``' player'`` is appended to it.

    Returns
    -------
    list of two-item lists. An empty input yields an empty list.

    Notes
    -----
    The pairs are built with plain Python lists. Flattening through
    ``np.concatenate`` raised ``ValueError`` on empty input and turned the
    strings into NumPy scalar types.
    """
    pairs = []
    for stat in stats:
        pairs.append([stat[0] + ' player', stat[1]])
        pairs.append([stat[0], stat[2]])
    return pairs
    
def get_match_main_stats(match_page_html):
    """Extract the headline statistics of a match page as key/value pairs.

    The "covSmallHeadline" nodes inside "covGroupBoxContent" are read
    positionally:

    * nodes 1..16 are consumed two at a time as ``[label, value]`` pairs;
    * nodes from 18 onwards are consumed three at a time and expanded by
      ``three_stats_to_two`` into two pairs each.

    Nodes 0 and 17 are skipped (presumably section headings — verify against
    the page markup). Every node is reduced with ``href_if_link_else_text``.

    Returns a list of two-item lists.
    """
    headline_nodes = match_page_html.xpath(
        '//div[@class="covGroupBoxContent"]//div[@class="covSmallHeadline"]'
    )
    team_nodes = headline_nodes[1:17]
    player_nodes = headline_nodes[18:]

    team_rows = [
        [href_if_link_else_text(node) for node in pair]
        for pair in zip(team_nodes[0::2], team_nodes[1::2])
    ]
    player_rows = [
        [href_if_link_else_text(node) for node in triple]
        for triple in zip(player_nodes[0::3], player_nodes[1::3], player_nodes[2::3])
    ]

    return team_rows + three_stats_to_two(player_rows)

In [7]:
def get_match_players_stats(match_page_html):
    """Extract the per-player table of a match page as key/value pairs.

    The "covSmallHeadline" nodes inside "covMainBoxContent" are scanned for
    the one whose text is exactly ``'Raw stats'``; everything from that node
    onwards is treated as a row-major table. The table is read as 10 columns
    when the node count is a multiple of 10, otherwise as 9 columns. The
    first row supplies the column names, and each following row is one player.

    Returns
    -------
    list of ``[column_name + ' p_<row index>', value]`` pairs, player by
    player. A table with a header row but no player rows yields an empty list.

    Raises
    ------
    ValueError
        If no node has the text ``'Raw stats'``, or if the table does not
        contain even a complete header row.
    """
    nodes = match_page_html.xpath('//div[@class="covMainBoxContent"]//div[@class="covSmallHeadline"]')
    start_index = [node.text_content() for node in nodes].index('Raw stats')
    nodes = nodes[start_index:]

    # Integer modulo replaces the float comparison `len//10 == len/10`.
    n_columns = 10 if len(nodes) % 10 == 0 else 9
    table_rows = zip(*(nodes[i::n_columns] for i in range(n_columns)))

    col_names, *players_rows = [
        [href_if_link_else_text(cell) for cell in row] for row in table_rows
    ]

    # Flatten with a nested comprehension rather than np.concatenate, which
    # raises on an empty player list and converts the strings to NumPy scalars.
    return [
        [col_name + ' p_' + str(i), stat]
        for i, player_row in enumerate(players_rows)
        for col_name, stat in zip(col_names, player_row)
    ]

In [8]:
def get_match_stats(match_page_html):
    """Collect all statistics of one match page into a single dict.

    Concatenates the key/value pairs from ``get_match_teams``,
    ``get_match_main_stats`` and ``get_match_players_stats`` (in that order)
    and converts them to a dict, so a later pair overrides an earlier one
    with the same key.
    """
    pairs = (
        get_match_teams(match_page_html)
        + get_match_main_stats(match_page_html)
        + get_match_players_stats(match_page_html)
    )
    return dict(pairs)

In [9]:
# Download every match page and parse it into a stats dict, one per link.
matches_stats = []
for link in tqdm(matches_pages_links):
    match_page_link = site_address + link

    # Keep retrying until the download succeeds. Only network-level failures
    # are retried; a bare `except` would also swallow KeyboardInterrupt and
    # make the loop impossible to stop. The timeout keeps a stalled
    # connection from hanging forever; a timed-out attempt is retried too.
    while True:
        try:
            match_page = requests.get(match_page_link, timeout=30)
        except requests.exceptions.RequestException:
            continue
        break

    match_page_html = html.fromstring(match_page.content)
    matches_stats.append(get_match_stats(match_page_html))

100%|██████████████████████████████████| 28500/28500 [4:35:13<00:00,  2.42it/s]


In [10]:
# Number of key/value entries in each match's stats dict.
stats_lens = [len(stats) for stats in matches_stats]
# Distinct dict sizes (np.unique returns them sorted).
stats_lens_unique = np.unique(stats_lens)
# Display (size, number of matches with that size) pairs to see how uniform the scraped pages are.
[(length, stats_lens.count(length)) for length in stats_lens_unique]

[(100, 5),
 (110, 12228),
 (112, 5),
 (119, 179),
 (122, 15944),
 (128, 14),
 (132, 119),
 (137, 4),
 (142, 2)]

## Storing stats to csv

In [11]:
# Keep only matches whose stats dict has 110 or 122 entries — by the counts
# shown above these two sizes cover nearly all matches.
kept_indices = [i for i, stats in enumerate(matches_stats) if len(stats) in (110, 122)]

# One row per kept match, plus the link of the page it was scraped from.
df = pd.DataFrame([matches_stats[i] for i in kept_indices])
df['match page'] = [matches_pages_links[i] for i in kept_indices]

In [12]:
# Preview the first five rows of the assembled table.
df.head(5)

Unnamed: 0,A p_0,A p_1,A p_2,A p_3,A p_4,A p_5,A p_6,A p_7,A p_8,A p_9,...,Team p_4,Team p_5,Team p_6,Team p_7,Team p_8,Team p_9,Team rating,team one,team two,match page
0,3,6,1,0,1,3,5,2,4,1,...,/?pageid=179&teamid=7397,/?pageid=179&teamid=4688,/?pageid=179&teamid=4688,/?pageid=179&teamid=4688,/?pageid=179&teamid=4688,/?pageid=179&teamid=7397,0.81 : 1.20,/?pageid=179&teamid=4688,/?pageid=179&teamid=7397,/?pageid=188&matchid=46149
1,3,3,4,4,4,2,4,6,2,2,...,/?pageid=179&teamid=4688,/?pageid=179&teamid=4688,/?pageid=179&teamid=4688,/?pageid=179&teamid=7397,/?pageid=179&teamid=4688,/?pageid=179&teamid=4688,1.27 : 0.73,/?pageid=179&teamid=7397,/?pageid=179&teamid=4688,/?pageid=188&matchid=46147
2,3,3,7,3,5,6,6,2,5,4,...,/?pageid=179&teamid=7897,/?pageid=179&teamid=7897,/?pageid=179&teamid=6745,/?pageid=179&teamid=6745,/?pageid=179&teamid=6745,/?pageid=179&teamid=6745,0.84 : 1.11,/?pageid=179&teamid=6745,/?pageid=179&teamid=7897,/?pageid=188&matchid=46148
3,5,3,4,4,5,3,3,4,0,3,...,/?pageid=179&teamid=7897,/?pageid=179&teamid=7897,/?pageid=179&teamid=6745,/?pageid=179&teamid=6745,/?pageid=179&teamid=6745,/?pageid=179&teamid=6745,1.21 : 0.78,/?pageid=179&teamid=7897,/?pageid=179&teamid=6745,/?pageid=188&matchid=46146
4,5,4,2,3,4,5,0,1,5,4,...,/?pageid=179&teamid=5973,/?pageid=179&teamid=5973,/?pageid=179&teamid=5752,/?pageid=179&teamid=5752,/?pageid=179&teamid=5752,/?pageid=179&teamid=5752,0.79 : 1.20,/?pageid=179&teamid=5752,/?pageid=179&teamid=5973,/?pageid=188&matchid=46145


In [24]:
# Write the table to data.csv without the row index.
df.to_csv('data.csv', index=False)

In [22]:
# Read data.csv back and display its (rows, columns) shape.
pd.read_csv('data.csv', index_col=False).shape

(5, 123)

In [23]:
# Shape of a five-row preview of the in-memory frame, for comparison with the file read back above.
df.head(5).shape

(5, 123)