In [373]:
import pprint as pp
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import simplejson as json
import time
import random
import os
import glob
import re
import unidecode
import datetime as dt
from collections import Counter
import sys
import pickle

### Get rider profile URLs from select races' startlists of last 10 years

In [None]:
user_agent = {'User-agent': 'Mozilla/5.0'}

def get_rider_urls(race_url,start_year,years_back):
    base_url = 'https://www.procyclingstats.com/race/' + str(race_url) + '/'
    print(str(race_url).upper(),'starting...')
    for year in range(int(start_year),int(start_year-(years_back+1)),-1):
        url = base_url + str(year) + '/startlist'
        response = requests.get(url, headers=user_agent)

        if response.status_code == 200:
            page = response.text
            soup = BeautifulSoup(page, 'lxml')
            startlist = soup.find_all('a', class_='rider blue ')
            urls = [x['href'] for x in startlist]
            urls = ['https://www.procyclingstats.com/' + u for u in urls]
            file = str(race_url) + '_' + str(year) + '.json'
            with open(file, 'w') as f:
                json.dump(urls, f)
            timer = 2 + 2 * random.random()
            print(year,'done, sleeping for',np.round(timer,2),'sec')
            time.sleep(timer)
        else:
            print('unsuccessful request!')
            print('status code:',response.status_code)
            break
    print('finished!\n')

**races chosen for scraping startlists:**
- 2007 to present
  - giro-d-italia
  - tour-de-france
- 2010 to present
  - vuelta-a-espana
- 2013 to present
  - strade-bianchi
  - paris-nice
  - milano-sanremo
  - gent-wevelgem
  - ronde-van-vlaanderen
  - paris-roubaix
  - amstel-gold-race
  - tour-of-california
  - liege-bastogne-liege
  - tour-de-suisse
  - il-lombardia

In [None]:
for race in races:
    get_rider_urls(race,2017,4)

### See how many unique riders there are across those races

In [23]:
path = 'startlists/'
unique_riders = []

for filename in glob.glob(os.path.join(path, '*.json')):
    with open(filename) as f:
        riders = json.load(f)
        unique_riders.extend(riders)

print('number of unique rider urls:',len(set(unique_riders)))

number of unique rider urls: 1731


In [407]:
unique_riders = list(set(unique_riders))
print(unique_riders[:10])

['https://www.procyclingstats.com/rider/alexander-edmondson', 'https://www.procyclingstats.com/rider/francois-parisien', 'https://www.procyclingstats.com/rider/valentin-baillifard', 'https://www.procyclingstats.com/rider/adrian-saez-de-arregi', 'https://www.procyclingstats.com/rider/alex-cano-ardila', 'https://www.procyclingstats.com/rider/steve-zampieri', 'https://www.procyclingstats.com/rider/jay-robert-thomson', 'https://www.procyclingstats.com/rider/anthony-perez', 'https://www.procyclingstats.com/rider/kevin-hulsmans', 'https://www.procyclingstats.com/rider/ryder-hesjedal']


### Download all season pages for each rider

In [524]:
start = 1681
end = 1731

print(dt.datetime.now(),'Downloading web pages...')
for rdx, first_url in enumerate(unique_riders[start:end]):
    # save first season and get list of remaining ones
    response = requests.get(first_url, headers=user_agent)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    seasons = [year.text for year in soup.find('ul', class_='horiztree').contents]
    save_name = first_url.split('/')[4] + '_' + seasons[0] + '.html'
    rdx_str = 'R[' + str(rdx+1) + ']'
    sdx_str = 'S[1]'
    
    with open(os.path.join('riders/',save_name), 'w') as file:
        file.write(str(soup))
    print(dt.datetime.now(),rdx_str,sdx_str,'Saved:',save_name,end='\r')
    
    random_wait = 2 + 2 * random.random()
    time.sleep(random_wait)
    
    # save the remaining seasons
    if len(seasons) > 1:
        remaining_seasons = seasons[1:len(seasons)]
        for sdx, year in enumerate(remaining_seasons):
            next_url = first_url + '&season=' + year
            response = requests.get(next_url, headers=user_agent)
            page = response.text
            soup = BeautifulSoup(page, 'lxml')
            save_name = save_name.split('_')[0] + '_' + year + '.html'
            sdx_str = 'S[' + str(sdx+2) + ']'
            
            with open(os.path.join('riders/',save_name), 'w') as file:
                file.write(str(soup))
            print(dt.datetime.now(),rdx_str,sdx_str,'Saved:',save_name,end='\r')
            
            random_wait = 2 + 2 * random.random()
            time.sleep(random_wait)
    else:
        continue
print(dt.datetime.now(),'Done')

2018-01-28 16:14:02.829642 Downloading web pages...
2018-01-28 16:46:08.609204 Done] S[7] Saved: tim-ariesen_2012.html.htmltml_2009.html


**estimates after first hour:**
- 15 pages per minute
- 10 pages per rider
- 1731 riders
- 17,310 total pages
- 900 pages per hour
- 19 hours total scraping time
- 16 pages per MB, total 1 GB

**final tally:**
- 17,806 pages
- 1.14 GB

**other races to consider if more riders are needed:**
- Vuelta a San Juan (Argentina)
- Tour Down Under (Australia)
- Dubai Tour (United Arab Emirates)
- Tirreno-Adriatico (Italy)

### Download annual PCS ranking pages for all riders

In [92]:
user_agent = {'User-agent': 'Mozilla/5.0'}
start_page = 'https://www.procyclingstats.com/rankings/me/pcs/individual'

response = requests.get(start_page, headers=user_agent)
page = response.text
soup = BeautifulSoup(page, 'lxml')

date_values = [item.text for item in soup.find_all('option', {'value' : re.compile('rankings.*\/[0-9]{4}-[0-9]{2}-[0-9]{2}')})]
date_values = [date for date in date_values if '-12-31' in date or '2018' in date]
page_values = [item['value'] for item in soup.find('select', {'name':'page', 'style':'padding: 1px; width: 204px;'}).find_all('option', {'value' : re.compile('^[0-9]{1,2}$')})]

In [93]:
print('annual ranking dates:',date_values)
print('\npage numbers for initial ranking date:',page_values)

annual ranking dates: ['2018-01-30', '2016-12-31', '2015-12-31', '2014-12-31', '2013-12-31', '2012-12-31', '2011-12-31', '2010-12-31', '2009-12-31', '2008-12-31', '2007-12-31', '2006-12-31', '2005-12-31']

page numbers for initial ranking date: ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27']


In [94]:
for date in date_values:
    save_url = start_page + '/' + date
    save_name = 'pcs_ranking_' + date + '_1.html'
    response = requests.get(save_url, headers=user_agent)
    page = response.text
    soup = BeautifulSoup(page, 'lxml')
    
    with open(os.path.join('rankings/',save_name), 'w') as file:
        file.write(str(soup))
    print(dt.datetime.now(),'Saved:',save_name,end='\r')
    
    random_wait = 2 + 1 * random.random()
    time.sleep(random_wait)
    
    page_values = [item['value'] for item in soup.find('select', {'name':'page', 'style':'padding: 1px; width: 204px;'}).find_all('option', {'value' : re.compile('^[0-9]{1,2}$')})]
    if len(page_values) > 1:
        for page in page_values[1:]:
            save_url = start_page + '/' + date + '&page=' + page
            save_name = 'pcs_ranking_' + date + '_' + page + '.html'
            response = requests.get(save_url, headers=user_agent)
            page = response.text
            soup = BeautifulSoup(page, 'lxml')

            with open(os.path.join('rankings/',save_name), 'w') as file:
                file.write(str(soup))
            print(dt.datetime.now(),'Saved:',save_name,end='\r')

            random_wait = 2 + 1 * random.random()
            time.sleep(random_wait)
    else:
        continue
print('Finished!')

Finished!9 17:12:01.849824 Saved: pcs_ranking_2005-12-31_12.html


### Parsing riders' pages from saved files

In [475]:
# for filename in 'rider/' directory...
fname = 'aaron-olson_2000.html'
year = fname.split('_')[1].split('.')[0]

with open(os.path.join('riders/',fname), 'r') as file:
        page = file.read()
        soup = BeautifulSoup(page, 'lxml')

**Race Resume "table" containing datapoints**

In [277]:
rows = soup.find('div', class_='rdrRes').find_all('div', class_='row')

**Race Names per Season**

In [278]:
race_links = [c.find_all(['a'])[0] for c in rows]
race_names = [re.sub(r'(</?a[^>]*>|</?b[^>]*>)','',str(record)).encode('latin-1').decode('utf-8').strip() for record in race_links]
print('length of race names column:',len(race_names),'\n')
pp.pprint(race_names)

length of race names column: 1 

['National TT Championships New Zealand (NC)']


**_NOTE: Looks like there are 9 more records here compared to date, result, and points columns. Those are the stage race title rows that sit right above the stage race rollup rows..._**

**Date per Race**

In [279]:
race_dates = [date.text.replace(u'\xa0', u'').strip() for date in soup.find_all('span', style='width: 70px; ')]
print('length of race dates column:',len(race_dates),'\n')
print(race_dates)

length of race dates column: 1 

['09.01']


**Race Result (Rank) per Race**

In [280]:
results = [result.text for result in soup.find_all('span', style='width: 50px; text-align: center; ')]
print('length of results column:',len(results),'\n')
print(results,'\n')
results = [int(item) for item in results if item != 'DNS' if item != 'DNF']
print('new length of results column (removed DNS/DNF):',len(results),'\n')
print(results,'\n')
result_counts = Counter(results)
print('results frequencies:',result_counts,'\n')

top_10s = sum([result_counts[key] for key in result_counts.keys() if key <= 10])
top_3s = sum([result_counts[key] for key in result_counts.keys() if key <= 3])
print('number of top 10 finishes:',top_10s)
print('number of top 3 finishes:',top_3s)

length of results column: 1 

['13'] 

new length of results column (removed DNS/DNF): 1 

[13] 

results frequencies: Counter({13: 1}) 

number of top 10 finishes: 0
number of top 3 finishes: 0


**_NOTE: Will need to handle DNS and DNF when aggregating results and points._**

**PCS Points**

In [325]:
pcs_points = [point.text.replace(u'\xa0', u'') for point in soup.find_all('span', style='width: 80px;   ')][1::3]
pcs_points = [0 if item == '' else float(item) for item in pcs_points]
total_pcs_points = sum(pcs_points)

print('length of PCS points column:',len(pcs_points))
print('total PCS points:',total_pcs_points,'\n')
print(pcs_points)

length of PCS points column: 93
total PCS points: 82.0 

[0, 0, 5.0, 0, 50.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.0, 0, 0, 5.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 0, 4.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7.0, 0]


**UCI Points**

In [324]:
uci_points = [point.text.replace(u'\xa0', u'') for point in soup.find_all('span', style='width: 80px;   ')][2::3]
uci_points = [0 if item == '' else float(item) for item in uci_points]
total_uci_points = sum(uci_points)

print('length of UCI points column:',len(uci_points),uci_points)
print('total UCI points:',total_uci_points,'\n')
print(uci_points)

length of UCI points column: 93 [0, 0, 0, 0, 6.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6.0, 0, 0, 5.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.0, 0, 6.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.4, 0]
total UCI points: 28.4 

[0, 0, 0, 0, 6.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6.0, 0, 0, 5.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4.0, 0, 6.0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1.4, 0]


**Number of Stage Races**

In [283]:
stage_race_dates = [daterange.text for daterange in soup.find_all('span', style='width: 190px; ')]
print('number of stage races:',len(stage_race_dates),'\n')
print(stage_race_dates)

number of stage races: 0 

[]


**Total Distance and Race Days**

In [172]:
season_distance = soup.find('div', style='border-top: 1px solid #ccc; margin-top: 1px; text-indent: 690px; padding-top: 2px; ').find('b').text
print('season distance (km):',season_distance)

season_race_days = soup.find('div', style='border-top: 1px solid #ccc; margin-top: 1px; text-indent: 690px; padding-top: 2px; ').contents[1].strip().split()[2]
print('season race days:',season_race_days)

season distance (km): 15390.6
season race days: 91


**_NOTE: Any date, rank, and points associated with distance = 0 on same row may be rollup or final values for stage races. May want to handle them by removing or combining/converting into a separate feature._**

**Sample of extracting all datapoints and loading into dictionary**

In [265]:
rider_seasons = {}

rider = str(soup.find('title').text.encode('latin-1').decode())
team = soup.find('span', class_='red').text.encode('latin-1').decode()
nation = soup.find_all('div')[16].find('a', class_='black').text
dob = soup.find_all('div')[16].find('span').contents[1:4]
dob_day = dob[0].strip()
dob_month = dob[2].split(' ')[1]
dob_year = dob[2].split(' ')[2]
dob_str = dob_year + ' ' + dob_month + ' ' + dob_day
height = soup.find(text='Weight:').findNext().contents[1].split()[0]
weight = str(soup.select('div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(4) > div:nth-of-type(2) > span > span > span:nth-of-type(2)')[0]).split('<span>')[1].split(' ')[1]
lt_pts_one_day = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[0].text)
lt_pts_gc = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[1].text)
lt_pts_tt = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[2].text)
lt_pts_sprint = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[3].text)
lt_total_points = lt_pts_one_day + lt_pts_gc + lt_pts_tt + lt_pts_sprint
pct_lt_oneday_pts = safe_div(lt_pts_one_day, lt_total_points)
pct_lt_gc_pts = safe_div(lt_pts_gc, lt_total_points)
pct_lt_tt_pts = safe_div(lt_pts_tt, lt_total_points)
pct_lt_sprint_pts = safe_div(lt_pts_sprint, lt_total_points)


seasons = len([year.text for year in soup.find('ul', class_='horiztree').contents])
season_distance = soup.find('div', style='border-top: 1px solid #ccc; margin-top: 1px; text-indent: 690px; padding-top: 2px; ').find('b').text
season_race_days = soup.find('div', style='border-top: 1px solid #ccc; margin-top: 1px; text-indent: 690px; padding-top: 2px; ').contents[1].strip().split()[2]
stage_race_dates = len([daterange.text for daterange in soup.find_all('span', style='width: 190px; ')])
uci_points = [point.text.replace(u'\xa0', u'') for point in soup.find_all('span', style='width: 80px;   ')][2::3]
uci_points = [0 if item == '' else int(item) for item in uci_points]
total_uci_points = sum(uci_points)
pcs_points = [point.text.replace(u'\xa0', u'') for point in soup.find_all('span', style='width: 80px;   ')][1::3]
pcs_points = [0 if item == '' else int(item) for item in pcs_points]
total_pcs_points = sum(pcs_points)

rider_seasons['rider'] = rider
rider_seasons['team'] = team
rider_seasons['nationality'] = nation
rider_seasons['date of birth'] = dob_str
rider_seasons['height'] = float(height)
rider_seasons['weight'] = float(weight)
rider_seasons['lifetime points one day races'] = lt_pts_one_day
rider_seasons['lifetime points general classification'] = lt_pts_gc
rider_seasons['lifetime points time trial'] = lt_pts_tt
rider_seasons['lifetime points sprint'] = lt_pts_sprint
rider_seasons['lifetime points total'] = lt_total_points
rider_seasons['seasons'] = seasons
rider_seasons['distance'] = float(season_distance)
rider_seasons['race days'] = int(season_race_days)
rider_seasons['stage races'] = len(stage_race_dates)
rider_seasons['uci points'] = total_uci_points
rider_seasons['pcs points'] = total_pcs_points

pp.pprint(rider_seasons)

{'date of birth': '1982 July 16',
 'distance': 15390.6,
 'height': 1.84,
 'lifetime points general classification': 1708,
 'lifetime points one day races': 4283,
 'lifetime points sprint': 7703,
 'lifetime points time trial': 760,
 'lifetime points total': 14454,
 'nationality': 'Germany',
 'pcs points': 1316,
 'race days': 91,
 'rider': 'André Greipel',
 'seasons': 16,
 'stage races': 9,
 'team': 'Lotto Soudal',
 'uci points': 1613,
 'weight': 75}


### For now let's only pull the total PCS points per season for each rider and load into dictionary
How well can we predict PCS points won in the most recent season using past seasons' points?  

**Assumptions:**
- PCS and UCI points are based on finishing results in each race
- PCS awards points deeper down the results list than UCI
- UCI awards more points to Grand Tours / top tier stage races

**Hypothesis:**
- A rider's performance in the next season is correlated with past seasons


In [469]:
def safe_div(n, d):
    return n / d if d else 0

In [501]:
all_riders_points = []
rider_seasons = {}

for file in sorted(glob.glob('riders/*.html')):
    season = file.split('.')[0].split('_')[1]
    
    # skip 2018 since it's incomplete
    if season != '2018':
        with open(file, 'r') as f:
            page = f.read()
            soup = BeautifulSoup(page, 'lxml')
            
            rider = str(soup.find('title').text.encode('latin-1').decode())
            seasons = [year.text for year in soup.find('ul', class_='horiztree').contents if year.text != '2018']
            relative_season = seasons.index(season)
            dob_year = soup.find('div', style='width: 230px; float: left; font: 12px/15px tahoma; ').find('span').contents[3].split()[1]
            age = int(season) - int(dob_year)
            nation = soup.find('a', class_='black').text
            try:
                height = float(soup.find(text='Weight:').findNext().contents[1].split()[0])
            except:
                height = float('nan')
            try:
                weight = float(str(soup.select('div:nth-of-type(2) > div:nth-of-type(3) > div:nth-of-type(4) > div:nth-of-type(2) > span > span > span:nth-of-type(2)')[0]).split('<span>')[1].split(' ')[1])
            except:
                weight = float('nan')
            
            pcs_points = [pcspoint.text.replace(u'\xa0', u'') for pcspoint in soup.find_all('span', style='width: 80px;   ')][1::3]
            pcs_points = [0 if item == '' else float(item) for item in pcs_points]
            total_pcs_points = sum(pcs_points)
            uci_points = [ucipoint.text.replace(u'\xa0', u'') for ucipoint in soup.find_all('span', style='width: 80px;   ')][2::3]
            uci_points = [0 if item == '' else float(item) for item in uci_points]
            total_uci_points = sum(uci_points)
            
            lt_pts_one_day = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[0].text)
            lt_pts_gc = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[1].text)
            lt_pts_tt = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[2].text)
            lt_pts_sprint = int(soup.find_all('div', style='display: inline-block; float: left; width: 23px; font: 9px/8px tahoma; color: #77B5C9; text-align: center;  ')[3].text)
            lt_total_points = lt_pts_one_day + lt_pts_gc + lt_pts_tt + lt_pts_sprint
            pct_lt_oneday_pts = safe_div(lt_pts_one_day, lt_total_points)
            pct_lt_gc_pts = safe_div(lt_pts_gc, lt_total_points)
            pct_lt_tt_pts = safe_div(lt_pts_tt, lt_total_points)
            pct_lt_sprint_pts = safe_div(lt_pts_sprint, lt_total_points)
            
            if 'rider' not in rider_seasons:
                rider_seasons['rider'] = rider
                rider_seasons['nationality'] = nation
                rider_seasons['height'] = height
                rider_seasons['weight'] = weight
                rider_seasons['seasons'] = len(seasons)
                rider_seasons['pct_oneday_pts'] = pct_lt_oneday_pts
                rider_seasons['pct_gc_pts'] = pct_lt_gc_pts
                rider_seasons['pct_tt_pts'] = pct_lt_tt_pts
                rider_seasons['pct_sprint_pts'] = pct_lt_sprint_pts
            rider_seasons['S' + str(relative_season) + ' pcs pts'] = total_pcs_points
            rider_seasons['S' + str(relative_season) + ' uci pts'] = total_uci_points
            rider_seasons['S' + str(relative_season) + ' age'] = age
            
            if season == seasons[0]:
                all_riders_points.append(rider_seasons.copy())
                rider_seasons.clear()
                print('added',rider,'season',season,' --- ',len(all_riders_points),'riders total',end='\r')

added Zico Waeytens season 2017  ---  1730 riders totaltotalaltotal total

**Pickle the list of dictionaries for modeling**

In [503]:
with open('riders_relative_seasons.pickle', 'wb') as handle:
    pickle.dump(all_riders_points, handle)