# Velogames Model

The purpose of this script is to learn some basic Python skills by training a predictive model from data collected on the internet.<br>
The predictive model will attempt to develop the optimal Velogames team for the 2023 Grand Tour season


In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import re
import numpy as np 
import time
import csv

In [2]:
#Some basic decorators to simplify the code
def make_soup(url, suppress=0):
    
    if not suppress: print("Collecting Data from {}".format(url)) 
    page = requests.get(url)
    if not suppress: print("Status Code: {}".format(page.status_code))  #Status 200 = good
    
    soup = BeautifulSoup(page.content,'html.parser')
    return soup

def list_to_csv(path,csv_name,list_name):
    with open(path+csv_name,"w", newline="") as f:
        writer = csv.writer(f)
        writer.writerows(list_name)
    return

### Collect training data from Velogames
This section creates a csv file with the riders, thei category, their cost and their Velogames score from past grand tours

In [3]:
#Function to collect table headers from Velogames Riders page
def collect_riders_headers(url):
    headers=[]
    soup = make_soup(url)
    #Collect table headers - these can be different between races and years
    tbl_headers = soup.select('div[class="main-container"] th b')
    headers=[header.get_text() for header in tbl_headers]
    headers.insert(0,'Rider_Link') #insert a rider link header - doesnt exist in the website table
    
    return headers


#Function scrape table from Velogames Riders page
def collect_riders_data(url):
    rider_data = []
    soup = make_soup(url)
    tbl_data = soup.select('div[class="main-container"] tbody tr')
    for tbl_row in tbl_data:
        rider_data.append([tbl_cell.get_text(",") for tbl_cell in tbl_row.find_all('td')])
        rider_data[-1][0]=tbl_row.find('a')['href'] #insert in the rider link

    return rider_data



def collect_points_headers(url):

    soup = make_soup(url,1)
    
    tbl_headers = soup.select('table thead')
    headers = tbl_headers[0].get_text(',',strip=True).split(',')
    
    return headers[1:] #skips index 0 => Stage #



def collect_points_data(url):
    
    score_data=[]
    
    soup = make_soup(url,1)
    tbl_body = soup.select('table tbody tr')
    
    for row in tbl_body:
        score_data.append(row.get_text(',',strip=True).replace('-','0').split(','))
        
    #points data is given by individual stage but only the total by category is required    
    score_array = np.array(score_data)[0:,1:].astype(int) #convert to an array of integers - exclude first column (stage #)
    score_list = list(np.sum(score_array,axis=0)) #create list of the sum of the points by points category

    return score_list




In [None]:
#Update the below tuples with the combination of races and years that you want to scrape velogames for
#The 'race' value should match the spelling that velogames uses in their URL
#Note that there is no tour de france for 2020
races = ()#('france','spain','italy')
years = ()#('2022','2021','2020')
i=0

for race in races:
    for year in years:
             
        print('Pulling {} tour for {}.'.format(race,year))
        vg_rider_data=[]        
        
        #------------------------------------------------
        #Collect high level rider data from Rankings page
        #------------------------------------------------
        search_string = "https://www.velogames.com/"+race.replace('france','velogame')+"/"+year+"/riders.php"
        
        #Test that page exists, move to next if not found
        test = requests.get(search_string)
        if test.status_code != 200:
            print('Page not found error {}'.format(test.status_code))
            continue

        vg_rider_data=collect_riders_data(search_string)
        vg_rider_data.insert(0,collect_riders_headers(search_string))

        #------------------------------------------------
        #Collect and summarize detailed score data
        #------------------------------------------------

        search_string = "https://www.velogames.com/"+race.replace('france','velogame')+"/"+year+"/"+vg_rider_data[1][0]
        vg_rider_data[0].extend(collect_points_headers(search_string))

        for rider in vg_rider_data[1:]:
            i += 1
            if i % 5 == 0: print(i)
            search_string = "https://www.velogames.com/"+race.replace('france','velogame')+"/"+year+"/"+rider[0]
            rider.extend(collect_points_data(search_string))
            time.sleep(1)

        list_to_csv('/home/quixote/Documents/project_data/',race+'_'+year+'_velogames_rider_data.csv',vg_rider_data)

In [32]:
#Check the data
df = pd.read_csv('/home/quixote/Documents/project_data/italy_2022_velogames_rider_data.csv')
print(df)

                        Rider_Link            Rider  \
0  riderprofile.php?rider=20220570  Richard Carapaz   
1  riderprofile.php?rider=20220952      Simon Yates   
2  riderprofile.php?rider=20221068     João Almeida   
3  riderprofile.php?rider=20220173      Mikel Landa   

                        Team        Class  Cost Selected  Points   Stg   GC  \
0           INEOS Grenadiers  All Rounder    24    56.3%    2244  1223  922   
1  Team BikeExchange - Jayco      Climber    22    27.8%     666   480  144   
2          UAE Team Emirates  All Rounder    20    41.2%     985   660  297   
3       Bahrain - Victorious      Climber    18    11.4%    1432   613  679   

   PC  KOM  Spr  Sum  Bky  Ass   Tot  
0   0   17   30   10    0   42  2244  
1   1    4   15    0   20    2   666  
2   0    0   10    2    0   16   985  
3   0    0    0    6    0  134  1432  


In [72]:
del vg_rider_data

### Get the PCS data

This section collects results by rider and some basic information for each stage (flat, mountains, etc..) 

In [3]:
rider_list=[['PCS_key','Lname','Fname']]
n = 3 #n page requests will be made, n*100 riders will be collected

for i in range(n):
    
    #Query the PCS Rankings site to get a list of riders
    seach_string = "https://www.procyclingstats.com/rankings.php?date=2021-12-31&nation=&age=&zage=&page=smallerorequal&team=&offset="+str(100*i)+"&teamlevel=&filter=Filter"
    soup=make_soup(seach_string,1)
    
    #Use BS to pull the PCS name key, Fname and Lname
    riders = soup.select('tr a[href^="rider/"]')
    
    #Create a list of the PCS name key as well as the first and last name of the rider
    for rider in riders:
        rider_list.append([rider['href'].replace("rider/",""),rider.find('span').get_text(),rider.find('span').next_sibling.strip()])

In [4]:
n=5
print('Information for {} riders has been collected.\n'.format(len(rider_list)))

print('Top {} results are shown below:'.format(n))
rider_list[:n]

Information for 301 riders has been collected.

Top 5 results are shown below:


[['PCS_key', 'Lname', 'Fname'],
 ['tadej-pogacar', 'Pogačar', 'Tadej'],
 ['wout-van-aert', 'van Aert', 'Wout'],
 ['primoz-roglic', 'Roglič', 'Primož'],
 ['julian-alaphilippe', 'Alaphilippe', 'Julian']]

In [None]:
#https://www.procyclingstats.com/rider.php?xseason=2019&zxseason=&pxseason=equal&sort=date&race=&km1=&zkm1=&pkm1=equal&limit=200&offset=0&topx=&ztopx=&ptopx=smallerorequal&type=&znation=&continent=&pnts=&zpnts=&ppnts=equal&level=wt&rnk=&zrnk=&prnk=equal&exclude_tt=0&exclude_tt=1&racedate=&zracedate=&pracedate=equal&name=&pname=contains&category=&profile_score=&zprofile_score=&pprofile_score=largerorequal&filter=Filter&id=simone-consonni&p=results&s=all
#https://www.procyclingstats.com/calendar/stages-database

In [9]:
def collect_pcs_rider_results(url, rider):
    
    soup = make_soup(url,1)
    rider_data=[]
    
    for r in soup.select('tr')[1:]:
        rider_data.append(str(r.get_text(',')+","+rider).split(','))   
        
    return rider_data

Use the data collected above for the top n riders and collect the race results over the last several seasons.

In [29]:

pcs_rider_results = []
i=0
seasons = ('2019','2020','2021','2022')


for rider in rider_list[1:]:
    for season in seasons: 
        rider_key = rider[0]
        search_string='https://www.procyclingstats.com/rider.php?xseason='+season+'&zxseason=&pxseason=equal&sort=date&race=&km1=&zkm1=&pkm1=equal&limit=200&offset=0&topx=&ztopx=&ptopx=smallerorequal&type=&znation=&continent=&pnts=&zpnts=&ppnts=equal&level=wt&rnk=&zrnk=&prnk=equal&exclude_tt=0&exclude_tt=1&racedate=&zracedate=&pracedate=equal&name=&pname=contains&category=&profile_score=&zprofile_score=&pprofile_score=largerorequal&filter=Filter&id='+rider_key+'&p=results&s=all'
        pcs_rider_results.extend(collect_pcs_rider_results(search_string,rider_key))
        
        i += 1
        if i % 5 == 0: print(i, rider_key, season)
        time.sleep(2)
            
headers = soup.select('thead')[0].get_text(',').split(',')
pcs_rider_results.insert(0,headers)

list_to_csv('/home/quixote/Documents/project_data/','pcs_rider_results.csv',pcs_rider_results)

5 wout-van-aert 2019
10 primoz-roglic 2020
15 julian-alaphilippe 2021
20 joao-almeida 2022
25 jasper-philipsen 2019
30 egan-bernal 2020
35 adam-yates 2021
40 mathieu-van-der-poel 2022
45 alejandro-valverde 2019
50 matej-mohoric 2020
55 jonas-vingegaard-rasmussen 2021
60 richard-carapaz 2022
65 remco-evenepoel 2019
70 bauke-mollema 2020
75 tim-merlier 2021
80 kasper-asgreen 2022
85 mark-cavendish 2019
90 jasper-stuyven 2020
95 stefan-kung 2021
100 giacomo-nizzolo 2022
105 christophe-laporte 2019
110 danny-van-poppel 2020
115 matteo-trentin 2021
120 michael-matthews 2022
125 simon-yates 2019
130 damiano-caruso 2020
135 diego-ulissi 2021
140 peter-sagan 2022
145 ben-o-connor 2019
150 mikkel-honore 2020
155 maximilian-schachmann 2021
160 dylan-van-baarle 2022
165 aleksey-lutsenko 2019
170 romain-bardet 2020
175 wilco-kelderman 2021
180 florian-senechal 2022
185 richie-porte 2019
190 ion-izagirre 2020
195 pascal-ackermann 2021
200 pello-bilbao 2022
205 jack-haig 2019
210 nairo-quintana 2020

Now lets collect some information on the races themselves

In [142]:
pcs_stage_data = []
i=0
race_classes = ('2.UWT','1.UWT')
seasons = ('2019','2020','2021','2022')

race_class = race_classes[0]
season = seasons[3]

search_string = 'https://www.procyclingstats.com/races.php?season='+season+'&seasonb=2022&name=&pname=contains&nation=&class='+race_class+'&parcours=&kms=&zkms=&pkms=equal&limit=200&rt=&filter=Filter&s=stages-database'

soup = make_soup(search_string)

for r in soup.select('tr')[1:]:
    break
    

Collecting Data from https://www.procyclingstats.com/races.php?season=2022&seasonb=2022&name=&pname=contains&nation=&class=2.UWT&parcours=&kms=&zkms=&pkms=equal&limit=200&rt=&filter=Filter&s=stages-database
Status Code: 200


In [144]:
for r in soup.select('tr')[1:]:
    d1 = r.get_text(',').split(',') 
    d2 = r.select('span[class*="icon profile"]')
    d1.append(d2[0]['class'][2])
    print(d1)
    


['1', '2022-10-18', ' ', 'Gree-Tour of Guangxi | Stage 6', '2.UWT', '0', 'p0']
['2', '2022-10-17', ' ', 'Gree-Tour of Guangxi | Stage 5', '2.UWT', '0', 'p0']
['3', '2022-10-16', ' ', 'Gree-Tour of Guangxi | Stage 4', '2.UWT', '0', 'p0']
['4', '2022-10-15', ' ', 'Gree-Tour of Guangxi | Stage 3', '2.UWT', '0', 'p0']
['5', '2022-10-14', ' ', 'Gree-Tour of Guangxi | Stage 2', '2.UWT', '0', 'p0']
['6', '2022-10-13', ' ', 'Gree-Tour of Guangxi | Stage 1', '2.UWT', '0', 'p0']
['7', '2022-09-11', ' ', 'La Vuelta ciclista a España | Stage 21', '2.UWT', '96.7', 'p1']
['8', '2022-09-10', ' ', 'La Vuelta ciclista a España | Stage 20', '2.UWT', '181', 'p4']
['9', '2022-09-09', ' ', 'La Vuelta ciclista a España | Stage 19', '2.UWT', '138.3', 'p4']
['10', '2022-09-08', ' ', 'La Vuelta ciclista a España | Stage 18', '2.UWT', '192', 'p5']
['11', '2022-09-07', ' ', 'La Vuelta ciclista a España | Stage 17', '2.UWT', '162.3', 'p1']
['12', '2022-09-06', ' ', 'La Vuelta ciclista a España | Stage 16', '2.UWT