## Ratings visualization notebook

Notebook containing implementations for different Elo rating visualizations. I figured
this would be a little more intuitive than yet another script.

In [179]:
from bs4 import BeautifulSoup
from datetime import date, datetime, timedelta
import matplotlib.pyplot as plt
import os
import pandas as pd
import plotly.graph_objects as go
import requests

In [221]:
# ===== notebook constants ===== #
RACE_TYPE = 'GC'
GENDER = 'men'
RATINGS_DATA_PATH = f'../data/system-data/{RACE_TYPE.lower()}_{GENDER}.csv'
RACE_RESULTS_PATH = f'../data/{GENDER}_velodata.csv'

# ===== individual race params ===== #
RACE = 'tour-de-suisse'
YEAR = 2022

In [222]:
# ===== brief constant compatibility check ===== #
if RACE_TYPE not in ['GC', 'ITT', 'SPRINTS']:
    raise ValueError('Given race type must be GC, ITT, or SPRINTS.')

if GENDER not in ['men', 'women']:
    raise ValueError('Given gender must be men or women.')

if not os.path.exists(RATINGS_DATA_PATH):
    raise ValueError(f'Something went wrong. Given ratings path, {RATINGS_DATA_PATH}, does not exist.')

if not os.path.exists(RACE_RESULTS_PATH):
    raise ValueError(f'Something went wrong. Given race results path, {RACE_RESULTS_PATH}, does not exist.')

In [223]:
ratings = pd.read_csv(RATINGS_DATA_PATH)
results = pd.read_csv(RACE_RESULTS_PATH)

display(ratings.head(5))
results.head(5)

Unnamed: 0,year,month,day,GREIPEL André,SÁNCHEZ Luis León,ROELANDTS Jürgen,ROJAS José Joaquín,SABATINI Fabio,IZAGIRRE Gorka,SAGAN Peter,...,SANCHEZ Ruben,CARPENE Gianmarco,SENS Connor,ACCO Alessio,ZECCHIN Michael,KOPECKÝ Matyáš,BARONI Alessandro,DELLA LUNGA Francesco,SKERL Daniel,MORO Stefano
0,2010,1,24,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
1,2010,1,24,1509.497719,1504.070962,1500.667603,1500.154592,1499.944018,1499.935795,1499.954348,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2,2010,2,12,1509.497719,1504.070962,1500.667603,1500.154592,1499.944018,1499.935795,1499.954348,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
3,2010,2,12,1509.497719,1504.070962,1500.667603,1500.154592,1499.944018,1499.935795,1499.954348,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
4,2010,2,19,1509.497719,1504.070962,1500.667603,1500.154592,1499.944018,1499.935795,1499.954348,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


Unnamed: 0,name,stage,points_scale,parcours_type,year,month,day,type,length,profile_score,avg_speed,vertical_meters,won_how,place,rider,team,age,time
0,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,1.0,ALAPHILIPPE Julian,Deceuninck - Quick Step,29.0,0.0
1,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,2.0,MATTHEWS Michael,Team BikeExchange,30.0,8.0
2,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,3.0,ROGLIČ Primož,Team Jumbo-Visma,31.0,8.0
3,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,4.0,HAIG Jack,Bahrain - Victorious,27.0,8.0
4,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,5.0,KELDERMAN Wilco,BORA - hansgrohe,30.0,8.0


In [224]:
def gen_table_figure(df, title, width = 1300, height = 650):
    
    fig = go.Figure(
        
        # table data
        data = [
            go.Table(
                header = dict(
                    values = list(df.columns),
                    font_color = 'black',
                    fill_color = '#FF8500',
                    align = 'center'
                ),
                cells = dict(
                    values = [df.Rank, df.Rider, df.Team, df.Rating],
                    font_color = 'black',
                    fill_color = 'white',
                    align = 'center'
                )
            )
        ],
        
        # table layout settings
        layout = go.Layout(
            title = dict(
                text = title,
                x = 0.5,
                font_color = 'black'
            ),
            
            width = width,
            height = height,
        )
    )
    
    return fig

def gen_rankings_viz(ratings, results, rating_limit = 20, title = '', fname = None):
    """
    Given both ranking and race results data, output a table displaying the rankings
    in order for this race type and gender. If a filename is given with the fname
    variable, then the image will be saved.
    """
    
    # ensure df is sorted chronologically
    sorted_ratings = ratings.sort_values(by = ['year', 'month', 'day'])
    
    # get most recent ranking
    most_recent_ratings = sorted_ratings.iloc[-1, :]
    
    # get date of most recent ranking and then drop date cols
    ranking_date = date(
        year = int(most_recent_ratings['year']),
        month = int(most_recent_ratings['month']),
        day = int(most_recent_ratings['day'])
    )
    most_recent_ratings = most_recent_ratings.drop(index = ['year', 'month', 'day'])
    
    # get rider names and ratings
    riders = sorted(list(zip(
        list(most_recent_ratings.index),
        [round(r, 2) for r in most_recent_ratings]
    )), key = lambda t: t[1], reverse = True)
    
    # get only rider individual race days
    indiv_race_days = results[results['type'] != 'gc']
    
    # get rider race days and teams
    rider_names = []
    rider_ratings = []
    race_days = []
    teams = []
    num = 0
    for rider in riders:

        rider_res = indiv_race_days[indiv_race_days['rider'] == rider[0]].sort_values(by = ['year', 'month', 'day'])
        rider_year_df = rider_res[rider_res['year'] == ranking_date.year]
        
        if len(rider_year_df.index) > 0:
            rider_names.append(rider[0])
            rider_ratings.append(rider[1])
            teams.append(rider_res['team'].iloc[-1])
            race_days.append(len(rider_year_df.index))
            num += 1
        
        if num >= rating_limit:
            break
    
    table_df = pd.DataFrame(data = {
        'Rank': [i + 1 for i in range(len(rider_names))],
        'Rider': rider_names,
        'Team': teams,
        'Rating': rider_ratings,
    })
    
    fig = gen_table_figure(table_df, title)
    
    if fname is not None:
        fig.write_image(f'figures/{fname}')
    
    return fig

In [225]:
# ===== generate and save most recent rankings for RACE_TYPE ===== #
curr_date = datetime.today().strftime('%Y%m%d')
fname = f'{RACE_TYPE.lower()}-{GENDER}-{curr_date}.png'
gen_rankings_viz(ratings, results, title = f'{RACE_TYPE} Elo Ranking', fname = fname)

In [226]:
def get_startlist(name, year):
    """
    Given the name of a race and the year, return a list of riders on the startlist
    for the race.
    
    NOTE: name param must be given in the format which ProCyclingStats uses in its
    links. For example, if you'd like the startlist for the Tour de Suisse, you would
    given 'tour-de-suisse' as the name param.
    """
    
    # get page html
    link = f'https://www.procyclingstats.com/race/{name}/{year}/gc/startlist'
    page = requests.get(link)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # get the riders from each team starting the race
    teams = {}
    lis = soup.find_all('li', class_ = 'team')
    for li in lis:
        team = li.find_all('a')[0].text
        uls = li.find_all('ul')[0]
        roster = [rider.find_all('a')[0].text for rider in uls.find_all('li')]
        teams[team] = roster
    
    # return the startlist as a list of rider names
    startlist = [rider for team in teams for rider in teams[team]]
    return startlist

In [227]:
def gen_lolli_plot(startlist_ratings, startlist, race_name, rating_type, year, width = 1000, height = 750):
    """
    Given the starlist for a race, output a lollipop plot ranking the riders
    by their Elo rating.
    """
    
    # get the rating for each rider in the order that they appear in startlist
    ratings = [startlist_ratings[rider].iloc[-1] for rider in startlist]
    
    fig = go.Figure()
    
    # add a line for each rider in the given startlist
    for i in range(0, len(startlist)):
        fig.add_shape(
            type = 'line',
            
            # x and y coordinates determine the length of the line
            x0 = min(ratings) - 50,
            y0 = i,
            x1 = list(reversed(ratings))[i],
            y1 = i,

            line = dict(color = 'darkblue', width = 3)
        )
    
    # rider ratings are effectively a scatterplot, with rider name as the y axis
    ratings_rev = [round(r, 2) for r in list(reversed(ratings))]
    riders_rev = list(reversed(startlist))
    fig.add_trace(
        go.Scatter(
            x = ratings_rev, 
            y = riders_rev,
            text = ratings_rev,
            mode = 'markers+text',
            marker_color = 'darkblue',
            marker_size = 14,
            textposition = 'top center'
    ))
    
    # ensure the race name to be ranked is formatted properly
    race_name = race_name.replace('-', ' ').title()
    
    # update the layout of the plot to add finishing touches
    fig.update_layout(
        title = dict(
            text = f'{race_name} {year} {rating_type} Contenders',
            x = 0.5,
            y = 0.9,
            font_color = 'black'
        ),
        
        width = width,
        height = height,
        
        paper_bgcolor = 'white',
        plot_bgcolor = 'white',
        xaxis = dict(showgrid = False),
        yaxis = dict(showgrid = False)
    )

    return fig

def plot_startlist_timeseries(ratings, given_startlist, race_name, 
            year, rating_type, fname = None, riders_to_plot = 20):
    """
    Driver method for generating the lolliplop plot from the given startlist of a race. Can be applied
    to the Elo ratings of all riders (not just the startlist of a given race) if given_startlist is None.
    
    As with the table above, the figure can be saved if a filename is given using the fname param.
    """
    
    if given_startlist is None:
        given_startlist = [col for col in ratings.columns if col not in ['year', 'month', 'day']]
    
    # adjust the given Elo ratings to include only the ratings of the top riders in the startlist
    startlist = [rider for rider in given_startlist if rider in ratings.columns]
    startlist.sort(key = lambda rider: ratings.iloc[-1, :].loc[rider], reverse = True)
    startlist = startlist[0: riders_to_plot]
    startlist_ratings = ratings[['year', 'month', 'day'] + startlist]
    
    # generate lollipop figure
    fig = gen_lolli_plot(startlist_ratings, startlist, race_name, rating_type, year)
    
    if fname is not None:
        fig.write_image(f'figures/{fname}')
    
    return fig

In [228]:
# get a startlist to plot
startlist = [rider for rider in get_startlist(RACE, YEAR) if rider in ratings.columns]

# plot the lollipop
plot_startlist_timeseries(
    ratings, startlist, RACE.replace('-', '').title(), YEAR, RACE_TYPE,
    riders_to_plot = 15, fname = f'{race}-{year}-{RACE_TYPE.lower()}-{GENDER}-lollipop.png'
)