## Ratings visualization notebook

Notebook containing implementations for different Elo rating visualizations. I figured
this would be a little more intuitive than yet another script.

In [56]:
from bs4 import BeautifulSoup
from datetime import date, datetime, timedelta
import matplotlib.pyplot as plt
import os
import pandas as pd
import plotly.graph_objects as go
import requests
import sys

sys.path.append('/Users/samuelhmorton/indiv_projects/work/velo-research')
from ratings import utils

In [57]:
# ===== notebook constants ===== #
RACE_TYPE = 'GC'
GENDER = 'men'
RATINGS_DATA_PATH = f'../data/system-data/{RACE_TYPE.lower()}_{GENDER}.csv'
RACE_RESULTS_PATH = f'../data/{GENDER}_velodata.csv'
LOLLI_RIDERS_TO_PLOT = 15

# ===== individual race params ===== #
RACE = 'vuelta-a-espana'
YEAR = 2022

In [58]:
# ===== brief constant compatibility check ===== #
if RACE_TYPE not in ['GC', 'ITT', 'Sprints']:
    raise ValueError('Given race type must be GC, ITT, or Sprints.')

if GENDER not in ['men', 'women']:
    raise ValueError('Given gender must be men or women.')

if not os.path.exists(RATINGS_DATA_PATH):
    raise ValueError(f'Something went wrong. Given ratings path, {RATINGS_DATA_PATH}, does not exist.')

if not os.path.exists(RACE_RESULTS_PATH):
    raise ValueError(f'Something went wrong. Given race results path, {RACE_RESULTS_PATH}, does not exist.')

In [59]:
ratings = pd.read_csv(RATINGS_DATA_PATH)
results = pd.read_csv(RACE_RESULTS_PATH)

display(ratings.head(5))
results.head(5)

Unnamed: 0,year,month,day,NOCENTINI Rinaldo,REBELLIN Davide,RUTKIEWICZ Marek,PLAZA Rubén,HAYMAN Mathew,VALVERDE Alejandro,LAVERDE Luis Felipe,...,JEROMEL Vid,SANCHEZ Pablo,KRALJ Fabijan,YOUNG Xeno,ŠARC Leon,WATTS Kiaan,LOHINSKÝ Filip,JOALLAND Yaël,MURIAS Jakub,TRONCHON Bastien
0,2000,3,11,1497.894057,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
1,2000,3,15,1497.894057,1507.483303,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
2,2000,4,7,1491.47757,1531.808422,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
3,2000,4,26,1491.47757,1531.808422,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
4,2000,5,7,1491.47757,1531.808422,1500.0,1500.0,1500.0,1500.0,1500.0,...,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0


Unnamed: 0,name,stage,points_scale,parcours_type,year,month,day,type,length,profile_score,avg_speed,vertical_meters,won_how,place,rider,team,age,time
0,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,1.0,ALAPHILIPPE Julian,Deceuninck - Quick Step,29.0,0.0
1,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,2.0,MATTHEWS Michael,Team BikeExchange,30.0,8.0
2,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,3.0,ROGLIČ Primož,Team Jumbo-Visma,31.0,8.0
3,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,4.0,HAIG Jack,Bahrain - Victorious,27.0,8.0
4,tour-de-france,stage-1,GT.A.Stage,p3,2021.0,6.0,26.0,standard,197.8,109.0,42.525 km/h,2843.0,2.2 km solo,5.0,KELDERMAN Wilco,BORA - hansgrohe,30.0,8.0


In [60]:
STARTLIST = [rider for rider in utils.get_startlist(RACE, YEAR) if rider in ratings.columns]

In [61]:
def gen_table_figure(df, title, width = 1300, height = 650):
    
#     n = len(df.index)
#     font_color = []
#     for col in ['Rank', 'Rider', 'Delta', 'Team', 'Rating']:
#         if col == 'Delta':
            
#             delta_font = []
#             for val in df.Delta:
#                 if val == '':
#                     delta_font.append('black')
#                 elif val > 0:
#                     delta_font.append('green')
#                 else:
#                     delta_font.append('red')
#             font_color.append(delta_font)
        
#         else:
#             font_color.append(['black'] * n)
    
    fig = go.Figure(
        
        # table data
        data = [
            go.Table(
                header = dict(
                    values = list(df.columns),
                    font_color = 'black',
                    fill_color = '#FF8500',
                    align = 'center'
                ),
                cells = dict(
                    values = [df.Rank, df.Rider, df.Team, df.Rating],
                    #font_color = font_color,
                    fill_color = 'white',
                    align = 'center'
                )
            )
        ],
        
        # table layout settings
        layout = go.Layout(
            title = dict(
                text = title,
                x = 0.5,
                font_color = 'black'
            ),
            
            width = width,
            height = height,
        )
    )
    
    return fig

def gen_rankings_viz(ratings, results, rating_limit = 20, 
                     period_date = date(year = 2022, month = 6, day = 12), title = '', fname = None):
    """
    Given both ranking and race results data, output a table displaying the rankings
    in order for this race type and gender. If a filename is given with the fname
    variable, then the image will be saved.
    """
    
    # ensure df is sorted chronologically
    sorted_ratings = ratings#ratings.sort_values(by = ['year', 'month', 'day'])
    
    # get most recent ranking
    most_recent_ratings = sorted_ratings.iloc[-1, :]
    
    # get date of most recent ranking and then drop date cols
    ranking_date = date(
        year = int(most_recent_ratings['year']),
        month = int(most_recent_ratings['month']),
        day = int(most_recent_ratings['day'])
    )
    most_recent_ratings = most_recent_ratings.drop(index = ['year', 'month', 'day'])
    
    delta_idx = len(ratings.index) - 1
    deltas = {}
    for i in range(len(ratings.index) - 1, -1, -1):
        idx_date = date(
            year = int(ratings['year'].iloc[i]),
            month = int(ratings['month'].iloc[i]),
            day = int(ratings['day'].iloc[i])
        )
        if idx_date >= period_date:
            delta_idx = i
        else:
            break
    
    deltas = [
            round(most_recent_ratings[col] - ratings[col].iloc[delta_idx], 2)
            for col in most_recent_ratings.index
    ]
    for col in most_recent_ratings.index:
        rating_diff = most_recent_ratings[col] - ratings[col].iloc[delta_idx]
        if col == 'GROENEWEGEN Dylan':
            print(most_recent_ratings[col], ratings[col].iloc[delta_idx])
    #deltas = [d if d != 0 else '' for d in deltas]
    
    # get rider names and ratings
    riders = sorted(list(zip(
        list(most_recent_ratings.index),
        [round(r, 2) for r in most_recent_ratings],
        deltas
    )), key = lambda t: t[1], reverse = True)
    
    # get only rider individual race days
    indiv_race_days = results[results['type'] != 'gc']
    
    # get rider race days and teams
    rider_names = []
    rider_ratings = []
    rider_deltas = []
    race_days = []
    teams = []
    num = 0
    for rider in riders:

        rider_res = indiv_race_days[indiv_race_days['rider'] == rider[0]].sort_values(by = ['year', 'month', 'day'])
        rider_year_df = rider_res[rider_res['year'] == ranking_date.year]
        
        if len(rider_year_df.index) > 0:
            rider_names.append(rider[0])
            rider_ratings.append(rider[1])
            rider_deltas.append(rider[2])
            teams.append(rider_res['team'].iloc[-1])
            race_days.append(len(rider_year_df.index))
            num += 1
        
        if num >= rating_limit:
            break
    
    table_df = pd.DataFrame(data = {
        'Rank': [i + 1 for i in range(len(rider_names))],
        'Rider': rider_names,
        #'Delta': rider_deltas,
        'Team': teams,
        'Rating': rider_ratings,
    })
    
    fig = gen_table_figure(table_df, title)
    
    if fname is not None:
        fig.write_image(f'figures/{fname}')
    
    return fig

In [62]:
# ===== generate and save most recent rankings for RACE_TYPE ===== #
curr_date = datetime.today().strftime('%Y%m%d')
fname = f'{RACE_TYPE.lower()}-{GENDER}-{curr_date}.png'
gen_rankings_viz(ratings, results, title = f'{RACE_TYPE} Elo Ranking', fname = fname)

1458.0007030223346 1470.8120794127842


In [63]:
def gen_lolli_plot(startlist_ratings, startlist, race_name, rating_type, year, width = 1000, height = 750):
    """
    Given the starlist for a race, output a lollipop plot ranking the riders
    by their Elo rating.
    """
    
    # get the rating for each rider in the order that they appear in startlist
    ratings = [startlist_ratings[rider].iloc[-1] for rider in startlist]
    
    fig = go.Figure()
    
    # add a line for each rider in the given startlist
    for i in range(0, len(startlist)):
        fig.add_shape(
            type = 'line',
            
            # x and y coordinates determine the length of the line
            x0 = min(ratings) - 50,
            y0 = i,
            x1 = list(reversed(ratings))[i],
            y1 = i,

            line = dict(color = 'darkblue', width = 3)
        )
    
    # rider ratings are effectively a scatterplot, with rider name as the y axis
    ratings_rev = [round(r, 2) for r in list(reversed(ratings))]
    riders_rev = list(reversed(startlist))
    fig.add_trace(
        go.Scatter(
            x = ratings_rev, 
            y = riders_rev,
            text = ratings_rev,
            mode = 'markers+text',
            marker_color = 'darkblue',
            marker_size = 14,
            textposition = 'top center'
    ))
    
    # ensure the race name to be ranked is formatted properly
    race_name = race_name.replace('-', ' ').title()
    
    # update the layout of the plot to add finishing touches
    fig.update_layout(
        title = dict(
            text = f'{race_name} {year} {rating_type} Contenders',
            x = 0.5,
            y = 0.9,
            font_color = 'black'
        ),
        
        width = width,
        height = height,
        
        paper_bgcolor = 'white',
        plot_bgcolor = 'white',
        xaxis = dict(showgrid = False),
        yaxis = dict(showgrid = False)
    )

    return fig

def plot_startlist_timeseries(ratings, given_startlist, race_name, 
            year, rating_type, fname = None, riders_to_plot = 20):
    """
    Driver method for generating the lolliplop plot from the given startlist of a race. Can be applied
    to the Elo ratings of all riders (not just the startlist of a given race) if given_startlist is None.
    
    As with the table above, the figure can be saved if a filename is given using the fname param.
    """
    
    if given_startlist is None:
        given_startlist = [col for col in ratings.columns if col not in ['year', 'month', 'day']]
    
    # adjust the given Elo ratings to include only the ratings of the top riders in the startlist
    startlist = [rider for rider in given_startlist if rider in ratings.columns]
    startlist.sort(key = lambda rider: ratings.iloc[-1, :].loc[rider], reverse = True)
    startlist = startlist[0: riders_to_plot]
    startlist_ratings = ratings[['year', 'month', 'day'] + startlist]
    
    # generate lollipop figure
    fig = gen_lolli_plot(startlist_ratings, startlist, race_name, rating_type, year)
    
    if fname is not None:
        fig.write_image(f'figures/lollis/{fname}')
    
    return fig

In [64]:
# plot the lollipop
plot_startlist_timeseries(
    ratings, STARTLIST, RACE.replace('-', ' ').title(), YEAR, RACE_TYPE,
    riders_to_plot = LOLLI_RIDERS_TO_PLOT, fname = f'{RACE}-{YEAR}-{RACE_TYPE.lower()}-{GENDER}-lollipop.png'
)

In [65]:
def heatmap(ratings, startlist, race_name, year, rating_type, to_include = 15, q_base = 10, q_div = 400, fname = None):    
    startlist_ratings = {rider: ratings[rider].iloc[-1] for rider in startlist if rider in ratings.columns}
    startlist_ratings = list(sorted(startlist_ratings.items(), key = lambda tup: tup[1], reverse = True))[0: to_include]
    
    to_include = to_include if to_include <= len(startlist) else len(startlist)
    
    Z = [
        [
            round(
                utils.get_elo_probabilities(
                    startlist_ratings[i][1],
                    startlist_ratings[j][1],
                    q_base, q_div
                )[0],
                3
            )
            if startlist_ratings[i] != startlist_ratings[j] else None 
            for j in range(to_include)
        ]
        for i in range(to_include - 1, -1, -1)
    ]
    

    fig = go.Figure(
        data = go.Heatmap(
            z = Z,
            x = [t[0] for t in startlist_ratings][0: to_include],
            y = list(reversed([t[0] for t in startlist_ratings][0: to_include])),
            text = Z,
            texttemplate = "%{text}",
            textfont = {"size": 10},
            colorscale = 'blues'
    ))
    fig.update_layout(
        title = dict(
                text = f'{race_name} {year} {rating_type} Contenders',
                x = 0.5,
                y = 0.9,
                font_color = 'black'
            ),
            xaxis = dict(side = 'bottom'),
            plot_bgcolor = 'white',
            width = 1000,
            height = 600
    )
#     ), layout = go.Layout(
#             title = dict(
#                 text = f'{race_name} {year} {rating_type} Contenders',
#                 x = 0.5,
#                 y = 0.9,
#                 font_color = 'black'
#             ),
#             xaxis = dict(side = 'bottom'), plot_bgcolor = 'white'),
#             width = 1000
#     )
    
    if fname is not None:
        fig.write_image(f'figures/heatmaps/{fname}')

    return fig

In [66]:
heatmap(ratings, STARTLIST, RACE.replace('-', ' ').title(), YEAR, RACE_TYPE,
        to_include = 10, q_base = 10, q_div = 150,
        fname = f'{RACE}-{YEAR}-{RACE_TYPE.lower()}-{GENDER}-heatmap.png')