In [182]:
import os
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from pyitlib import discrete_random_variable as drv
from scipy.stats import spearmanr
from sklearn.feature_selection import mutual_info_regression
from tqdm import tqdm

In [183]:
# Load the raw results table.
df = pd.read_csv('../data/women_velodata.csv')

# A day of 0 cannot be passed to `date`, so map it to the first of the month.
df['day'] = df['day'].replace([0], [1])

# Combine the separate year / month / day columns into one `date` column at position 4.
df.insert(
    4,
    'date',
    [
        date(year = int(y), month = int(m), day = int(d))
        for y, m, d in zip(df['year'], df['month'], df['day'])
    ],
)

# Relabel every 'prologue' value as 'itt' (the replacement applies to all columns).
df = df.replace(['prologue'], ['itt'])

print(f'Total samples: {len(df.index)}')
df.head(5)

Total samples: 100829


Unnamed: 0,name,stage,points_scale,parcours_type,date,year,month,day,type,length,profile_score,avg_speed,vertical_meters,won_how,place,rider,team,age,time
0,giro-d-italia-femminile,stage-1,F-2.1.Stage,p0,2010-07-02,2010.0,7.0,2.0,standard,59.0,,44.678 km/h,,? - let us know!,1.0,TEUTENBERG Ina-Yoko,Team Columbia Women,35.0,0.0
1,giro-d-italia-femminile,stage-1,F-2.1.Stage,p0,2010-07-02,2010.0,7.0,2.0,standard,59.0,,44.678 km/h,,? - let us know!,2.0,BRONZINI Giorgia,Safi - Pasta Zara,26.0,0.0
2,giro-d-italia-femminile,stage-1,F-2.1.Stage,p0,2010-07-02,2010.0,7.0,2.0,standard,59.0,,44.678 km/h,,? - let us know!,3.0,WILD Kirsten,Cervelo Test Team,27.0,0.0
3,giro-d-italia-femminile,stage-1,F-2.1.Stage,p0,2010-07-02,2010.0,7.0,2.0,standard,59.0,,44.678 km/h,,? - let us know!,4.0,VOS Marianne,Nederland bloeit,23.0,0.0
4,giro-d-italia-femminile,stage-1,F-2.1.Stage,p0,2010-07-02,2010.0,7.0,2.0,standard,59.0,,44.678 km/h,,? - let us know!,5.0,GUARISCHI Barbara,S.C. Michela Fanini Rox,19.0,0.0


In [184]:
def get_stage_gaps(race, year, gc_limit = 10):
    """Return per-stage time gaps of the top GC riders relative to the GC winner.

    Reads the module-level ``df`` results table.

    Parameters
    ----------
    race
        Value matched against the ``name`` column.
    year
        Value matched against the ``year`` column.
    gc_limit : int, default 10
        Number of leading ``final-gc`` rows (riders) to keep.

    Returns
    -------
    pandas.DataFrame
        One row per stage, in order of first appearance; stages whose name
        contains ``'ttt'`` are skipped and the ``final-gc`` row is included.
        One column per kept rider, in ``final-gc`` row order. Each value is the
        rider's ``time`` on that stage minus the GC winner's ``time`` on the
        same stage. A value is NaN when the rider (or the GC winner) has no
        row for that stage.

    Raises
    ------
    ValueError
        If no ``final-gc`` rows exist for the requested race and year.
    """
    race_df = df[(df['name'] == race) & (df['year'] == year)]

    # The first `gc_limit` final-gc rows define the riders of interest.
    # NOTE(review): this assumes final-gc rows are stored in finishing order,
    # so that the first row is the GC winner -- confirm against the data source.
    final_gc = race_df[race_df['stage'] == 'final-gc'].iloc[0: gc_limit, :]
    if final_gc.empty:
        raise ValueError(f'No final-gc results found for {race!r} in {year!r}')

    # dict.fromkeys de-duplicates rider names while keeping their order.
    gc_riders = list(dict.fromkeys(final_gc['rider']))
    gc_winner = gc_riders[0]

    stages = [s for s in race_df['stage'].unique() if 'ttt' not in s]

    rows = []
    for stage in stages:
        stage_data = race_df[race_df['stage'] == stage].drop_duplicates()

        # Align this stage's times to the GC riders by label. A rider without a
        # result becomes NaN instead of shifting every later value out of place,
        # and a rider listed more than once contributes only the first row.
        stage_times = (
            stage_data
            .drop_duplicates(subset = 'rider')
            .set_index('rider')['time']
            .reindex(gc_riders)
        )

        # Normalize the stage gaps relative to the GC winner.
        rows.append((stage_times - stage_times.loc[gc_winner]).to_numpy())

    return pd.DataFrame(rows, index = stages, columns = gc_riders)

In [185]:
# ===== Quantifying the importance of TTs in a race ===== #

def err_lin(gc, gc_adj):
    """Return the sum of absolute element-wise differences between two sequences.

    Elements are paired by position for every index in ``range(len(gc))``, so
    ``gc_adj`` must be at least as long as ``gc``.
    """
    return sum(abs(gc[pos] - gc_adj[pos]) for pos in range(len(gc)))

def err_sq(gc, gc_adj):
    """Return the sum of squared element-wise differences between two sequences.

    Elements are paired by position for every index in ``range(len(gc))``, so
    ``gc_adj`` must be at least as long as ``gc``.
    """
    return sum(
        np.power(abs(gc[pos] - gc_adj[pos]), 2)
        for pos in range(len(gc))
    )

def gap_pearson(gc, gc_adj):
    """Return the Pearson correlation of ``gc`` and ``gc_adj``, rounded to 2 decimals."""
    corr_matrix = np.corrcoef(gc, gc_adj)
    # The off-diagonal entry holds the correlation between the two inputs.
    coefficient = corr_matrix[0, 1]
    return round(coefficient, 2)

In [186]:
def quantify_stage_gc_impact(race, year, stage, metric = err_lin, gc_limit = 10):
    """Score how far the final GC gaps move when one stage's gaps are removed.

    The gap table from ``get_stage_gaps`` supplies the ``final-gc`` row and the
    row for ``stage``. For every rider the stage gap is subtracted from the
    final GC gap, and ``metric(final_gaps, adjusted_gaps)`` is returned.

    Parameters
    ----------
    race, year
        Forwarded to ``get_stage_gaps``.
    stage
        Index label of the stage row to remove.
    metric : callable, default ``err_lin``
        Called with the final GC gaps (as a list) and the adjusted gaps.
    gc_limit : int, default 10
        Forwarded to ``get_stage_gaps``.
    """
    gap_table = get_stage_gaps(race, year, gc_limit = gc_limit)

    final_gaps = gap_table.loc['final-gc', :]
    gaps_on_stage = gap_table.loc[stage, :]

    # Final GC gaps with this stage's contribution taken out, rider by rider.
    adjusted_gaps = []
    for rider in final_gaps.index:
        adjusted_gaps.append(final_gaps[rider] - gaps_on_stage[rider])

    return metric(list(final_gaps), adjusted_gaps)

def plot_gc_gaps(race, year, gc_limit = 10):
    """Return a plotly figure with one line per stage of the riders' gaps.

    The x axis lists the riders (columns of the ``get_stage_gaps`` table) and
    each trace holds one stage's gaps; the ``final-gc`` row is not plotted.
    """
    gap_table = get_stage_gaps(race, year, gc_limit = gc_limit)
    riders = list(gap_table.columns)

    fig = go.Figure()
    for stage in gap_table.index:
        # The overall classification is not a stage, so it gets no trace.
        if stage == 'final-gc':
            continue
        stage_trace = go.Scatter(
            x = riders,
            y = list(gap_table.loc[stage, :]),
            name = stage,
        )
        fig.add_trace(stage_trace)

    return fig

def plot_stage_impact_quantification(race, race_name, year, metric = err_sq, gc_limit = 10, fname = None):
    """Plot the GC impact score of every stage of a race and optionally save it.

    Parameters
    ----------
    race
        Value matched against the ``name`` column of the module-level ``df``.
    race_name : str
        Human-readable race name used in the figure title.
    year
        Value matched against the ``year`` column.
    metric : callable, default ``err_sq``
        Forwarded to ``quantify_stage_gc_impact``.
    gc_limit : int, default 10
        Forwarded to ``quantify_stage_gc_impact``; also shown in the title.
    fname : str or None, default None
        When given, the figure is written to
        ``figures/stages-gc-impact/<fname>``; the directory is created if it
        does not exist yet.

    Returns
    -------
    plotly.graph_objects.Figure
        Stages on the x axis (``final-gc`` and stages containing ``'ttt'`` are
        excluded) and their impact scores on the y axis.
    """
    race_df = df[df['name'] == race]
    race_df = race_df[race_df['year'] == year]

    x = [s for s in race_df['stage'].unique() if s != 'final-gc' and 'ttt' not in s]
    y = [
        quantify_stage_gc_impact(race, year, stage, metric = metric, gc_limit = gc_limit)
        for stage in x
    ]

    fig = go.Figure(layout = go.Layout(
        title = dict(
            text = f'Impact of stages on GC top {gc_limit} at the {year} {race_name}',
            x = 0.5
        ),
        width = 1000,
        height = 600
    ))
    fig.add_trace(
        go.Scatter(x = x, y = y, mode = 'markers+lines')
    )

    if fname is not None:
        out_path = f'figures/stages-gc-impact/{fname}'
        # write_image does not create missing folders, so make sure the target
        # directory exists before saving (e.g. on a fresh checkout).
        os.makedirs(os.path.dirname(out_path), exist_ok = True)
        fig.write_image(out_path)

    return fig

In [187]:
# Per-stage gaps to the GC winner for the top 10 (default gc_limit) of the 2022 Tour de Suisse Women.
display(get_stage_gaps('tour-de-suisse-women', 2022))
# Impact of stage 1 on those GC gaps, scored with the default err_lin metric.
quantify_stage_gc_impact('tour-de-suisse-women', 2022, 'stage-1')

Unnamed: 0,BRAND Lucinda,FAULKNER Kristen,ROOIJAKKERS Pauliena,CHAPMAN Brodie,NEFF Jolanda,MACKAIJ Floortje,KOPPENBURG Clara,MAGNALDI Erica,BARIL Olivia,WILLIAMS Georgia
stage-1,0.0,50.0,0.0,59.0,50.0,50.0,0.0,50.0,50.0,50.0
stage-2-(itt),0.0,-67.0,25.0,63.0,21.0,59.0,133.0,111.0,63.0,-53.0
stage-3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
stage-4,0.0,15.0,36.0,15.0,86.0,59.0,50.0,15.0,85.0,225.0
final-gc,0.0,17.0,79.0,159.0,181.0,194.0,198.0,202.0,224.0,248.0


359.0

In [188]:
# Plot the err_sq impact score of every non-TTT stage of the 2022 Tour de Suisse Women.
plot_stage_impact_quantification('tour-de-suisse-women', "Tour de Suisse Women", 2022, metric = err_sq)