In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm

In [20]:
# Load the cleaned runner results. `age_range` becomes an ordered categorical so
# the brackets compare and sort in the order listed here rather than
# alphabetically; rows with any missing value are then discarded.
runner = (
    pd.read_csv("../../../output/data_clean/cleaned_runner.csv", index_col=0)
    .assign(
        age_range=lambda frame: pd.Categorical(
            frame.age_range,
            categories=['16-25', '26-40', '40-60', '61+', 'Undefined'],
            ordered=True,
        )
    )
    .dropna()
)

# Load the cleaned race table (first CSV column is used as the index).
race = pd.read_csv(
    "../../../output/data_clean/cleaned_race_equivalent_distance.csv",
    index_col=0,
)

In [21]:
# Build a single timestamp column from the separate year / month / day columns.
race = race.assign(
    date=lambda races: pd.to_datetime(races[['month', 'day', 'year']])
)

# Attach the race attributes to every runner result (left join keeps all
# runner rows), drop results whose age bracket is "Undefined", then add
# relative_ranking = ranking / frequency_in_ranking.
join = (
    runner
    .merge(race, how="left", on="race_year_id")
    .loc[lambda merged: merged.age_range != "Undefined"]
    .assign(
        relative_ranking=lambda kept: kept.ranking / kept.frequency_in_ranking
    )
)

In [22]:
# Sanity check of the merged table: row count, column dtypes and non-null counts.
join.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 112176 entries, 0 to 112185
Data columns (total 31 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   race_year_id          112176 non-null  int64         
 1   ranking               112176 non-null  float64       
 2   runner                112176 non-null  object        
 3   time                  112176 non-null  object        
 4   age                   112176 non-null  float64       
 5   gender                112176 non-null  object        
 6   nationality           112176 non-null  object        
 7   time_in_seconds       112176 non-null  float64       
 8   runner_id             112176 non-null  object        
 9   age_course            112176 non-null  float64       
 10  age_range             112176 non-null  category      
 11  speed                 112176 non-null  float64       
 12  equivalent_speed      112176 non-null  float64       
 13 

## Vérification de la formule suivante

To predict your time $T_2$ for a race of distance $D_2$, the following formula is used: $T_2 = T_1 \times (D_2 / D_1)^{1.06}$, where $T_1$ and $D_1$ are the time and distance, respectively, of a race you have already run. This formula was proposed by Pete Riegel, and published in Runner’s World by Owen Anderson in 1997. It roughly says that a runner slows down as the distance grows: doubling the distance multiplies the time by $2^{1.06} \approx 2.08$, so the average speed drops by about 4%.

$$T_2 = T_1 \times \left(\frac{D_2}{D_1}\right)^{1.06}$$

In [23]:
# Count how many result rows each runner has, most frequent first.
# rename_axis + reset_index(name=...) label both columns explicitly, so the
# result does not depend on how value_counts() names its output. pandas 2.0
# changed that naming (the Series is called "count" and the index carries the
# original column name), which made a rename keyed on 'index' / 'runner_id'
# label the id column "occ" and leave the counts as "count".
runners_occ = (
    join['runner_id']
    .value_counts()
    .rename_axis('runner_id')
    .reset_index(name='occ')
)
runners_occ

Unnamed: 0,runner_id,occ
0,DONNELLYSusanUSAW58.0,39
1,FANCETTKennethGBRM72.0,37
2,CARTERWilliamsUSAM63.0,33
3,SMITHMikeUSAM64.0,32
4,ETTINGHAUSENEdUSAM59.0,30
...,...,...
64188,HOARAUOlivierFRAM47.0,1
64189,BAUGERGaelFRAM36.0,1
64190,LAVERNAYAppolinaireSimonFRAM63.0,1
64191,LANDRYDavidFRAM49.0,1


In [71]:
# Keep every result row of a single runner to use as a worked example.
df = join[join['runner_id'] == 'FANCETTKennethGBRM72.0']

In [72]:
# Renumber the rows from 0 so they can be addressed by position; the previous
# row labels are kept in a new "index" column.
df = df.reset_index(drop=False)
df

Unnamed: 0,index,race_year_id,ranking,runner,time,age,gender,nationality,time_in_seconds,runner_id,...,participants,country_code,year,month,day,frequency_in_ranking,equivalent_distance,weighted_elevation,date,relative_ranking
0,2005,67218,72.0,FANCETT Kenneth,28H 28M 23S,72.0,M,GBR,102503.0,FANCETTKennethGBRM72.0,...,300,GBR,2021,8,7,111,196.5,0.159288,2021-08-07,0.648649
1,4881,67422,93.0,FANCETT Kenneth,23H 36M 39S,72.0,M,GBR,84999.0,FANCETTKennethGBRM72.0,...,300,GBR,2021,5,8,188,193.4,0.165977,2021-05-08,0.494681
2,8446,56693,79.0,FANCETT Kenneth,23H 10M 44S,72.0,M,GBR,83444.0,FANCETTKennethGBRM72.0,...,250,GBR,2020,10,10,151,182.1,0.113125,2020-10-10,0.523179
3,9814,49901,55.0,FANCETT Kenneth,22H 11M 28S,72.0,M,GBR,79888.0,FANCETTKennethGBRM72.0,...,300,GBR,2020,9,5,183,192.9,0.163815,2020-09-05,0.300546
4,10931,51393,33.0,FANCETT Kenneth,25H 21M 31S,72.0,M,GBR,91291.0,FANCETTKennethGBRM72.0,...,300,GBR,2020,8,8,107,202.6,0.169793,2020-08-08,0.308411
5,16896,40741,44.0,FANCETT Kenneth,22H 11M 58S,72.0,M,GBR,79918.0,FANCETTKennethGBRM72.0,...,250,GBR,2019,10,12,154,182.1,0.113125,2019-10-12,0.285714
6,29747,36627,24.0,FANCETT Kenneth,37H 47M 29S,72.0,M,GBR,136049.0,FANCETTKennethGBRM72.0,...,150,THA,2018,11,9,142,243.2,0.321957,2018-11-09,0.169014
7,30839,32713,21.0,FANCETT Kenneth,30H 20M 18S,72.0,M,GBR,109218.0,FANCETTKennethGBRM72.0,...,60,USA,2018,10,19,60,213.6,0.20412,2018-10-19,0.35
8,32959,30411,38.0,FANCETT Kenneth,21H 45M 57S,72.0,M,GBR,78357.0,FANCETTKennethGBRM72.0,...,250,GBR,2018,10,13,168,176.9,0.089316,2018-10-13,0.22619
9,38962,29749,30.0,FANCETT Kenneth,32H 19M 46S,72.0,M,GBR,116386.0,FANCETTKennethGBRM72.0,...,104,USA,2018,8,17,104,215.9,0.241315,2018-08-17,0.288462


In [73]:
# Manual check of T2 = T1 * (D2 / D1) ** 1.06:
# row 29 supplies the known time and distance, row 28 the distance to predict.
known_race = df.iloc[29]
target_race = df.iloc[28]
T2 = known_race['time_in_seconds'] * (
    target_race['equivalent_distance'] / known_race['equivalent_distance']
)**1.06
T2

78314.73166893698

In [74]:
def predict_race_time(race1, race2, exponent=1.06):
    """Predict the time of ``race2`` from ``race1`` and print the comparison.

    The prediction follows ``T2 = T1 * (D2 / D1) ** exponent``, where ``T1`` and
    ``D1`` are read from ``race1`` and ``D2`` from ``race2``.

    Parameters
    ----------
    race1 : mapping (e.g. a DataFrame row)
        Reference race; must provide ``'time_in_seconds'`` and
        ``'equivalent_distance'``.
    race2 : mapping (e.g. a DataFrame row)
        Race to predict; must provide the same two keys. Its
        ``'time_in_seconds'`` is the recorded time the prediction is compared to.
    exponent : float, default 1.06
        Power applied to the distance ratio. The default reproduces the formula
        quoted in this notebook; pass another value to try a different one.

    Returns
    -------
    None
        The recorded time, the predicted time and their relative gap are
        printed; nothing is returned.
    """
    distance_ratio = race2['equivalent_distance'] / race1['equivalent_distance']
    T2 = race1['time_in_seconds'] * distance_ratio**exponent
    real_T2 = race2['time_in_seconds']
    print(f"Data to estimate:{real_T2}")
    print(f"Data estimated:{T2}")
    # Signed gap of the recorded time relative to the *predicted* time, in
    # percent: negative means the recorded time was shorter than predicted.
    print(f"Relative error: {round((real_T2 / T2 * 100)- 100, 3)} %")
    

In [75]:
# Predict the time of row 28 from row 29 and compare it with the recorded time.
predict_race_time(df.iloc[29], df.iloc[28])

Data to estimate:75409.0
Data estimated:78314.73166893698
Relative error: -3.71 %


In [82]:
# Same check on another pair: predict the time of row 30 from row 36.
predict_race_time(df.iloc[36], df.iloc[30])

Data to estimate:75044.0
Data estimated:80029.78221654234
Relative error: -6.23 %
