In [None]:
import numpy as np
import pandas as pd
import os
import pickle 
import procyclingstats
import power_helper as ph


In [None]:
# Get stage data from 1985-2023
from procyclingstats import Race, Stage

# Column store for the scraped stages: one list per output column, in the
# order the columns should appear in the DataFrame. Every list starts with a
# single dummy entry (0, 0.0 or "") that sits ahead of the scraped rows.
stages_dict = {
    column: [placeholder]
    for column, placeholder in (
        ("year", 0),
        ("Race_url", ""),
        ("stage", ""),
        ("stage_url", ""),
        ("stage_type", ""),
        ("profile_icon", ""),
        ("stage_winner", ""),
        ("stage_winner_url", ""),
        ("stage_winner_time", ""),
        ("stage_distance", 0.0),
        ("stage_vertical_meters", 0.0),
        ("gc_leader", ""),
        ("gc_leader_url", ""),
        ("gc_stage_time", ""),
    )
}

# Walk every Tour de France edition from 1985 through 2023 and append one
# entry per usable stage to each list in stages_dict.
for year in range(1985, 2024):
    race_url = "race/tour-de-france/" + str(year)
    race = Race(race_url)
    for race_stage in race.stages("stage_url"):
        stage = Stage(race_stage["stage_url"])
        stage_type = stage.stage_type()
        # No results in TTT
        if stage_type == "TTT":
            continue
        # No gc results in protested stages
        if not (len(stage.results("rider_name")) > 0 and len(stage.gc("rider_name")) > 0):
            continue

        # Ask for each table once with all the fields needed below, instead
        # of issuing a separate results()/gc() call per field (and one gc()
        # call per result row while searching for the leader's time).
        results = stage.results("rider_name", "rider_url", "time")
        winner = results[0]
        leader = stage.gc("rider_name", "rider_url")[0]

        # Stage time of the rider leading the general classification.
        # The names are compared for equality: a substring test would also
        # accept an empty name or a name that is merely a fragment of the
        # leader's name. Raises IndexError if the leader has no stage result.
        leader_stage_time = [
            row["time"] for row in results
            if row["rider_name"] == leader["rider_name"]
        ][0]

        stages_dict["year"].append(year)
        stages_dict["Race_url"].append(race_url)
        stages_dict["stage"].append(stage.arrival())
        stages_dict["stage_url"].append(stage.url)
        stages_dict["stage_type"].append(stage_type)
        stages_dict["profile_icon"].append(stage.profile_icon())
        # Winner and leader fields are stored as single-key dicts; the
        # conversion step later parses them back with ast.literal_eval.
        stages_dict["stage_winner"].append({"rider_name": winner["rider_name"]})
        stages_dict["stage_winner_url"].append({"rider_url": winner["rider_url"]})
        stages_dict["stage_winner_time"].append({"time": winner["time"]})
        stages_dict["stage_distance"].append(stage.distance())
        stages_dict["stage_vertical_meters"].append(stage.vertical_meters())
        stages_dict["gc_leader"].append({"rider_name": leader["rider_name"]})
        stages_dict["gc_leader_url"].append({"rider_url": leader["rider_url"]})
        stages_dict["gc_stage_time"].append(leader_stage_time)

df = pd.DataFrame(stages_dict)

In [None]:
# Save the scraped table to Data_raw.csv in the working directory.
# No index=False is passed, so the DataFrame index is written as well and
# shows up as an extra leading column when the file is read back.
df.to_csv("Data_raw.csv")

In [None]:
# Convert_dataset
import ast
def to_seconds(time):
    """Convert a colon-separated duration string to a whole number of seconds.

    Accepts "H:MM:SS" (the hour field may be any non-negative integer, so
    totals above 24 hours work) and, as a generalisation, the shorter
    "MM:SS" and "SS" forms.

    Parameters
    ----------
    time : str
        Duration text such as "4:31:05", "12:34" or "45".

    Returns
    -------
    int
        The duration in seconds.

    Raises
    ------
    ValueError
        If there are more than three fields or a field is not an integer.
    """
    fields = time.split(':')
    if len(fields) > 3:
        raise ValueError("expected at most H:MM:SS, got %r" % (time,))
    total = 0
    # Fold left to right: each extra field shifts the running total by a
    # factor of 60, so "H:MM:SS" ends up as H*3600 + MM*60 + SS.
    for field in fields:
        total = total * 60 + int(field)
    return total

# Load the scraped table and discard its first record, the dummy row that
# seeded the columns when the data was collected.
df = pd.read_csv("Data_raw.csv").iloc[1:]

# These cells hold the text form of a single-key dict; parse each one and
# keep only the value stored under that key.
for column, key in (
    ("stage_winner", "rider_name"),
    ("stage_winner_url", "rider_url"),
    ("stage_winner_time", "time"),
    ("gc_leader", "rider_name"),
    ("gc_leader_url", "rider_url"),
):
    df[column] = df[column].apply(lambda cell, key=key: ast.literal_eval(cell)[key])

# Keep the original clock strings in "*_str" columns, then turn the working
# columns into seconds for the power calculations.
df.insert(6, "stage_winner_time_str", df["stage_winner_time"], allow_duplicates=True)
df["stage_winner_time"] = df["stage_winner_time"].apply(to_seconds)

df.insert(12, "gc_stage_time_str", df["gc_stage_time"], allow_duplicates=True)
df["gc_stage_time"] = df["gc_stage_time"].apply(to_seconds)

# Zero-filled columns that are populated further down.
for column in ("power", "gc_weight", "gc_height"):
    df[column] = np.zeros(len(df))

# Distance multiplied by 1000 (presumably km -> m; confirm against the
# scraper's units), shared by the grade and speed columns.
distance_scaled = df["stage_distance"] * 1000
df["stage_grade"] = df["stage_vertical_meters"] / distance_scaled
df["gc_speed"] = distance_scaled / df["gc_stage_time"]

from procyclingstats import Stage, Rider

# Get height and weight data for gc riders.
# The same rider usually leads the classification over many stages, so each
# rider URL is looked up only once and the result is reused for later rows.
rider_measurements = {}
for stage in df.itertuples():
    rider_url = stage.gc_leader_url
    if rider_url not in rider_measurements:
        rider = Rider(rider_url)
        try:
            rider_measurements[rider_url] = (rider.weight(), rider.height())
        except (AttributeError, IndexError):
            # No weight/height available for this rider: fall back to the
            # default values for both, even if only one lookup failed.
            rider_measurements[rider_url] = (65, 1.70)

    weight, height = rider_measurements[rider_url]
    # Address the row by its own index label rather than a separate counter
    # that only works while the labels happen to run 1..n.
    df.loc[stage.Index, 'gc_weight'] = weight
    df.loc[stage.Index, 'gc_height'] = height
        
# Evaluate the power model from power_helper once per stage row, feeding it
# the stage grade, the gc leader's weight and speed, and the profile icon,
# then store the results in the power column.
powers = [
    ph.cycling_power_profile(
        stage_row.stage_grade,
        stage_row.gc_weight,
        stage_row.gc_speed,
        stage_row.profile_icon,
    )
    for stage_row in df.itertuples()
]

df["power"] = powers