In [None]:
import os
import re
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from datetime import datetime, timedelta

# Read the raw race records; the dataset folder is resolved relative to the current working directory
dataset_path_parts = ('2023-24-CW3-datasets', 'ultramarathon_records', 'TWO_CENTURIES_OF_UM_RACES.csv')
ultra_marathon_loc = os.path.join(os.getcwd(), *dataset_path_parts)
ultra_marathon = pd.read_csv(filepath_or_buffer=ultra_marathon_loc)
print("The .csv has been loaded.")



#_____________________________Clean the data_____________________________________


# Rename columns: shorten the verbose CSV headers to the identifiers used in the rest of the notebook
ultra_marathon = ultra_marathon.rename(columns={
    'Year of event':                'Year',         # int
    'Event dates':                  'Dates',        # [datetime]
    'Event name':                   'Event_name',   # str
    'Event distance/length':        'Length',       # str; (km, mi or hours)
    'Event number of finishers':    'Finishers',    # int
    'Athlete performance':          'Performance',  # str; (km, mi or hours)
    'Athlete club':                 'Club',         # str
    'Athlete country':              'Country',      # str
    'Athlete year of birth':        'Birth',        # float
    'Athlete gender':               'Gender',       # int (M=0, F=1)
    'Athlete age category':         'Age',          # int
    'Athlete average speed':        'Average_speed',# float
    'Athlete ID':                   'Id'            # int
})



# Null values are not dropped at this stage (the dropna below is left disabled)
# print(ultra_marathon.isnull().sum())
# ultra_marathon.dropna(subset=['Age', 'Birth'], inplace=True)


# Remove unnecessary information
# Keep only the first run of digits of the age category; expand=False yields a Series
# (the default returns a one-column DataFrame)
ultra_marathon['Age'] = ultra_marathon['Age'].str.extract(r'(\d+)', expand=False)
allowed_values = ['50km', '100km', '50mi', '100mi', '6h', '12h', '24h', '48h', '72h', '6d', '10d']
# Keep the standard race categories and eliminate the 'X' gender values (missing genders are kept).
# The explicit .copy() makes the result an independent frame, so the column assignments
# below do not write into a slice of the raw data (SettingWithCopyWarning).
keep_rows = ultra_marathon['Length'].isin(allowed_values) & (ultra_marathon['Gender'] != 'X')
ultra_marathon = ultra_marathon.loc[keep_rows].copy()
# Encode gender to 0 and 1; any value other than 'M'/'F' becomes NaN
ultra_marathon['Gender'] = ultra_marathon['Gender'].map({'M': 0, 'F': 1})


# Parse into numeric data types (values that cannot be parsed become NaN)
ultra_marathon['Year']          = pd.to_numeric(ultra_marathon['Year'], errors='coerce')
ultra_marathon['Finishers']     = pd.to_numeric(ultra_marathon['Finishers'], errors='coerce')
ultra_marathon['Birth']         = pd.to_numeric(ultra_marathon['Birth'], errors='coerce')
ultra_marathon['Age']           = pd.to_numeric(ultra_marathon['Age'], errors='coerce')
ultra_marathon['Average_speed'] = pd.to_numeric(ultra_marathon['Average_speed'], errors='coerce')
ultra_marathon['Id']            = pd.to_numeric(ultra_marathon['Id'], errors='coerce', downcast='integer')
# ultra_marathon.dropna(subset=['Year', 'Finishers', 'Birth', 'Age', 'Average_speed', 'Id'], inplace=True)


# Custom parsing

def parseDates(given_string):
    """Parse an event-date string into a list of datetimes.

    Supported layouts:
        '06.01.2018'             -> [day]
        '01.-05.01.2000'         -> [first_day, last_day]  (same month)
        '31.01.-01.02.2000'      -> [first_day, last_day]  (same year)
        '31.12.2000-01.01.2001'  -> [first_day, last_day]

    Returns np.nan for anything else: missing values, non-existent dates such
    as '00.01.2000', and strings that match none of the layouts above.
    """
    # Single-day event
    try:
        return [datetime.strptime(given_string, "%d.%m.%Y")]
    except (TypeError, ValueError):
        pass

    # Multi-day event: the part before '-' is 3, 6 or 10 characters long depending on
    # how much of the first date is written out; the missing month/year is borrowed
    # from the tail of the last date.
    try:
        interval = given_string.split('-')
        for separation_length in [3, 6, 10]:
            if len(interval[0]) == separation_length:
                first_day = datetime.strptime(interval[0] + interval[1][separation_length:], "%d.%m.%Y")
                last_day = datetime.strptime(interval[1], "%d.%m.%Y")
                # return [first_day + timedelta(days=i) for i in range((last_day - first_day).days + 1)]
                return [first_day, last_day]
    except (AttributeError, IndexError, TypeError, ValueError):
        # Not a string, no '-' separator, or a non-parsable / non-existent date
        return np.nan

    # No known layout matched: report it as missing explicitly instead of falling off
    # the end of the function and returning None
    return np.nan


# Returns (int value, int stages), where stage=0 indicates hours and any other stage indicates number of race stages
def parseLengths(given_string):
    """Parse an event-length string into a ``(value, stages)`` tuple.

    * Strings containing 'h' are read as '<n>h' -> ``(n, 0)``           (hours)
    * Otherwise strings containing 'd' are read as '<n>d' -> ``(24 * n, 0)``  (days as hours)
    * Otherwise the string is read as a distance, '<x>mi' (converted to km and
      rounded to 2 decimals) or '<x>km', with an optional '<k>Etappen' stage
      count -> ``(km, k)``; the stage count defaults to 1.

    Returns ``(np.nan, np.nan)`` for missing input and whenever the expected
    number pattern is not found.
    """
    unparsable = (np.nan, np.nan)
    if pd.isna(given_string):
        return unparsable

    # Time-limited events. A failed re.search returns None; test for it explicitly,
    # because subscripting None raises TypeError (not AttributeError).
    if "h" in given_string:
        hours = re.search(r'(\d+)h', given_string)
        return (float(hours[1]), 0) if hours is not None else unparsable
    elif "d" in given_string:
        days = re.search(r'(\d+)d', given_string)
        return (24 * float(days[1]), 0) if days is not None else unparsable

    # Distance events
    if "mi" in given_string:
        miles = re.search(r'(\d+(\.\d+)?)mi', given_string)
        if miles is None:
            return unparsable
        distance = round((1.60934 * float(miles[1])), 2)
    else:
        kilometres = re.search(r'(\d+(\.\d+)?)km', given_string)
        if kilometres is None:
            return unparsable
        distance = float(kilometres[1])

    stages = re.search(r'(\d+)Etappen', given_string)
    return (distance, 1 if stages is None else int(stages[1]))
    
# Apply custom parsing
ultra_marathon['Dates'] = ultra_marathon['Dates'].apply(parseDates)
# Drop the rows whose date could not be parsed. The drop must be applied to the DataFrame itself:
# calling dropna(inplace=True) on the selected column works on a temporary Series and
# never removes any row from the frame.
ultra_marathon = ultra_marathon.dropna(subset=['Dates'])    # Only drops 593 rows
# ultra_marathon[['Length', 'Stages']] = ultra_marathon['Length'].apply(parseLengths).apply(pd.Series)


# Recast column types (reduces memory usage by 25%)
# Nullable extension dtypes are used so that missing values survive the integer casts.
ultra_marathon = ultra_marathon.astype({
    'Year':          pd.Int16Dtype(),
    'Finishers':     pd.Int32Dtype(),
    'Birth':         pd.Int16Dtype(),
    'Gender':        pd.Int8Dtype(),
    'Age':           pd.Int8Dtype(),
    'Average_speed': pd.Float32Dtype(),
})


# Display information
# info() prints its report itself and returns None, so it is not wrapped in print()
ultra_marathon.info()
ultra_marathon.head(3)


In [None]:
# Data exploration

# info() prints its report itself and returns None; wrapping it in print() adds a stray "None"
ultra_marathon.info()

# Summary statistics
# print(ultra_marathon.describe(include='all')) # include='all' includes non-numeric columns

# Unique values of the categorical columns
print('Event name:', ultra_marathon['Event_name'].unique())
print('Length:', ultra_marathon['Length'].unique())
print('Club:', ultra_marathon['Club'].unique())
print('Country:', ultra_marathon['Country'].unique())
print('Gender:', ultra_marathon['Gender'].unique())

# Average speed: first values, maximum and median
print(ultra_marathon['Average_speed'].head(10))
print(ultra_marathon['Average_speed'].max())
print(ultra_marathon['Average_speed'].median())
# Shape of the dataframe
print(ultra_marathon.shape)
# Number of rows with an average speed below 20 (missing speeds are not counted)
print((ultra_marathon['Average_speed'] < 20).sum())

In [None]:
# How does performance vary with distance and demographics?
# Scatter plots of average speed against age, one panel per race length

# Race lengths in the order the panels should appear, and readable gender labels
length_order = ['50km', '100km', '50mi', '100mi', '6h', '12h', '24h', '48h', '72h', '6d', '10d']
gender_map = {0: 'Male', 1: 'Female'}

# Work on a copy; keep rows with an average speed of at most 21 and a known gender
performance = ultra_marathon.copy()
performance = performance[performance['Average_speed'] <= 21].dropna(subset='Gender')

# An ordered categorical makes the sort below (and therefore the panels) follow length_order
performance['Length'] = pd.Categorical(performance['Length'], categories=length_order, ordered=True)
performance = performance.sort_values(by='Length')

# Replace the 0/1 gender codes by their labels for the legend
performance['Gender'] = performance['Gender'].map(gender_map)

# One panel per race length, coloured by gender
g = sns.FacetGrid(
    performance,
    col='Length',
    hue='Gender',
    col_wrap=4,
    height=3,
    sharex=True,
    sharey=True,
)
g.map(sns.scatterplot, 'Age', 'Average_speed')

# Legend, axis labels and titles
g.add_legend(title='Gender')
g.set_axis_labels('Age', 'Average Speed (km/h)')
g.set_titles("{col_name}")
g.fig.suptitle('Average Speed by Age and Gender across Different Race Lengths', y=1.02)

plt.show()

In [None]:
# Explore how performance changes based on distance and demographics
# Box plot for gender

# Keep rows with an average speed of at most 21 and a known gender.
# The explicit .copy() makes the columns assigned below belong to an independent frame.
performance = (
    ultra_marathon[ultra_marathon['Average_speed'] <= 21]
    .dropna(subset='Gender')
    .copy()
)

# Define the order for "Length" based on the provided list
length_order = ['50km', '100km', '50mi', '100mi', '6h', '12h', '24h', '48h', '72h', '6d', '10d']

# Ensure the Length column is categorical and ordered
performance['Length'] = pd.Categorical(performance['Length'], categories=length_order, ordered=True)

# Sort the DataFrame by the Length to respect the order in the plot
performance = performance.sort_values(by='Length')

# Map gender to strings
gender_map = {0: 'Male', 1: 'Female'}
performance['Gender'] = performance['Gender'].map(gender_map)

# Segment the age into categorical groups.
# With right=False every bin is left-closed, e.g. [18, 30) holds ages 18..29,
# so the labels are written to match those intervals.
bins = [0, 18, 30, 40, 50, 60, 70, 100]  # Adjust bins as needed
labels = ['<18', '18-29', '30-39', '40-49', '50-59', '60-69', '70+']
performance['Age_group'] = pd.cut(performance['Age'], bins=bins, labels=labels, right=False)

# Increase font size (global matplotlib setting: also affects later figures)
plt.rcParams.update({'font.size': 15})

# Box Plot
plt.figure(figsize=(14, 8))
sns.boxplot(data=performance, x='Length', y='Average_speed', hue='Gender', notch=True)
plt.xticks(rotation=45)
plt.title('Distribution of Average Speeds by Gender across Race Lengths')
plt.xlabel('Race Length/Time')
plt.ylabel('Average Speed (km/h)')
# Finish the figure explicitly so the cell shows the plot rather than the repr of the last Text object
plt.tight_layout()
plt.show()

In [None]:
# Box Plot for age group

# Keep rows with an average speed of at most 21.
# The explicit .copy() makes the columns assigned below belong to an independent frame.
performance = ultra_marathon[ultra_marathon['Average_speed'] <= 21].copy()

# Define the order for "Length" based on the provided list
length_order = ['50km', '100km', '50mi', '100mi', '6h', '12h', '24h', '48h', '72h', '6d', '10d']

# Ensure the Length column is categorical and ordered
performance['Length'] = pd.Categorical(performance['Length'], categories=length_order, ordered=True)

# Sort the DataFrame by the Length to respect the order in the plot
performance = performance.sort_values(by='Length')

# Segment the age into categorical groups.
# With right=False every bin is left-closed, e.g. [18, 30) holds ages 18..29,
# so the labels are written to match those intervals. Ages below 18 or from 100
# upwards fall outside the bins, get no group and are therefore not drawn.
bins = [18, 30, 40, 50, 60, 70, 100]  # Adjust bins as needed
labels = ['18-29', '30-39', '40-49', '50-59', '60-69', '70+']
performance['Age_group'] = pd.cut(performance['Age'], bins=bins, labels=labels, right=False)

# Increase font size (global matplotlib setting: also affects later figures)
plt.rcParams.update({'font.size': 15})

# Box Plot for Age Group Comparison
plt.figure(figsize=(14, 8))
sns.boxplot(data=performance, x='Length', y='Average_speed', hue='Age_group', palette='Spectral', notch=True)
plt.xticks(rotation=45)
plt.title('Distribution of Average Speeds by Age Group across Race Lengths')
plt.xlabel('Race Length/Time')
plt.ylabel('Average Speed (km/h)')
plt.legend(title='Age Group', loc='upper right', bbox_to_anchor=(1, 1))
plt.tight_layout()
plt.show()

In [None]:
# Extract the season of each event from the Dates column
# Copy the dataframe
ultra_marathon_with_seasons = ultra_marathon.copy()

# Month of the first event day (each parsed entry is a list such as [2018-01-06 00:00:00]);
# entries that are not a list yield None
ultra_marathon_with_seasons['Season'] = ultra_marathon_with_seasons['Dates'].apply(lambda x: x[0].month if isinstance(x, list) else None)
# Remove the rows without a month and convert the month to int
ultra_marathon_with_seasons = ultra_marathon_with_seasons.dropna(subset=['Season']).copy()
ultra_marathon_with_seasons['Season'] = ultra_marathon_with_seasons['Season'].astype(int)

# Month -> season name and month -> season code (1 = Winter, 2 = Spring, 3 = Summer, 4 = Autumn).
# Both columns are derived from the month. Mapping the month to a name and then mapping
# that name again with the integer-keyed dictionary would turn every value into NaN.
# NOTE(review): December-February is treated as winter for every event, i.e. northern-hemisphere
# seasons; confirm this is intended for events held in the southern hemisphere.
season_names = {1: 'Winter', 2: 'Winter', 3: 'Spring', 4: 'Spring', 5: 'Spring', 6: 'Summer', 7: 'Summer', 8: 'Summer', 9: 'Autumn', 10: 'Autumn', 11: 'Autumn', 12: 'Winter'}
season_map = {1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 3, 7: 3, 8: 3, 9: 4, 10: 4, 11: 4, 12: 1}
ultra_marathon_with_seasons['Season_name'] = ultra_marathon_with_seasons['Season'].map(season_names)
ultra_marathon_with_seasons['Season'] = ultra_marathon_with_seasons['Season'].map(season_map)

# Print the percentage of each season
print(ultra_marathon_with_seasons['Season_name'].value_counts(normalize=True) * 100)

In [None]:
# Which events are the most popular?
popular_event = ultra_marathon.copy()

# Popularity = number of rows (participants) recorded per event name;
# the 'Finishers' column is not summed
event_popularity = (
    ultra_marathon
    .groupby('Event_name')
    .size()
    .reset_index(name='Occurrences')
)

# Most popular events first, then keep the ten largest
event_popularity_sorted = event_popularity.sort_values(by='Occurrences', ascending=False)
top_events = event_popularity_sorted.head(10)

# Rows of the 2019 edition of the Two Oceans Marathon 50km split
is_two_oceans = popular_event['Event_name'] == 'Two Oceans Marathon - 50km Split (RSA)'
is_2019_edition = popular_event['Year'] == 2019
specific_event = popular_event[is_two_oceans & is_2019_edition]

print(top_events)

In [None]:
# The linear regression

from sklearn.preprocessing import LabelEncoder

# Work on an independent copy that only holds rows with all the numeric fields present
ultra_marathon_encoded = ultra_marathon.dropna(
    subset=['Year', 'Finishers', 'Birth', 'Age', 'Average_speed', 'Id', 'Gender']
).copy()

country_encoder = LabelEncoder()
event_encoder = LabelEncoder()

# Month of the first event day (each parsed entry is a list such as [2018-01-06 00:00:00]);
# entries that are not a list yield None
ultra_marathon_encoded['Season'] = ultra_marathon_encoded['Dates'].apply(lambda x: x[0].month if isinstance(x, list) else None)
# Remove the rows without a month (.copy() keeps the following assignments off a slice)
ultra_marathon_encoded = ultra_marathon_encoded.dropna(subset=['Season']).copy()
# Convert it to int
ultra_marathon_encoded['Season'] = ultra_marathon_encoded['Season'].astype(int)

# Map the month to a season code (1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov)
season_map = {1: 1, 2: 1, 3: 2, 4: 2, 5: 2, 6: 3, 7: 3, 8: 3, 9: 4, 10: 4, 11: 4, 12: 1}
ultra_marathon_encoded['Season'] = ultra_marathon_encoded['Season'].map(season_map)

# Print the percentage of each season
print(ultra_marathon_encoded['Season'].value_counts(normalize=True) * 100)


# The countries are strings, so each one is label-encoded to an integer Country_ID
# (LabelEncoder assigns one integer per distinct value; this is not one-hot encoding).
# Drop the values of average speed that are above 21 first.
ultra_marathon_encoded = ultra_marathon_encoded[ultra_marathon_encoded['Average_speed'] <= 21].copy()
ultra_marathon_encoded['Country_ID'] = country_encoder.fit_transform(ultra_marathon_encoded['Country'])

# Drop any events prior to 1972, and label-encode the event names as well.
# The fitted event_encoder is reused by the top-events cell to translate names into codes.
ultra_marathon_encoded = ultra_marathon_encoded[ultra_marathon_encoded['Year'] >= 1972].copy()
ultra_marathon_encoded['Event_name'] = event_encoder.fit_transform(ultra_marathon_encoded['Event_name'])

# Also encode each race category/type to an index from 1
lengths = ['50km', '100km', '50mi', '100mi', '6h', '12h', '24h', '48h', '72h', '6d', '10d']
length_index = {length: i for i, length in enumerate(lengths, start=1)}

# The actual mapping
def map_length_to_index(given_string):
    """Return the 1-based index of the first entry of `lengths` contained in the string.

    Returns np.nan for missing input or when no entry is contained in it.
    The test is a substring test performed in the order of `lengths`.
    """
    if pd.isna(given_string):
        return np.nan
    else:
        for length in lengths:
            if length in given_string:
                return length_index[length]
        return np.nan

ultra_marathon_encoded['Length_Index'] = ultra_marathon_encoded['Length'].apply(map_length_to_index)

# Keep the model variables and give the encoded columns their short names.
# rename() returns a new frame, so nothing is modified in place on a column selection.
selected_columns = ['Age', 'Gender', 'Country_ID', 'Length_Index', 'Year', 'Season', 'Event_name', 'Average_speed']
ultra_marathon_selected = ultra_marathon_encoded[selected_columns].rename(
    columns={'Country_ID': 'Country', 'Length_Index': 'Category'}
)

# Compute the pairwise correlations between the covariates and the target variable (average speed)
correlation_with_avg_speed = ultra_marathon_selected.corr()

# Plotting the heatmap/correlation matrix
plt.figure(figsize=(10, 8))
ax = sns.heatmap(correlation_with_avg_speed, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap')
plt.xticks(rotation=45)
ax.tick_params(axis='x', labelsize=12)
ax.tick_params(axis='y', labelsize=12)
plt.show()

# Define the formula for linear regression
# NOTE(review): Country, Event_name, Category and Season are integer codes of nominal variables and
# enter the model as plain numbers; confirm this is intended rather than categorical terms.
formula = 'Average_speed ~ Age + Gender + Country + Season + Year + Event_name + Category'

# Fit the linear regression model
model = smf.ols(formula=formula, data=ultra_marathon_selected).fit()

# Print the summary of the regression model
print(model.summary())

# Disabled: actual vs. predicted plot. Kept as comments rather than as a triple-quoted string,
# because a string literal at the end of a cell is echoed as the cell's output.
# ultra_marathon_selected['Predicted_speed'] = model.predict(ultra_marathon_selected)
#
# # Plot the actual vs. predicted speeds against each other
# plt.figure(figsize=(10, 6))
# sns.regplot(x='Average_speed', y='Predicted_speed', data=ultra_marathon_selected, scatter_kws={'s': 10}, line_kws={"color": "red"})
# plt.xlabel('Actual Average Speed')
# plt.ylabel('Predicted Average Speed')
# plt.title('Actual vs Predicted Average Speed')
# plt.show()

In [None]:
# ------------------------------ TOP 5 ----------------------------------
# Restrict the regression data to the five most popular events (names taken from the
# popularity ranking computed earlier in the notebook)
ultra_marathon_selected_2 = ultra_marathon_selected.copy()
top_5_events = [
    '100 km Lauf Biel (SUI)', 
    'Two Oceans Marathon - 50km Split (RSA)', 
    '100 km del Passatore, Firenze-Faenza (ITA)', 
    'Les 100 km de Millau (FRA)', 
    'Yukihashi to Beppu 100 km (JPN)'
]
# 'Event_name' holds the integer codes produced by event_encoder, so the names are translated
# with the same fitted encoder before filtering
top_5_events_encoded = event_encoder.transform(top_5_events)
ultra_marathon_filtered_top_5 = ultra_marathon_selected_2[ultra_marathon_selected_2['Event_name'].isin(top_5_events_encoded)]

# Compute the pairwise correlations between the covariates and the target variable (average speed)
correlation_with_avg_speed_top_5 = ultra_marathon_filtered_top_5.corr()

# Plotting the heatmap/correlation matrix 
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_with_avg_speed_top_5, annot=True, cmap='coolwarm', fmt=".2f")
plt.xticks(rotation=45)
plt.title('Correlation Heatmap')
plt.show()

# Running the regression again, now only for these top 5 events (Event_name is left out of the formula)
formula_filtered = 'Average_speed ~ Age + Gender + Country + Season + Year + Category'
model_filtered = smf.ols(formula=formula_filtered, data=ultra_marathon_filtered_top_5).fit()
r_squared_top_5 = model_filtered.rsquared
r_squared_top_6 = r_squared_top_5  # previous (misnamed) variable, kept so existing references still work
print("R-squared (Filtered by Top 5 Model):", r_squared_top_5)
print(model_filtered.summary())


# ------------------------------ TOP 20 ----------------------------------
# Disabled: the same regression on the top 20 events. Kept as comments rather than as a
# triple-quoted string, because a string literal at the end of a cell is echoed as the
# cell's output. The encoder name is corrected to event_encoder (label_encoder is not defined).
#
# ultra_marathon_selected_3 = ultra_marathon_selected.copy()
# top_20_events = [
#     "100 km Lauf Biel (SUI)",
#     "Two Oceans Marathon - 50km Split (RSA)",
#     "100 km del Passatore, Firenze-Faenza (ITA)",
#     "Les 100 km de Millau (FRA)",
#     "Yukihashi to Beppu 100 km (JPN)",
#     "Om Die Dam 50km (RSA)",
#     "JFK 50 Mile (USA)",
#     "Lake Saroma 100 Km Ultramarathon (JPN)",
#     "Oxfam Trailwalker Hong Kong (HKG)",
#     "River Shimanto 100km (JPN)",
#     "American River 50 Mile Endurance Run (USA)",
#     "Loskop Ultra Marathon 50km (RSA)",
#     "Way Too Cool 50K Endurance Run (USA)",
#     "Yatsugatake Nobeyama 100 km (JPN)",
#     "100 km Lauf Unna (GER)",
#     "Tango 100 km Ultramarathon (JPN)",
#     "Hong Kong 100 Ultra Trail Race (HKG)",
#     "Western States 100 Mile Endurance Run (USA)",
#     "50 km di Romagna (ITA)",
#     "100 km du Périgord Noir, Belves (FRA)"
# ]
# top_20_events_encoded = event_encoder.transform(top_20_events)
# ultra_marathon_filtered_top_20 = ultra_marathon_selected_3[ultra_marathon_selected_3['Event_name'].isin(top_20_events_encoded)]
#
# # Running the regression again, now only for the top 20 events
# formula_filtered_top_20 = 'Average_speed ~ Age + Gender + Country + Season + Year + Category'
# model_filtered_top_20 = smf.ols(formula=formula_filtered_top_20, data=ultra_marathon_filtered_top_20).fit()
# r_squared_top_20 = model_filtered_top_20.rsquared
# print("R-squared (Filtered by Top 20 Model):", r_squared_top_20)

In [None]:
# Have there been any significant shifts in regional participation patterns?
import pycountry_convert as pc

def country_to_continent(country_iso):
    """Return the continent name for a three-letter (alpha-3) country code.

    The code is converted alpha-3 -> alpha-2 -> continent code -> continent name
    with pycountry_convert. Returns np.nan when any step of the conversion fails
    (unknown code, missing value, ...).
    """
    try:
        continent_code = pc.country_alpha2_to_continent_code(pc.country_alpha3_to_country_alpha2(country_iso))
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best effort: unknown codes become missing values. A bare `except:` would also
        # swallow KeyboardInterrupt, making a long apply over the dataset impossible to stop.
        return np.nan


# Share of the rows (with a known year) that lies before 1972 and is therefore left out.
# The share is taken over all rows with a known year, not over the retained rows only.
erased_p = (ultra_marathon['Year'] < 1972).sum() / ultra_marathon['Year'].notna().sum()
print("We only take into account the values for the last 50 years, erasing ", round(erased_p*100, 3), "% of the data")

# Create appropriate dataset
ultra_marathon_region = ultra_marathon.copy()
# Resolve each distinct country once and map the result onto the rows, instead of
# converting the code again for every row; missing or unknown countries stay NaN
continent_by_country = {
    country: country_to_continent(country)
    for country in ultra_marathon_region['Country'].dropna().unique()
}
ultra_marathon_region['Continent'] = ultra_marathon_region['Country'].map(continent_by_country)
ultra_marathon_region = ultra_marathon_region[ultra_marathon_region['Year'] >= 1900]


# Get participants per year & continent (one row per year/continent pair)
participants_d = ultra_marathon_region.groupby(['Year', 'Continent']).size().reset_index(name='Participants')
# Get participants per year & continent (one row per year, one column per continent)
participants_y = participants_d.pivot(index='Year', columns='Continent', values='Participants').fillna(0)
# Yearly share of each continent, in percent
participants_perc = participants_y.div(participants_y.sum(axis=1), axis=0) * 100
# Relative participants on the last 50 years
participants_perc_50 = participants_perc[participants_perc.index >= 1972]



# ____________________________Plotting__________________________________________


# Plot the stacked area chart for percentage distribution since 1972
sns.set_palette("colorblind")
# Create the axes first and hand them to pandas: DataFrame.plot() without `ax` opens its
# own figure, which would leave the figure created here empty
fig, ax = plt.subplots(figsize=(14, 7))
participants_perc_50.plot(kind='area', stacked=True, legend=False, ax=ax)
# Plot characteristics (legend entries reversed so they follow the stacking order top to bottom)
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[::-1], labels[::-1], loc='upper left', bbox_to_anchor=(1, 1))
ax.set_xticks(np.arange(participants_perc_50.index.min(), participants_perc_50.index.max() + 1, 10))
ax.set_title('Stacked Relative Proportion of Participants by Continent on The Last 50 Years')
ax.set_xlabel('Year')
ax.set_ylabel('Participants (%)')

plt.show()


In [None]:
# Have there been any significant shifts in regional participation patterns?
import pycountry_convert as pc

def country_to_continent(country_iso):
    """Return the continent name for a three-letter (alpha-3) country code.

    The code is converted alpha-3 -> alpha-2 -> continent code -> continent name
    with pycountry_convert. Returns np.nan when any step of the conversion fails
    (unknown code, missing value, ...).
    """
    try:
        continent_code = pc.country_alpha2_to_continent_code(pc.country_alpha3_to_country_alpha2(country_iso))
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best effort: unknown codes become missing values. A bare `except:` would also
        # swallow KeyboardInterrupt, making a long apply over the dataset impossible to stop.
        return np.nan

# Relevant statistics
# Share of the rows (with a known year) that lies before 1972 and is therefore left out.
# The share is taken over all rows with a known year, not over the retained rows only.
erased_p = (ultra_marathon['Year'] < 1972).sum() / ultra_marathon['Year'].notna().sum()
print("We only take into account the values for the last 50 years, erasing ", round(erased_p*100, 3), "% of the data")

participants_1972 = ultra_marathon[ultra_marathon['Year'] == 1972].shape[0]
participants_2022 = ultra_marathon[ultra_marathon['Year'] == 2022].shape[0]
if participants_1972 > 0:
    # "Factor" is the ratio of the two counts; `increase` keeps the relative growth (factor - 1)
    growth_factor = participants_2022 / participants_1972
    increase = growth_factor - 1
    print("The number of participants increased by a factor of", round(growth_factor, 3), " over the last 50 years")
else:
    # Avoid a ZeroDivisionError when the data holds no 1972 participants
    growth_factor = increase = np.nan
    print("No participants recorded in 1972; the growth factor is undefined")


# Create appropriate dataset
ultra_marathon_region = ultra_marathon.copy()
# Resolve each distinct country once and map the result onto the rows, instead of
# converting the code again for every row; missing or unknown countries stay NaN
continent_by_country = {
    country: country_to_continent(country)
    for country in ultra_marathon_region['Country'].dropna().unique()
}
ultra_marathon_region['Continent'] = ultra_marathon_region['Country'].map(continent_by_country)
ultra_marathon_region = ultra_marathon_region[ultra_marathon_region['Year'] >= 1900]


# Get participants per year & continent (one row per year/continent pair)
participants_d = ultra_marathon_region.groupby(['Year', 'Continent']).size().reset_index(name='Participants')
# Get participants per year & continent (one row per year, one column per continent)
participants_y = participants_d.pivot(index='Year', columns='Continent', values='Participants').fillna(0)
# Yearly share of each continent, in percent
participants_perc = participants_y.div(participants_y.sum(axis=1), axis=0) * 100
# Relative participants on the last 50 years
participants_perc_50 = participants_perc[participants_perc.index >= 1972]
# Absolute participants on the last 50 years (filtered once, used by both absolute plots)
participants_y = participants_y[participants_y.index >= 1972]



# ____________________________Plotting__________________________________________


# Plot the stacked area chart for percentage distribution since 1972.
# The axes are created first and handed to pandas: DataFrame.plot() without `ax` opens
# its own figure, which would leave the figure created here empty.
fig_perc, ax_perc = plt.subplots(figsize=(10, 10))
participants_perc_50.plot(kind='area', stacked=True, legend=False, ax=ax_perc, color=sns.color_palette("Set2"))
# Plot characteristics (legend entries reversed so they follow the stacking order top to bottom)
handles, labels = ax_perc.get_legend_handles_labels()
ax_perc.legend(handles[::-1], labels[::-1], loc='upper left', bbox_to_anchor=(1, 1))
ax_perc.set_xticks(np.arange(participants_perc_50.index.min(), participants_perc_50.index.max() + 1, 10))
ax_perc.set_title("Stacked Proportion for the Participant's Continent of Origin Over the Past 50 Years")
ax_perc.set_xlabel('Year')
ax_perc.set_ylabel('Participants (%)')
plt.show()


# Plot absolute values
fig_abs, ax_abs = plt.subplots(figsize=(10, 10))
participants_y.plot(kind='area', stacked=True, legend=False, ax=ax_abs, color=sns.color_palette("Set2"))
# Plot characteristics
handles, labels = ax_abs.get_legend_handles_labels()
ax_abs.legend(handles[::-1], labels[::-1], loc='upper left', bbox_to_anchor=(1, 1))
ax_abs.set_xticks(np.arange(participants_perc_50.index.min(), participants_perc_50.index.max() + 1, 10))
ax_abs.set_title("Stacked Number of Participants by Continent Over the Past 50 Years")
ax_abs.set_xlabel('Year')
ax_abs.set_ylabel('Participants')
plt.show()




# Combined figure: proportions (left) and absolute numbers (right) with one shared legend
plt.rcParams.update({'font.size': 15, 'legend.fontsize': 15})
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 7))  # One row, two columns
# fig.suptitle('Stacked Proportion and Number of Participants by Continent Over the Past 50 Years', fontsize=15, va='bottom')

# Plot the stacked area distribution
participants_perc_50.plot(kind='area', stacked=True, legend=False, ax=ax1, color = sns.color_palette("Set2"))
ax1.set_xticks(np.arange(participants_perc_50.index.min(), participants_perc_50.index.max() + 1, 10))
ax1.set_title("Proportion of Participants' Origin Over the Past 50 Years", fontsize=15.5)
ax1.set_xlabel('Year')
ax1.set_ylabel('Participants (%)')

# Plot absolute values
participants_y.plot(kind='area', stacked=True, legend=False, ax=ax2, color = sns.color_palette("Set2"))
ax2.set_xticks(np.arange(participants_y.index.min(), participants_y.index.max() + 1, 10))
ax2.set_title("Number of Participants per Origin Over the Past 50 Years", fontsize=15.5)
ax2.set_xlabel('Year')
ax2.set_ylabel('Participants (Number)')

# Create legend (taken from the left panel; both panels share the same continents and colours)
handles, labels = ax1.get_legend_handles_labels()
legend = fig.legend(handles, labels, loc='center', bbox_to_anchor=(0.5, 0), ncol=len(labels), frameon=True, fancybox=True)

# Adjust layout & show plot
plt.tight_layout(rect=[0, 0, 1, 1])
plt.show()


In [None]:
# Analyse the number of recurrent athletes in the five most popular events.
# Copy the dataframe
def categorize_participation(count):
    """Bucket an athlete's number of recorded participations into a labelled range.

    The ranges do not overlap: 1, 2-3, 4-5, 6-8, 9-10. Every other value falls
    into '> 10 repeats'.
    NOTE(review): a count below 1 would also end up in '> 10 repeats'; confirm
    that such counts cannot occur in the data.
    """
    if count == 1:
        return '1 repeat'
    elif 2 <= count <= 3:
        return '2-3 repeats'
    elif 4 <= count <= 5:
        return '4-5 repeats'
    elif 6 <= count <= 8:
        return '6-8 repeats'
    elif 8 < count <= 10:
        # A count of 8 is already taken by the previous branch, so this bucket holds
        # 9 and 10 and is labelled accordingly
        return '9-10 repeats'
    else:
        return '> 10 repeats'


# Average speed per race length, split by how often the athlete appears in the data
length_order = ['50km', '100km', '50mi', '100mi', '6h', '12h', '24h', '48h', '72h', '6d', '10d']
# Keep rows with an average speed of at most 21 (no separate copy is needed: the filter
# already returns a new frame and nothing is assigned into it)
ultra_marathon_copy = ultra_marathon[ultra_marathon['Average_speed'] <= 21]

# One row per athlete. Named aggregation produces flat column names directly
# (Year_min, Year_count, ...), so no multi-level header has to be flattened afterwards.
athlete_summary = ultra_marathon_copy.groupby('Id').agg(
    Year_min=('Year', 'min'),
    Year_max=('Year', 'max'),
    Year_count=('Year', 'count'),      # number of rows with a known year = participations
    Length_min=('Length', 'min'),
    Length_max=('Length', 'max'),
    Average_speed_mean=('Average_speed', 'mean'),
).reset_index()

# Categorise every athlete by the number of participations
athlete_summary['Participation_Category'] = athlete_summary['Year_count'].apply(categorize_participation)

# Merge to associate each result row with its athlete's participation category
ultra_marathon_with_category = pd.merge(ultra_marathon_copy, athlete_summary[['Id', 'Participation_Category']], on='Id')

ultra_marathon_with_category['Length'] = pd.Categorical(ultra_marathon_with_category['Length'], categories=length_order, ordered=True)

# Group by 'Length' and 'Participation_Category' for average speed.
# observed=False is stated explicitly: every race length keeps its row even when it has
# no data, independently of the pandas default for categorical groupers.
speed_by_event_and_category = (
    ultra_marathon_with_category
    .groupby(['Length', 'Participation_Category'], observed=False)['Average_speed']
    .mean()
    .unstack()
)

sns.set_palette("colorblind")

speed_by_event_and_category.plot(kind='bar', figsize=(14, 8), width=0.8)
plt.title('Average Speed by Event and Athlete Participation Category')
plt.xlabel('Event Length')
plt.ylabel('Average Speed (km/h or equivalent)')
plt.xticks(rotation=45)
plt.legend(title='Participation Category')
plt.tight_layout()

plt.show()