# Quick Data Visualization on UTMB 2022 results
https://live.utmb.world/utmb/utmb

In [1]:
import functools
import json
import statistics

import chardet
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

## Data collection
The data was collected by a separate script. Here we have the rankings of all the finishers, with information about every runner.

In [2]:
File = "../data/rankings.json"

# Detect the text encoding from the raw bytes. The context manager makes sure
# the binary handle is closed as soon as the bytes have been read.
with open(File, 'rb') as raw_file:
    enc = chardet.detect(raw_file.read())['encoding']

# Opening JSON file (the `with` block closes it, no explicit close() needed)
with open(File, 'r', encoding=enc) as f:
    data = json.load(f)

runners = data['runners']
# printing first runner as a sample
runners[0]

{'raceId': 'utmb',
 'raceName': 'UTMB®',
 'raceCategory': '100m',
 'status': 'f',
 'bib': 1,
 'info': {'fullname': 'Kilian JORNET BURGADA',
  'initials': 'KJ',
  'age': '35',
  'countryCode': 'ES',
  'category': '35-39M',
  'sex': 'H',
  'club': 'NNORMAL ',
  'index': 945,
  'photo': 'worldseries/Members/f6b6bdde-8e09-4933-9423-351fb19c3f05',
  'photoWebTv': 'worldseries/Members/Admin/JORNET_BURGADA.KILIAN_wglfbe',
  'url': 'https://utmb.world/runner/2704.kilian.jornetburgada',
  'teamMembers': None},
 'ranking': {'scratch': '1', 'sex': '1', 'category': '1'},
 'lastLocation': None,
 'prediction': {'lastPointId': 143,
  'lastPassing': '2022-08-27T14:01:10.810Z',
  'nextPointId': None,
  'nextPointPrediction': None},
 'raceTime': '19:49:30',
 'start': '2022-08-26T15:59:57.000Z',
 'isFinisher': True,
 'diffToFirst': '00:00:00'}

We want to slightly reshape the data to make it easier to process. We will flatten the ranking so that every runner becomes a single flat record, with no nested sub-objects.

In [3]:
def flatten_json(y):
    """Flatten nested dicts/lists into a single-level dict.

    Keys of the result are the path to each leaf value, with the path
    components joined by '_'. Dict keys are used as-is and list items are
    identified by their position. Empty dicts and empty lists contribute
    no entries.
    """
    flat = {}

    def _walk(node, prefix=''):
        kind = type(node)
        if kind is dict:
            # Descend into every value, extending the path with its key.
            for key, child in node.items():
                _walk(child, prefix + key + '_')
            return
        if kind is list:
            # Descend into every item, extending the path with its position.
            for position, child in enumerate(node):
                _walk(child, prefix + str(position) + '_')
            return
        # Leaf value: strip the trailing '_' separator from the path.
        flat[prefix[:-1]] = node

    _walk(y)
    return flat

 
# Driver code: one flat record per runner
runners_flattened = [flatten_json(runner) for runner in runners]

We now convert this information into a DataFrame, so that we can easily use the data visualization tools.

In [4]:
# options to be activated for developing and debugging
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

df = pd.DataFrame(runners_flattened)

# Several of these fields are strings in the JSON (e.g. age and rankings);
# convert them so they can be used for statistics and as plot axes.
numeric_columns = ["bib", "info_age", "info_index", "ranking_scratch", "ranking_sex", "ranking_category"]
for col in numeric_columns:
    df[col] = pd.to_numeric(df[col])

# ISO-8601 timestamp strings -> datetimes
datetime_columns = ["prediction_lastPassing", "start"]
for col in datetime_columns:
    df[col] = pd.to_datetime(df[col])

# "HH:MM:SS" duration strings -> timedeltas
df["diffToFirst"] = pd.to_timedelta(df["diffToFirst"])
df["raceTime"] = pd.to_timedelta(df["raceTime"])

df

Unnamed: 0,raceId,raceName,raceCategory,status,bib,info_fullname,info_initials,info_age,info_countryCode,info_category,...,ranking_category,lastLocation,prediction_lastPointId,prediction_lastPassing,prediction_nextPointId,prediction_nextPointPrediction,raceTime,start,isFinisher,diffToFirst
0,utmb,UTMB®,100m,f,1,Kilian JORNET BURGADA,KJ,35,ES,35-39M,...,1,,143,2022-08-27 14:01:10.810000+00:00,,,0 days 19:49:30,2022-08-26 15:59:57+00:00,True,0 days 00:00:00
1,utmb,UTMB®,100m,f,41,Mathieu BLANCHARD,MB,35,FR,35-39M,...,2,,143,2022-08-27 12:37:44.200000+00:00,,,0 days 19:54:50,2022-08-26 15:59:57+00:00,True,0 days 00:05:20
2,utmb,UTMB®,100m,f,11,Thomas EVANS,TE,30,GB,20-34M,...,1,,143,2022-08-27 13:39:57.100000+00:00,,,0 days 20:34:35,2022-08-26 15:59:57+00:00,True,0 days 00:45:05
3,utmb,UTMB®,100m,f,3,Jim WALMSLEY,JW,32,US,20-34M,...,2,,143,2022-08-27 13:36:10.710000+00:00,,,0 days 21:12:12,2022-08-26 15:59:57+00:00,True,0 days 01:22:42
4,utmb,UTMB®,100m,f,33,Zach MILLER,ZM,34,US,20-34M,...,3,,143,2022-08-27 15:04:13.150000+00:00,,,0 days 21:27:50,2022-08-26 15:59:57+00:00,True,0 days 01:38:20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1784,utmb,UTMB®,100m,f,2783,Pierre GRIFFATON,PG,50,FR,50-54M,...,233,,143,2022-08-28 14:24:07.660000+00:00,,,1 days 22:24:10,2022-08-26 15:59:57+00:00,True,1 days 02:34:40
1785,utmb,UTMB®,100m,f,2801,Radu GULIE,RG,41,RO,40-44M,...,410,,143,2022-08-28 14:47:51.290000+00:00,,,1 days 22:26:23,2022-08-26 15:59:57+00:00,True,1 days 02:36:53
1786,utmb,UTMB®,100m,f,861,Roxana DIRVAREANU,RD,39,RO,35-39F,...,22,,143,2022-08-28 14:47:44.490000+00:00,,,1 days 22:26:24,2022-08-26 15:59:57+00:00,True,1 days 02:36:54
1787,utmb,UTMB®,100m,f,2347,Masahiro TAZO,MT,55,JP,55-59M,...,101,,143,2022-08-28 14:57:40.560000+00:00,,,1 days 22:48:44,2022-08-26 15:59:57+00:00,True,1 days 02:59:14


In [5]:
def stats(sample):
    """Print summary statistics of a sample.

    Missing values are dropped first. The count, average, median, mode,
    lowest and highest value are then printed, one per line.

    Parameters
    ----------
    sample : object exposing ``dropna()`` whose result is iterable and
        exposes ``count()`` (the notebook passes DataFrame columns).

    If nothing is left after dropping missing values, only a count of 0 is
    printed: the other statistics are undefined for an empty sample and the
    ``statistics`` functions would raise ``StatisticsError``.
    """
    sample = sample.dropna()
    count = sample.count()
    if count == 0:
        print("{:8}: {:4}".format("Count", count))
        return

    print("{:8}: {:4}\n{:8}: {:4.1f}\n{:8}: {:4}\n{:8}: {:4}\n{:8}: {:4}\n{:8}: {:4}"
          .format(
              "Count",
              count,
              "Average",
              statistics.mean(sample),
              "Median",
              statistics.median(sample),
              "Mode",
              statistics.mode(sample),
              "Lower",
              min(sample),
              "Higher",
              max(sample)
          )
         )
    
# Summary statistics for the age of every runner in the DataFrame.
stats(df["info_age"])

Count   : 1789
Average : 43.7
Median  :   44
Mode    :   45
Lower   :   22
Higher  :   74


## Visualization
We can now start exploring some plots. Let's see the age of the runners based on their ranking, and let's focus on the first 200 finishers.

`ranking_scratch` represents the final ranking of the runner

### Age of athletes

In [8]:
# Age against final ranking for every finisher, followed by summary statistics.
fig = px.line(data_frame=df, x='ranking_scratch', y="info_age")
fig.show()
stats(df["info_age"])

print("Same analysis for top 200 finishers:")
# The first 200 rows of the DataFrame.
top_200 = df.head(200)
fig = px.line(data_frame=top_200, x='ranking_scratch', y="info_age")
fig.show()
stats(top_200["info_age"])

Count   : 1789
Average : 43.7
Median  :   44
Mode    :   45
Lower   :   22
Higher  :   74
Same analysis for top 200 finishers:


Count   :  200
Average : 38.4
Median  : 38.0
Mode    :   37
Lower   :   27
Higher  :   59


### UTMB index

In [7]:
# UTMB index against final ranking for every finisher, followed by summary statistics.
fig = px.line(data_frame=df, x='ranking_scratch', y="info_index")
fig.show()
stats(df["info_index"])

print("Same analysis for top 200 finishers:")
# The first 200 rows of the DataFrame.
top_200 = df.head(200)
fig = px.line(data_frame=top_200, x='ranking_scratch', y="info_index")
fig.show()
stats(top_200["info_index"])

Count   : 1708
Average : 572.4
Median  : 559.0
Mode    : 589.0
Lower   : 350.0
Higher  : 945.0
Same analysis for top 200 finishers:


Count   :  199
Average : 741.3
Median  : 732.0
Mode    : 832.0
Lower   : 567.0
Higher  : 945.0


### Time of arrival

The time difference from the first finisher. The unit of measurement on the y-axis is unclear: it comes from an automatic conversion of the timedelta type, so it could be something like milliseconds. The exact unit does not matter much here, since the point of interest is the shape of the curve.

In [8]:
# Gap to the winner against final ranking for every finisher.
# NOTE(review): diffToFirst is a timedelta column; confirm how plotly renders
# it before reading absolute values off the y-axis.
fig = px.line(data_frame=df, x='ranking_scratch', y="diffToFirst")
fig.show()

print("Same analysis for top 200 finishers:")
# The first 200 rows of the DataFrame.
top_200 = df.head(200)
fig = px.line(data_frame=top_200, x='ranking_scratch', y="diffToFirst")
fig.show()

Same analysis for top 200 finishers:


### Number of finishers per category

In [9]:
# Number of finishers in each category, over the whole field.
fig = px.histogram(data_frame=df, x='info_category')
fig.show()

print("Same analysis for top 200 finishers:")
# The first 200 rows of the DataFrame.
top_200 = df.head(200)
fig = px.histogram(data_frame=top_200, x='info_category')
fig.show()

Same analysis for top 200 finishers:


### Most present countries
Number of finishers per country (the 30 most represented countries)

In [10]:
country_list = df['info_countryCode']

# One row per country with its number of finishers, most represented first:
# tag every finisher with a count of 1, sum the counts per country, then sort
# in descending order.
df_c = (
    pd.DataFrame({'countries': country_list, 'num': 1})
    .groupby('countries')
    .sum()
    .sort_values('num', ascending=False)
    .reset_index()
)

# Only the 30 most represented countries are plotted to keep the chart readable.
fig = px.histogram(df_c[:30], x='countries', y='num')
fig.show()