In [1]:
import numpy as np 
import pandas as pd 
from scipy import stats 
import matplotlib.pyplot as plt

In [2]:
# Load the race results workbook, then replace every missing cell with 0.
# Later cells use 0 as the "missing" marker (e.g. the Chiptime > 0 filter
# and the Gender == 0 replacement).
df = pd.read_excel('LLHM2023.xlsx')
df = df.fillna(0)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'LLHM2023.xlsx'

In [None]:
# List every column name in the raw results frame.
df.columns

In [None]:
# Per-column dtype and non-null count for the raw frame.
# NOTE: missing cells were filled with 0 at load time, so the non-null
# counts shown here do not reveal how much data was originally missing.
df.info()

In [None]:
# Descriptive statistics, transposed so that each described column is a row.
# NOTE: the 0 placeholders inserted at load time are included in these stats.
df.describe().T

### Data Preparation:

In [None]:
# Columns kept for the analysis: finish time, cumulative split times,
# gender, average speed and category.
ANALYSIS_COLUMNS = [
    'Chiptime',
    'Split - 5K - Cumulative time',
    'Split - 10K - Cumulative time',
    'Split - 15K - Cumulative time',
    'Split - 20K - Cumulative time',
    'Gender',
    'Avg speed',
    'Category',
]

# .copy() makes `data` an independent frame. Without it, `data` is a
# selection derived from `df`, and the column assignments in later cells
# raise SettingWithCopyWarning.
data = df[ANALYSIS_COLUMNS].copy()

In [None]:
# Confirm which columns the analysis frame holds.
data.columns

In [None]:
# Inspect the dtypes of the selected columns before any conversion.
data.info()

**Datetime columns:**
* Convert time objects to integer seconds for later analysis.
* Remove rows with no recorded Chiptime, so that only completed runs are included.

In [None]:
def _to_seconds(times):
    """Convert a column of clock-style durations to whole seconds.

    Each value is stringified and parsed as a timedelta, and the result is
    truncated to an integer number of seconds.

    Parameters
    ----------
    times : pandas.Series
        Column whose string form is parseable as a timedelta.

    Returns
    -------
    pandas.Series
        Integer seconds, carrying the same index as ``times``.
    """
    return pd.to_timedelta(times.astype("str")).dt.total_seconds().astype(int)


SPLIT_COLUMNS = [
    'Split - 20K - Cumulative time',
    'Split - 10K - Cumulative time',
    'Split - 15K - Cumulative time',
    'Split - 5K - Cumulative time',
]

# Work on an independent frame so the assignments below never target a
# view/slice of another DataFrame (avoids SettingWithCopyWarning).
data = data.copy()

# Finish time in seconds, stored in a new column.
data['Chiptime Seconds'] = _to_seconds(data['Chiptime'])

# Remove 0-value rows so only registered, completed participants remain.
data = data[data['Chiptime Seconds'] > 0].copy()

for split_col in SPLIT_COLUMNS:
    # Skip columns that are already numeric: re-running this cell would
    # otherwise stringify the seconds and parse them as nanoseconds,
    # collapsing every split time to 0.
    if pd.api.types.is_numeric_dtype(data[split_col]):
        continue
    # Plain column assignment replaces the column (and its dtype);
    # `data.loc[:, col] = ...` writes into the existing object-dtype column
    # and can leave the integers stored as objects.
    data[split_col] = _to_seconds(data[split_col])
   

In [None]:
# Make sure every cumulative split column carries a numeric dtype.
for split_col in (
    'Split - 20K - Cumulative time',
    'Split - 15K - Cumulative time',
    'Split - 10K - Cumulative time',
    'Split - 5K - Cumulative time',
):
    data[split_col] = pd.to_numeric(data[split_col])




In [None]:
# Re-check the dtypes now that the time columns have been converted.
data.info()

**Category Column:**
* Convert Category elements to a sortable format for later analysis (i.e. remove gender prefix).

In [None]:
# NOTE: this cell is not idempotent - each run strips one more leading
# character from Category. Re-run the notebook from the data-selection cell
# if it needs to be executed again.
data = data.copy()  # independent frame, so the assignment below is not chained

# Drop the first character (the M/F prefix) so the remaining label can be
# sorted and grouped independently of gender, then mark 'BC' as 'Unknown'.
# The result is assigned back: calling .replace(..., inplace=True) on
# data['Category'] acts on a temporary Series and is a silent no-op under
# pandas Copy-on-Write.
data['Category'] = data['Category'].str.slice(1).replace({'BC': 'Unknown'})

# Drop the rows whose category is unknown.
data = data[data['Category'] != 'Unknown'].copy()

In [None]:
# Number of rows for each (Gender, Category) combination.
data.groupby(['Gender','Category']).size()

**Gender Column:**
* Convert Gender elements to a readable format (from m and f to Male and Female).
* Remove entries that do not specify the participant’s gender.

In [None]:
data = data.copy()  # independent frame, so the assignment below is not chained

# Relabel the gender codes. The result is assigned back to the column:
# data["Gender"].replace(..., inplace=True) modifies a temporary Series and
# is a silent no-op under pandas Copy-on-Write.
data["Gender"] = (
    data["Gender"]
    .replace(to_replace="m", value="Male")
    .replace(to_replace="f", value="Female")
    # 0 is the placeholder for a missing gender; the label is only a
    # sentinel for the filter below.
    .replace(to_replace=0, value="Not Specifed")
)

# Drop the rows whose gender was not specified.
data = data[data['Gender'] != 'Not Specifed'].copy()

In [None]:
# Number of rows for each (Gender, Category) combination, using the current Gender labels.
data.groupby(['Gender','Category']).size()

**Average Speed Column:**

* Round the average speed for each participant to 2 decimal places.

In [None]:
# Keep each participant's average speed to two decimal places.
data['Avg speed'] = data['Avg speed'].round(decimals=2)


### Exercises

1. Find the percentages of female and male runners.
2. Find the number of female and male runners in each category. Create a barplot.
3. Plot the histograms of finish time for male and female runners.
4. Plot the histograms of split times (5K, 10K, 15K, 20K). Can you find any difference?
5. Create a boxplot of finish time for each category and gender.
6. Repeat the previous task for the splits. Can you find any difference?

---
7. Use scatter plots to find relationships between different data (for example 5K time and finish time).
8. Compute Pearson's correlations.

---

9. Create univariate linear model(s) to predict the finish time.
10. Create multivariate linear model to predict the finish time.
11. Can you find any relationship between
 * age and predictability of finish time,
 * gender and predictability of finish time,
 * professionality of the runner and predictability of finish time?
 
--- 
12. Create a univariate logistic regression model to predict the gender of a runner.
13. Create a multivariate logistic regression model to predict the gender of a runner.
14. Can you find any relationship between the predictability of the gender and subsets of other variables?
 


In [None]:
# Number of rows per gender (raw counts, not percentages).
data.groupby('Gender').size()