In [1]:
# Data Wrangling
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns

# Data Transformation
from sklearn.impute import SimpleImputer

In [2]:
# Location of the extended-activities CSV (raw file served from GitHub)
url = (
    'https://raw.githubusercontent.com/redbackoperations/Projects/main/'
    'Sports%20Performance%20Analysis/frontend/Cycling%20Analysis/data/'
    'extended_activities.csv'
)

# pandas downloads and parses the remote CSV in a single call
df = pd.read_csv(url)

# Preview the first 5 rows (the default for head())
df.head(n=5)

Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Commute,Activity Gear,...,Maximum Power 10s,Maximum Power 30s,Maximum Power 1.0min,Maximum Power 5.0min,Maximum Power 10.0min,Maximum Power 20.0min,Maximum Power 30.0min,Maximum Power 1.0 hr,Maximum Power 1.5 hr,Maximum Power 2.0 hr
0,2929442069,"13 Dec 2019, 01:46:07",Lunch Ride,Ride,3859,22.97,139.0,11.0,False,,...,412.9,342.1,221.2,162.2,136.3,120.8,109.8,105.4,0.0,0.0
1,2945780637,"20 Dec 2019, 23:05:01",Morning Ride,Ride,4852,29.65,133.0,12.0,False,,...,342.1,236.1,184.3,150.6,131.4,122.1,120.3,115.6,0.0,0.0
2,2948028275,"21 Dec 2019, 23:25:29",Morning Ride,Ride,5817,32.38,139.0,19.0,False,,...,390.5,265.1,213.3,146.2,135.1,127.5,125.5,110.2,104.7,0.0
3,2952462113,"24 Dec 2019, 01:19:17",Lunch Ride,Ride,3851,21.68,140.0,11.0,False,,...,452.0,303.1,262.0,161.9,146.7,139.7,134.0,112.3,0.0,0.0
4,2956494096,"26 Dec 2019, 00:09:08",Lunch Ride,Ride,5843,32.36,131.0,14.0,False,,...,342.9,243.9,208.3,134.7,124.0,119.4,115.5,99.8,96.9,0.0


In [4]:
# Isolate the cycling data component: keep only rows whose
# 'Activity Type' is 'Ride'.
# An explicit .copy() makes df_ride an independent frame, so any later
# column assignment on it does not raise SettingWithCopyWarning and
# cannot be confused with a write to df.
df_ride = df[df['Activity Type'] == 'Ride'].copy()

# Preview the first 5 ride rows
df_ride.head()

Unnamed: 0,Activity ID,Activity Date,Activity Name,Activity Type,Elapsed Time,Distance,Max Heart Rate,Relative Effort,Commute,Activity Gear,...,Maximum Power 10s,Maximum Power 30s,Maximum Power 1.0min,Maximum Power 5.0min,Maximum Power 10.0min,Maximum Power 20.0min,Maximum Power 30.0min,Maximum Power 1.0 hr,Maximum Power 1.5 hr,Maximum Power 2.0 hr
0,2929442069,"13 Dec 2019, 01:46:07",Lunch Ride,Ride,3859,22.97,139.0,11.0,False,,...,412.9,342.1,221.2,162.2,136.3,120.8,109.8,105.4,0.0,0.0
1,2945780637,"20 Dec 2019, 23:05:01",Morning Ride,Ride,4852,29.65,133.0,12.0,False,,...,342.1,236.1,184.3,150.6,131.4,122.1,120.3,115.6,0.0,0.0
2,2948028275,"21 Dec 2019, 23:25:29",Morning Ride,Ride,5817,32.38,139.0,19.0,False,,...,390.5,265.1,213.3,146.2,135.1,127.5,125.5,110.2,104.7,0.0
3,2952462113,"24 Dec 2019, 01:19:17",Lunch Ride,Ride,3851,21.68,140.0,11.0,False,,...,452.0,303.1,262.0,161.9,146.7,139.7,134.0,112.3,0.0,0.0
4,2956494096,"26 Dec 2019, 00:09:08",Lunch Ride,Ride,5843,32.36,131.0,14.0,False,,...,342.9,243.9,208.3,134.7,124.0,119.4,115.5,99.8,96.9,0.0


In [5]:
# Print a structural summary of the ride data: index range, column names,
# non-null count per column, and each column's dtype.
df_ride.info()

<class 'pandas.core.frame.DataFrame'>
Index: 181 entries, 0 to 343
Data columns (total 49 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Activity ID                181 non-null    int64  
 1   Activity Date              181 non-null    object 
 2   Activity Name              181 non-null    object 
 3   Activity Type              181 non-null    object 
 4   Elapsed Time               181 non-null    int64  
 5   Distance                   181 non-null    float64
 6   Max Heart Rate             181 non-null    float64
 7   Relative Effort            181 non-null    float64
 8   Commute                    181 non-null    bool   
 9   Activity Gear              162 non-null    object 
 10  Filename                   181 non-null    object 
 11  Athlete Weight             70 non-null     float64
 12  Bike Weight                162 non-null    float64
 13  Moving Time                181 non-null    float64
 14 

In [6]:
# Per-column tally of null entries in the ride data
missing_values = df_ride.isna().sum()

# Show only the attributes that have at least one missing value
missing_values.loc[missing_values > 0]

Activity Gear                 19
Athlete Weight               111
Bike Weight                   19
Elevation Gain                 1
Elevation Loss                 6
Elevation Low                  6
Elevation High                 6
Average Temperature            6
Total Work                     1
Perceived Exertion           180
Prefer Perceived Exertion    179
Perceived Relative Effort    180
Grade Adjusted Distance      181
Average Elapsed Speed         89
Dirt Distance                 89
Total Steps                  181
dtype: int64