## CLEANING TRAINING DATA 🏊 🚴‍♂️ 🏃‍♀️‍➡️

In [1]:
# Setup 

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pd.options.display.max_rows = 6000
pd.options.display.max_columns = 6000

# Read Data and Create Data Frames
df_activs_raw = pd.read_csv('./data/activities.csv')
df_injuries = pd.read_csv('./data/injuries & sicknesses.csv')
df_max_hr_limit = pd.read_csv('../TRI_250105_Whoop-Data_Analysis/whoop data/max heart rate limit per day.csv')
df_exert = pd.read_csv('./data/perceived exertion 2024.csv')

## Functions

In [2]:
###### FUNCTIONS GO HERE ######

def null_count (dataframe):
    ''' 
    This function shows for an inputted dataframe a table of all the null values per columns
    input: your dataframe
    output: dataframe holding the null values count per column of your dataframe
    '''
    null_list = dataframe.isnull().sum()
    # Transforming the list into a dataframe 
    dataframe_nulls = pd.DataFrame(null_list)
    dataframe_nulls.columns = ['Null Count']
    # Filtering: only show columns where null values exist
    dataframe_nulls_only = dataframe_nulls[dataframe_nulls["Null Count"] > 0]
    return dataframe_nulls_only


# Cleaning `df_exert`

In [3]:
df_exert ["date"] = pd.to_datetime(df_exert["date"], format="mixed",  dayfirst = True)

# Cleaning `df_max_hr_limit_raw`

In [4]:
df_max_hr_limit ["date"] = pd.to_datetime(df_max_hr_limit["date"], format="mixed",  dayfirst = True)

# Cleaning `df_injuries`

In [5]:
# make columns lowercase
df_injuries.columns = [x.lower() for x in df_injuries.columns]
# Change data type of time columns
df_injuries["date"] = pd.to_datetime(df_injuries["date"], format="mixed",  dayfirst = True)

# Cleaning `df_activs_raw`

#### Preparing & Cleaning Columns

In [6]:
#1) Make column names lowercase
df_activs_raw.columns = [x.lower() for x in df_activs_raw.columns]

In [7]:
#2a)Dropping unneccesary columns
df_activs_raw = df_activs_raw.drop([
"commute",
"activity private note",
"activity gear",
"athlete weight",
"bike weight",
"elapsed time.1",
"distance",     # which should I drop distance.1 or distance, I think distance.1 also has values for indoor swimming, the other one not
"relative effort.1",
"total work",
"number of runs",
"uphill time",
"downhill time",
"other time",
"max heart rate",
"type",
"start time",
"prefer perceived exertion",
"perceived exertion", # deleting this and later recreating it with data from different dataset
"commute.1",
"total weight lifted",
"from upload",
"bike",
"gear",
"jump count",
"total grit",
"average flow",
"flagged",
"dirt distance",
"newly explored distance",
"newly explored dirt distance",
"activity count",
"weighted average power", 
"power count",
"total steps",
"max watts",
"carbon saved",
"training load",
"intensity",
"timer time",
"total cycles",
"media",
], axis=1)

#2b)Dropping weather-related columns
df_activs_raw = df_activs_raw.drop([
"weather observation time",
"weather condition",
"weather temperature",
"apparent temperature",
"dewpoint",
"humidity",
"weather pressure",
"wind speed",
"wind gust",
"wind bearing",
"precipitation intensity",
"precipitation probability",
"precipitation probability",         
"precipitation type", "cloud cover",                    
"weather visibility",                
"uv index",                        
"weather ozone", 
"sunrise time",
"sunset time",
"moon phase",
], axis=1)

In [8]:
#2b) Rename columns
df_activs_raw = df_activs_raw.rename({"max heart rate.1": "max heart rate"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average speed": "avg speed mps"}, axis=1) # renaming the columns with meters per second to later better identify them and then reformat them to km/h
df_activs_raw = df_activs_raw.rename({"max speed": "max speed mps"}, axis=1)         # renaming the columns with meters per second to later better identify them and then reformat them to km/h
df_activs_raw = df_activs_raw.rename({"average elapsed speed": "avg elapsed speed mps"}, axis=1)  # renaming the columns with meters per second to later better identify them and then reformat them to km/h
df_activs_raw = df_activs_raw.rename({"distance.1": "distance"}, axis=1)

# abbreviating some of the columns to make it easier to work with them
df_activs_raw = df_activs_raw.rename({"average grade": "avg grade"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average positive grade": "avg positive grade"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average negative grade": "avg negative grade"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average cadence": "avg cadence"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average heart rate": "avg heart rate"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average watts": "avg watts"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average temperature": "avg temperature"}, axis=1)
df_activs_raw = df_activs_raw.rename({"average grade adjusted pace": "avg grade adjusted pace"}, axis=1)
df_activs_raw = df_activs_raw.rename({"weighted average power": "weighted avg power"}, axis=1)

In [9]:
#3a) Change data type of time columns
df_activs_raw["activity date"] = pd.to_datetime(df_activs_raw["activity date"], format="mixed",  dayfirst = True)
df_activs_raw["elapsed time"]= pd.to_timedelta(df_activs_raw["elapsed time"], unit='s')
df_activs_raw["moving time"]= pd.to_timedelta(df_activs_raw["moving time"], unit='s')

# Change data type of distance
df_activs_raw["distance"] = pd.to_numeric(df_activs_raw["distance"], errors="coerce")

In [10]:
#3b) Changing the format of speed columns from meter per second to kilometres per hour
df_activs_raw["avg speed"] = df_activs_raw["avg speed mps"] * (18/5)
df_activs_raw["max speed"] = df_activs_raw["max speed mps"] * (18/5)
df_activs_raw["avg elapsed speed"] = df_activs_raw["avg elapsed speed mps"] * (18/5)


# Dropping the columns with meter per second (the new columns with kmh values stay)
df_activs_raw = df_activs_raw.drop(["avg speed mps", "max speed mps", "avg elapsed speed mps"], axis=1)

In [11]:
#3c) Changing the  format of the distance.1 column, which holds more and better values than the other "distance" column, to kilometres
df_activs_raw["distance"] = df_activs_raw["distance"] / 1000

#### Creating New Columns

In [12]:
#4) 
# Split columns and create new columns
df_activs_raw['date'] = pd.to_datetime(df_activs_raw['activity date'], format="mixed").dt.date.astype("datetime64[ns]")
#df_activs_raw['start time'] = pd.to_datetime(df_activs_raw['activity date'], format="mixed").dt.time.astype("datetime64[ns]")     # Uncomment this if you want finish time a clock time but then it becomes an object
df_activs_raw['start time'] = pd.to_datetime(df_activs_raw['activity date'], format="mixed").dt.time
df_activs_raw['start time'] = pd.to_timedelta(df_activs_raw['start time'].astype(str))

# create new column for finish time 
df_activs_raw['finish time'] = df_activs_raw['activity date'] + df_activs_raw['elapsed time']  
df_activs_raw['finish time'] = pd.to_datetime(df_activs_raw['finish time'], format="mixed").dt.time
df_activs_raw['finish time'] = pd.to_timedelta(df_activs_raw['finish time'].astype(str))

#df_activs_raw['finish time'] = pd.to_datetime(df_activs_raw['finish time'], format="mixed").dt.time  # Uncomment this if you want finish time a clock time but then it becomes an object

# create a column for sport, which should hold swim, bike, run instead of the activity type column which holds swim, ride, virtual ride, run
df_activs_raw['sport'] = df_activs_raw['activity type']

df_activs_raw['sport'] =  df_activs_raw['sport'].replace('Virtual Ride', 'Bike')
df_activs_raw['sport'] =  df_activs_raw['sport'].replace('Ride', 'Bike')

# create a column for the zone that the average heart rate during the activity was in    

# Note: These HR Zones are based on a max hr of 191, which is the avg max heart rate over my 6 years of training. 
# The zone calculation is therefore an approximation. I would be better to calculate the zones based on my daily or at least yearly max heart rate. 
def get_hr_zone(hr):
    if hr < 94:        
        return 'below zones' 
    elif 130 <= hr < 140:
        return 'z1'                 
    elif 141 <= hr < 154:
        return 'z2'                 
    elif 155 <= hr < 167:
        return 'z3'                 
    elif 168 <= hr < 179:
        return 'z4'                 
    elif hr > 180:
        return 'z5'
    else:
        return 'no hr collected'

df_activs_raw['avg heart rate zone'] = df_activs_raw['avg heart rate'].apply(get_hr_zone)

# create a column for the training mode, the activity was in. Either high intensity (Z3,4,5) or low intensity. We will fill the column later with values
df_activs_raw['training mode'] = None

#### Dropping rows

In [13]:
#5) Check for missing values (use the function)
null_count(df_activs_raw)

Unnamed: 0,Null Count
activity description,1369
relative effort,955
filename,85
distance,3
elevation gain,126
elevation loss,576
elevation low,546
elevation high,546
max grade,85
avg positive grade,2051


In [14]:
#6a) drop all entries that are entered by WHOOP (these are basically duplicates. Activities that have been recorded on Whoop and Garmin)

df_activs_raw = df_activs_raw[~df_activs_raw["activity name"].str.contains("WHOOP Cycling", case=False, na=False)]
df_activs_raw = df_activs_raw[~df_activs_raw["activity name"].str.contains("WHOOP Running", case=False, na=False)]
df_activs_raw = df_activs_raw[~df_activs_raw["activity name"].str.contains("WHOOP Swimming", case=False, na=False)]

#6b)also deleting all other WHoop-Activties (which can also be duplicates of other Strava activites but with less information)
df_activs_raw = df_activs_raw[~df_activs_raw["activity name"].str.contains("WHOOP", case=False, na=False)]

print (f"Number of remaining rows left: {len(df_activs_raw)}")

Number of remaining rows left: 1992


In [15]:
#6c) drop all entries that are hikes, crossfit and general workouts
df_activs_raw = df_activs_raw.drop(df_activs_raw[df_activs_raw["activity type"]== "Hike"].index)
df_activs_raw = df_activs_raw.drop(df_activs_raw[df_activs_raw["activity type"]== "Crossfit"].index)
df_activs_raw = df_activs_raw.drop(df_activs_raw[df_activs_raw["activity type"]== "Workout"].index)

print (f"Number of remaining rows left: {len(df_activs_raw)}")

Number of remaining rows left: 1756


#### Dropping Duplicates

In [16]:
#7a) Check duplicates (CLASSIC VERSION)
df_activs_raw.duplicated().sum()

0

In [17]:
df_activs_raw.head()

Unnamed: 0,activity id,activity date,activity name,activity type,activity description,elapsed time,relative effort,filename,moving time,distance,elevation gain,elevation loss,elevation low,elevation high,max grade,avg grade,avg positive grade,avg negative grade,max cadence,avg cadence,max heart rate,avg heart rate,avg watts,calories,max temperature,avg temperature,perceived relative effort,grade adjusted distance,pool length,avg grade adjusted pace,avg speed,max speed,avg elapsed speed,date,start time,finish time,sport,avg heart rate zone,training mode
0,1913005302,2018-10-18 16:53:13,Fahrt zum Schwimmen,Ride,,0 days 02:10:14,,activities/1913005302.gpx,0 days 00:46:39,13.4741,55.247833,57.247799,0.0,17.0,16.200001,-0.014843,,,,,,,,,,,,,,,,42.480001,,2018-10-18,0 days 16:53:13,0 days 19:03:27,Bike,no hr collected,
1,1915039748,2018-10-19 17:00:53,Spazierfahrt,Ride,,0 days 01:17:56,,activities/1915039748.gpx,0 days 00:10:38,2.5775,10.25423,,4.5,12.3,5.3,0.034918,,,,,,,,,,,,,,,,34.2,,2018-10-19,0 days 17:00:53,0 days 18:18:49,Bike,no hr collected,
2,1918768385,2018-10-21 13:29:33,5km Training,Run,Mit Dorle,0 days 00:36:26,,activities/1918768385.gpx,0 days 00:36:16,5.0624,38.321098,42.121101,5.6,32.900002,6.8,-0.075063,,,,,,,,407.491119,,,,5102.299805,,,,16.2,,2018-10-21,0 days 13:29:33,0 days 14:05:59,Run,no hr collected,
3,1920958723,2018-10-22 15:19:08,20km Training,Ride,,0 days 01:13:03,,activities/1920958723.gpx,0 days 01:02:05,22.481699,129.917618,132.917999,1.2,32.5,12.7,-0.013344,,,,,,,,,,,,,,,,81.0,,2018-10-22,0 days 15:19:08,0 days 16:32:11,Bike,no hr collected,
4,1921011074,2018-10-18 16:56:49,500m Swim,Swim,(50M10B 09:40Mins + 50M10B 10:15 Mins + 50M10B...,0 days 00:09:40,,,0 days 00:09:40,0.5,0.0,,,,,0.0,,,,,,,,,,,,,,,,,,2018-10-18,0 days 16:56:49,0 days 17:06:29,Swim,no hr collected,


#### Merging `df_activs_raw` & `df_injuries` & `df_max_hr_limit` & `df_exert`

In [18]:
# merging with the injuries dataframe

df_activs_raw = df_activs_raw.merge(df_injuries, how='left', on="date")   

# Filling null values with zeros (meaning: no injury or sickness)
df_activs_raw["injury description"] =  df_activs_raw["injury description"].fillna(0)
df_activs_raw["event"] = df_activs_raw["event"].fillna(0)

# merging again with max yearly heart rate data
df_activs_raw = df_activs_raw.merge(df_max_hr_limit, how='inner', on="date") 


# merging again with the dataframe holding perceived exertion (values only for year 2024)
df_activs_raw = df_activs_raw.merge(df_exert, how='left', on="date") 

#### Final Steps


In [19]:
#8) Set index

#Create column with ID of each activity
df_activs_raw.set_index("activity id", inplace=True)

# dropping  one swim activity which makes no sense with the id 7051509222 (wrong entry)
df_activs_raw.drop(index=7051509222, inplace=True)


In [20]:
#9) Describe
df_activs_raw.describe(include="all")

Unnamed: 0,activity date,activity name,activity type,activity description,elapsed time,relative effort,filename,moving time,distance,elevation gain,elevation loss,elevation low,elevation high,max grade,avg grade,avg positive grade,avg negative grade,max cadence,avg cadence,max heart rate,avg heart rate,avg watts,calories,max temperature,avg temperature,perceived relative effort,grade adjusted distance,pool length,avg grade adjusted pace,avg speed,max speed,avg elapsed speed,date,start time,finish time,sport,avg heart rate zone,training mode,event,injury description,max heart rate yearly,perceived exertion
count,1755,1755,1755,458,1755,830.0,1698,1755,1752.0,1655.0,1415.0,1445.0,1445.0,1698.0,1755.0,0.0,0.0,930.0,961.0,388.0,1001.0,454.0,1512.0,0.0,51.0,28.0,363.0,33.0,40.0,1366.0,1698.0,880.0,1755,1755,1755,1755,1755,0.0,1755.0,1755.0,1755.0,211.0
unique,,717,4,421,,,1698,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,7,0.0,7.0,11.0,,
top,,Morning Ride,Ride,Power Workout,,,activities/1913005302.gpx,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Bike,no hr collected,,0.0,0.0,,
freq,,178,710,11,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,912,1113,,1664.0,1664.0,,
mean,2021-08-06 22:52:47.764672256,,,,0 days 01:12:02.067236467,42.926506,,0 days 00:41:48.634757834,12.023686,80.94242,188.675991,27.085052,58.894187,6.353636,0.011376,,,72.244086,59.437172,151.492268,113.042018,120.270578,395.269021,,13.058824,215.107143,7042.919295,35.606061,3.10701,14.403265,27.636735,10.832994,2021-08-06 08:53:20,0 days 13:59:27.764672364,0 days 14:51:48.293447293,,,,,,191.407407,6.781991
min,2018-10-18 16:49:00,,,,0 days 00:00:18,0.0,,0 days 00:00:14,0.0,0.0,-2.9,-14.5,-2.2,0.0,-3.982808,,,20.0,0.0,90.0,0.0,21.836105,2.0,,-7.0,26.0,351.200012,25.0,2.01896,0.0,0.0,0.0,2018-10-18 00:00:00,0 days 00:15:04,0 days 00:25:37,,,,,,189.0,3.0
25%,2019-12-17 12:06:18.500000,,,,0 days 00:14:17.500000,5.0,,0 days 00:12:59.500000,2.0,0.0,1.733335,3.4,11.9,1.304812,-0.004073,,,31.0,28.644117,141.0,109.365715,82.775219,110.0,,8.0,59.25,5018.599854,25.0,2.890249,3.513665,14.9976,3.019986,2019-12-17 00:00:00,0 days 10:31:02,0 days 11:47:10.500000,,,,,,190.0,5.0
50%,2021-07-11 13:34:04,,,,0 days 00:24:57,22.0,,0 days 00:21:52,5.5365,26.959047,78.0,5.4,16.4,4.2,0.0,,,87.0,80.351906,151.0,129.0,121.80183,281.0,,15.0,135.5,5082.100098,25.0,3.118327,11.417393,29.7,6.699661,2021-07-11 00:00:00,0 days 15:46:07,0 days 16:25:40,,,,,,191.0,7.0
75%,2023-03-04 17:59:34.500000,,,,0 days 00:50:36,60.0,,0 days 00:42:35.500000,10.02265,59.0,211.5,10.0,28.0,7.4,0.03494,,,103.0,83.919464,164.0,152.294907,148.638218,463.25,,19.0,198.0,7955.449951,50.0,3.373449,21.816001,39.239999,13.081679,2023-03-04 00:00:00,0 days 17:25:30,0 days 17:57:57,,,,,,193.0,8.0
max,2025-02-11 19:12:17,,,,21 days 02:01:58,673.0,,0 days 10:43:46,183.640859,2660.0,2674.0,1173.599976,2077.199951,49.908257,3.732971,,,172.0,100.789169,188.0,186.734299,313.335968,3927.0,,27.0,1472.0,43289.601562,50.0,3.837681,37.021011,83.879997,36.223287,2025-02-11 00:00:00,0 days 23:23:47,0 days 23:34:30,,,,,,194.0,11.0


In [21]:
#10) Rearrange columns order 

#10a) create new colums for paces (we will fill them in a later step)

df_activs_raw["swim pace"] = None
df_activs_raw["bike pace"] = None
df_activs_raw["run pace"] = None

df_activs_raw = df_activs_raw[[ 'date', 'activity date', 'sport', 'activity type', 'training mode', 'start time', 'moving time', 'elapsed time', 'finish time',  
                             
        'swim pace', 'bike pace', 'run pace',  'distance', 'grade adjusted distance', 'avg grade adjusted pace',
        'avg heart rate', 'avg heart rate zone','max heart rate', 'max heart rate yearly',  'avg speed', 'max speed','avg elapsed speed',
        'relative effort','calories','perceived exertion', 
        'pool length',
    
        'elevation gain', 'elevation loss', 'elevation low', 'elevation high', 
        'max grade', 'avg grade','avg positive grade', 'avg negative grade', 
        
        'avg watts','avg cadence', 'max cadence', 

        'activity name', 'activity description',  
        
        'max temperature', 'avg temperature', 'perceived relative effort', 
        'event',  'injury description', 
      
        'filename' ]]

# Creating Subsets

In [22]:
# 1) Create subsets for each discipline
df_activs_swim_raw =  df_activs_raw[df_activs_raw["activity type"] == "Swim"] 
df_activs_run_raw =  df_activs_raw[df_activs_raw["activity type"] == "Run" ]
df_activs_bike_raw = df_activs_raw[df_activs_raw["activity type"].isin(["Ride", "Virtual Ride"])]

#### Creating a Swim Subset

In [23]:
# SWIM 💧
#2a)Dropping unnecessary columns
df_activs_swim_raw = df_activs_swim_raw.drop([
'grade adjusted distance', 'avg grade adjusted pace', 'elevation gain', 'elevation loss', 
'elevation low', 'elevation high','max grade', 'avg grade', 'avg positive grade','avg negative grade', 
'avg watts', 'avg cadence', 'max cadence','max temperature','avg temperature',
'perceived relative effort', "avg elapsed speed"
], axis=1)

#Create new column for swim pace which is normally measured in time per 100 meters
df_activs_swim_raw["swim pace"] = df_activs_swim_raw["moving time"] / df_activs_swim_raw["distance"] / 10


Solving inherent problems in the swim dataset

In [24]:
# Merging Swim Activities that are actually part of one activity

# PROBLEM: 
# The first years, I did not know that you could interrupt a swim activity on my fitness watch. 
# Therefore, whenever I took during a swim session a short break, I then afterwards started a new activity. 
# this, however, distorts the total amount of swim workouts I had

# SOLUTION: 
# Step 1: Find days with more than one swim activity 
# Step 2: If these swim activities are shortly after each other, they belong to the same workout
# Step 3: Merge (sum & average) the values in the columns together
# Step 4: Create a dataframe with the merged swims (df_swims_singles_raw)
# Step 5: Create dataframe for normal swims 
# Step 6: Combine both swim dataframes


# Step 1: Find days with more than one swim activity 
df_filtered = df_activs_swim_raw[df_activs_swim_raw.groupby("date")["date"].transform("count") > 1]

# Step 2: If these swim activities are shortly after each other, they belong to the same workout 
# checked this manually 

In [25]:
# Step 3) merge all activities except 3121215271 & 3122793348 as well as 2681382064 & 2681382159 🚧 🚧 🚧 🚧 🚧 🚧 🚧 🚧 
# dropping the activities which are indeed individual activities 
df_filtered.drop(index=3121215271, inplace=True)
df_filtered.drop(index=3122793348, inplace=True)
df_filtered.drop(index=2681382064, inplace=True)
df_filtered.drop(index=2681382159, inplace=True)

# creating a dataframe that merges sum of the columns' values in one row per date
df_filtered_merged_1 = df_filtered.groupby("date")[[ "moving time","distance"]].sum().reset_index()  # these are now the actual values of the combined activities

# to also get the missing columns back (all expect date, moving time & distance), we just take the first activity per day
df_filtered_merged_2= df_filtered.drop_duplicates(subset="date", keep="first")
# now we also drop the columns where we have not the aggregated values 
df_filtered_merged_2 = df_filtered_merged_2.drop([
    "moving time","elapsed time","finish time","distance","avg speed", "max speed", "calories", "swim pace",  "activity name", "activity description", "filename"  # swim pace? is that right? 🔴 🔴 🔴 🔴
], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(index=3121215271, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(index=3122793348, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(index=2681382064, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(inde

In [26]:
# Step 4: Create a dataframe with the merged swims (df_swims_singles_raw)

# Merging the two new placeholder dataframes as a substep
df_filtered_merged_2 = df_filtered_merged_2.reset_index()
df_swims_singles_raw = df_filtered_merged_1.merge(df_filtered_merged_2, how='outer')

# reseting the index to activity id
df_swims_singles_raw.set_index("activity id", inplace=True)

# Creating columns that `df_activs_swims_raw` has too but which are still missing here
df_swims_singles_raw["elapsed time"] = None
df_swims_singles_raw["finish time"] = None
df_swims_singles_raw["max speed"] = None
df_swims_singles_raw["calories"] = None
df_swims_singles_raw["activity name"] = None
df_swims_singles_raw["activity description"] = None
df_swims_singles_raw["filename"] = None


df_swims_singles_raw["avg speed"] =  None ### I should be able to calculate that, but skipped it for now 
df_swims_singles_raw["swim pace"] = df_swims_singles_raw["moving time"] / df_swims_singles_raw["distance"] / 10   # again recreating the swim pace for the merged activities

At this point we do have one ready dataframe `df_swims_singles_raw` that holds all the merged swim activities that once where recorded separately even though they where one activity. But no we need to merge them with all the normal swims, that where properly recorded. 

In [27]:
# Step 5: Create dataframe for normal swims 

## create a list of all activity ids of activities that are in the df_filtered 
df_filtered = df_filtered.reset_index()
detached_swims_series = df_filtered["activity id"]

# creating a dataframe that only holds the "normal" swim activities that are properly recorded and not split up in separate activities
df_swims_normals_raw = df_activs_swim_raw
df_swims_normals_raw.drop(index=detached_swims_series, inplace=True, errors="ignore")

In [28]:
# Step 6: Combine both swim dataframes


# Rearranging the order of the column to be able to merge the two dataframes
df_swims_normals_raw = df_swims_normals_raw[[ "date", "activity date", "sport", "activity type", "training mode", "start time", "moving time", "elapsed time", 
        "finish time","swim pace", "distance", "avg heart rate", "avg heart rate zone", "max heart rate", "max heart rate yearly", "avg speed", "max speed", 
        "relative effort", "calories",'perceived exertion', "pool length", "activity name", "activity description",  'event', 'injury description', "filename"
 ]]

df_swims_singles_raw = df_swims_singles_raw[[  "date", "activity date", "sport", "activity type","training mode", "start time", "moving time", "elapsed time", 
        "finish time","swim pace", "distance", "avg heart rate", "avg heart rate zone", "max heart rate", "max heart rate yearly", "avg speed", "max speed", 
        "relative effort", "calories",'perceived exertion', "pool length", "activity name", "activity description",  'event', 'injury description', "filename"
]]

# Finally concatting the two dataframes holding the normal swims and the merged swims
df_activs_swim_raw = pd.concat([df_swims_singles_raw, df_swims_normals_raw], axis=0)

  df_activs_swim_raw = pd.concat([df_swims_singles_raw, df_swims_normals_raw], axis=0)


#### Creating a Bike Subset

In [29]:
# BIKE 🚴‍♀️
# Dropping unnecessary columns
df_activs_bike_raw = df_activs_bike_raw.drop([
'grade adjusted distance','avg grade adjusted pace', 
'pool length','avg positive grade','avg negative grade', 
'max temperature','avg temperature', 'perceived relative effort', 'swim pace', 'run pace'
], axis=1)

# Create new column for pace
df_activs_bike_raw["bike pace"] = df_activs_bike_raw["moving time"]/df_activs_bike_raw["distance"]

dropping `commute` rides to work and in town which were no proper workouts

In [30]:
# Some bike rides are basically commutes in the town and not really proper activities. 

# drop all activities that have anything like "commute" or "commuting" in the title
df_activs_bike_raw = df_activs_bike_raw[~df_activs_bike_raw["activity name"].str.contains("Commut", case=False, na=False)]


# Since my way to work was around 6 km, I assume that all rides shorter than 7km are commutes, and therefore I make them a new dataframe
df_activs_commutes_raw =  df_activs_bike_raw[df_activs_bike_raw["distance"] < 7.0] 

# create a list of IDs of all commute activities 
df_activs_commutes_raw = df_activs_commutes_raw.reset_index()
commute_id_list = df_activs_commutes_raw["activity id"]

# drop these commutes from the cycling dataframe  
df_activs_bike_raw = df_activs_bike_raw.drop(index=commute_id_list, errors="ignore")

dropping duplicate bike workouts

In [31]:
# Check duplicates in the bike subset (SPECIFIC VERSION)

# PROBLEM: 
# Activities are recorded on three different devices: Garmin Sport Watch, Wahoo Bike Computer and (sometimes) on the Whoop fitness tracker. 
# Hence, it could be that the same activity was uploaded twice but via different devices. 

# SOLUTION: 
# Step 1: Logic: Check days that have more than one activity of the same type on one day.
# Step 2: If both of these activities have e.g. a similar distance, they are probably duplicates. 
# Step 3: Drop these

# Since Swims were never recorded on two devices, and runs neither. I only check rides
df_activs_bike_raw_filtered = df_activs_bike_raw[df_activs_bike_raw.groupby("date")["date"].transform("count") > 1]
# I then export this .csv and checked it manually for any inconsistencies and deleted the duplicates directly in the training platform (database) Strava and reimported them 

#### Creating a Run Subset

In [32]:
# RUN 🏃‍♂️
#2c)Dropping unnecessary columns
df_activs_run_raw = df_activs_run_raw.drop([
'pool length','avg positive grade','avg negative grade', 
'avg watts', 'max temperature',
'avg temperature','perceived relative effort', 'bike pace', 'swim pace'
], axis=1)

# Create new column for pace (WORKING, BUT I DONT LIKE THE FORMAT)
df_activs_run_raw["run pace"] = df_activs_run_raw["moving time"]/df_activs_run_raw["distance"]

# Creating and Saving the final Subset Dataframes

In [33]:
df_swims = df_activs_swim_raw
df_bikes = df_activs_bike_raw
df_runs = df_activs_run_raw

In [34]:
# saving the subsets to csv. Uncomment to save 💾 💾 💾

# saving to .csv
'''
df_swims.to_csv('df_swims.csv', index=True)
df_bikes.to_csv('df_bikes.csv', index=True)
df_runs.to_csv('df_runs.csv', index=True)

'''

'\n'

# Creating and Saving the final Dataframes with all Activities

#### Bringing the cleaned swim, bike, run subsets together in a all activity dataset `df_activs`

In [35]:
# The sport-specific datasets do not have all the same columns. So I need to add the missing columns again, or I cant concat the three subsets again together

# Create new empty columns in the swims subset
df_activs_swim_raw["elevation gain"] = None
df_activs_swim_raw["elevation loss"] = None
df_activs_swim_raw["elevation low"] = None
df_activs_swim_raw["elevation high"] = None
df_activs_swim_raw["max grade"] = None
df_activs_swim_raw["avg grade"] = None
df_activs_swim_raw["avg watts"] = None
df_activs_swim_raw["avg cadence"] = None
df_activs_swim_raw["max cadence"] = None
df_activs_swim_raw["avg elapsed speed"] = None
#df_activs_swim_raw["perceived exertion"] = None     # 🚨 you might want to delete this here, once you entered it above newly
#df_activs_swim_raw["avg heart rate zone"] = None        # 🚨 you might want to delete this here, once you entered it above newly
df_activs_swim_raw["bike pace"] = None 
df_activs_swim_raw["run pace"] = None 

# Create new empty columns in the bike subset
df_activs_bike_raw["pool length"] = None
#df_activs_bike_raw["perceived exertion"] = None     # 🚨 you might want to delete this here, once you entered it above newly
#df_activs_bike_raw["avg heart rate zone"] = None        # 🚨 you might want to delete this here, once you entered it above newly
df_activs_bike_raw["run pace"] = None 
df_activs_bike_raw["swim pace"] = None 

# Create new empty columns in the run subset
df_activs_run_raw["pool length"] = None
df_activs_run_raw["avg watts"] = None
#df_activs_run_raw["perceived exertion"] = None     # 🚨 you might want to delete this here, once you entered it above newly
#df_activs_run_raw["avg heart rate zone"] = None        # 🚨 you might want to delete this here, once you entered it above newly
df_activs_run_raw["swim pace"] = None 
df_activs_run_raw["bike pace"] = None 

# Rearringing the order of columns in the three subsets
df_activs_swim_raw = df_activs_swim_raw[[ 'date', 'activity date', 'sport', 'activity type', 'training mode', 'start time', 'moving time', 'elapsed time', 'finish time',                     
        'swim pace', 'bike pace', 'run pace', 'distance', 'avg heart rate', 'avg heart rate zone', 'max heart rate',  "max heart rate yearly",  'avg speed', 'max speed','avg elapsed speed',
        'relative effort','calories',
        'pool length',
        'elevation gain', 'elevation loss', 'elevation low', 'elevation high', 
        'max grade', 'avg grade',
        'avg watts', 'avg cadence', 'max cadence', 
        'activity name', 'activity description', 'perceived exertion', 'event',  'injury description', 
        'filename' ]]

df_activs_bike_raw = df_activs_bike_raw[[ 'date', 'activity date', 'sport', 'activity type', 'training mode', 'start time', 'moving time', 'elapsed time', 'finish time',                     
         'swim pace', 'bike pace', 'run pace','distance', 'avg heart rate', 'avg heart rate zone', 'max heart rate', "max heart rate yearly",  'avg speed', 'max speed','avg elapsed speed',
        'relative effort','calories',
        'pool length',
        'elevation gain', 'elevation loss', 'elevation low', 'elevation high', 
        'max grade', 'avg grade',
        'avg watts', 'avg cadence', 'max cadence', 
        'activity name', 'activity description', 'perceived exertion', 'event',  'injury description', 
        'filename' ]]

df_activs_run_raw = df_activs_run_raw[[ 'date', 'activity date', 'sport', 'activity type', 'training mode', 'start time', 'moving time', 'elapsed time', 'finish time',                     
       'swim pace', 'bike pace', 'run pace', 'distance',  'avg heart rate', 'avg heart rate zone','max heart rate',  "max heart rate yearly", 'avg speed', 'max speed','avg elapsed speed',
        'relative effort','calories',
        'pool length',
        'elevation gain', 'elevation loss', 'elevation low', 'elevation high', 
        'max grade', 'avg grade',
        'avg watts', 'avg cadence', 'max cadence', 
        'activity name', 'activity description', 'perceived exertion','event',  'injury description', 
        'filename' ]]


In [36]:
# Finally concatting the three dataframes into one set
df_activs_concat1 = pd.concat([df_activs_swim_raw, df_activs_bike_raw], axis=0)
df_activs_concats = pd.concat([df_activs_concat1, df_activs_run_raw], axis=0)

  df_activs_concat1 = pd.concat([df_activs_swim_raw, df_activs_bike_raw], axis=0)
  df_activs_concats = pd.concat([df_activs_concat1, df_activs_run_raw], axis=0)
  df_activs_concats = pd.concat([df_activs_concat1, df_activs_run_raw], axis=0)


#### Saving Subsets to .csv

In [37]:
# creating final dataframes

df_activs = df_activs_concats

# And saving it. Uncomment to save 💾 💾 💾
'''
df_activs.to_csv('df_activs.csv', index=True)  

'''

'\n'