# Creating HR Zone Data from Single Activities (high/low intensity runs)

In [1]:
# Setup 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

pd.options.display.max_rows = 6000
pd.options.display.max_columns = 6000

# Read Data and Create Data Frames
df_all_runs = pd.read_csv('./df_all_runs.csv')
df_max_hr_limit = pd.read_csv('../TRI_250105_Whoop-Data_Analysis/whoop data/max heart rate limit per day.csv')
df_runs = pd.read_csv('../data/df_runs.csv')

  df_all_runs = pd.read_csv('./df_all_runs.csv')


## Cleaning `df_runs`

In [2]:
# Reformat the filename column so that it gets easier to import the files in the next steps
df_runs["filename"] = df_runs["filename"].str.extract(r'(\d+)')[0]  #+ ".fit"

In [3]:
# create list of filenames
file_list = df_runs["filename"] 

## Labelling activities as `high intensity` or `low intensity` 

Creating a process that transforms and then saves heart rate zone data from each individual activity into a new dataframe which can then later be merged with `df_activs`, which holds all the high-level data from all activities

⚠️ In an other notebook we created a function that transformed all .fit activities in to one single .csv file (`df_all_runs`). We use this file in the following steps to label each activity.

## Preparation

1. Read in the file
2. cleaning each of the individual activities

In [4]:
# Read in the file
df_all_runs = pd.read_csv('./df_all_runs.csv')

  df_all_runs = pd.read_csv('./df_all_runs.csv')


In [5]:
# dropping unnecessary columns
df_all_runs = df_all_runs.drop([
"accumulated_power",
"altitude",
"cadence",
"enhanced_altitude",
"enhanced_speed",
"fractional_cadence",
"position_lat",
"position_long",
"power",
"speed",
"stance_time",
"stance_time_balance",
"stance_time_percent",
"step_length",
"unknown_107",
"unknown_134",
"unknown_135",
"unknown_136",
"unknown_137",
"unknown_138",
"unknown_140",
"unknown_143",
"unknown_87",
"unknown_88",
"unknown_90",
"vertical_oscillation",
"vertical_ratio"
], axis=1)

# renaming columns
df_all_runs = df_all_runs.rename({"source_file": "filename"}, axis=1)
df_all_runs = df_all_runs.rename({"activity_type": "sport"}, axis=1)
df_all_runs = df_all_runs.rename({"heart_rate": "heart rate"}, axis=1)

# change datatype
df_all_runs["timestamp"]= pd.to_datetime(df_all_runs["timestamp"])


In [6]:
df_all_runs["year"] = df_all_runs["timestamp"].dt.year

## Transforming zones per minutes

1. Create Heart Rate Zones
2. Aggregate average hr per (moving) minute
3. Label each minute with zone (z1,z2,z3,z4,z5)

In [7]:
# Create Heart Rate Zones

# Note: These HR Zones are based on a max hr of 191, which is the avg max heart rate over my 6 years of training. 
# The zone calculation is therefore an approximation. I would be better to calculate the zones based on my daily or at least yearly max heart rate. 
def assign_zone(hr):
    if hr < 94:        
        return 'below zones' 
    elif 130 <= hr < 140:
        return 'z1'                 
    elif 141 <= hr < 154:
        return 'z2'                 
    elif 155 <= hr < 167:
        return 'z3'                 
    elif 168 <= hr < 179:
        return 'z4'                 
    elif hr > 180:
        return 'z5'
    else:
        return 'no hr collected'


In [8]:
#### Aggregate average hr per (moving) minute

In [9]:
def aggregate_activity_by_minute(activity_df):
    # Ensure the 'timestamp' column is in datetime format
    activity_df['timestamp'] = pd.to_datetime(activity_df['timestamp'], errors='coerce')

    # Floor the timestamp to the minute (removing seconds)
    activity_df['minute'] = activity_df['timestamp'].dt.floor('min')

    # Group by the floored minute and compute the average heart rate for each minute
    agg_df = activity_df.groupby('minute')['heart rate'].mean().reset_index()
    agg_df.rename(columns={'heart rate': 'avg_heart_rate'}, inplace=True)

    # Assign a zone label to each minute based on the average heart rate
    agg_df['zone'] = agg_df['avg_heart_rate'].apply(assign_zone)

    return agg_df

In [10]:
#### Label each minute with zone (1,2,3,4,5)

In [11]:
df_all_runs.head(0)

Unnamed: 0,filename,filenumber,sport,distance,heart rate,timestamp,year


In [12]:

# Create an empty DataFrame to store the per-minute aggregated results for all activities
df_all_runs_min_zones = pd.DataFrame(columns=['filenumber', 'minute', 'avg_heart_rate', 'zone'])


# Group the data by 'filenumber' (each unique activity) and process each one
for filenumber, group in df_all_runs.groupby('filenumber'):
    
    # Aggregate the group by minute
    agg_activity = aggregate_activity_by_minute(group)
    # Add the filenumber column to the aggregated data
    agg_activity['filenumber'] = filenumber
    # Append to the empty DataFrame
    df_all_runs_min_zones = pd.concat([df_all_runs_min_zones, agg_activity], ignore_index=True)

# Reorder columns 
df_all_runs_min_zones = df_all_runs_min_zones[['filenumber', 'minute', 'avg_heart_rate', 'zone']]


  df_all_runs_min_zones = pd.concat([df_all_runs_min_zones, agg_activity], ignore_index=True)
  df_all_runs_min_zones = pd.concat([df_all_runs_min_zones, agg_activity], ignore_index=True)


## Aggregating zones per minutes

How many minutes spend in which zone per activity in `df_runs_mins_zones`

1. count minutes spend per zone per activity
2. count minutes spend in low zones (z1, z2) per activity
3. count minutes spend in high zones (z3,z4,z5) per activity
4. create label for activity ["training mode"]: >30% time in high = high intensity

##### 1. count minutes spend per zone per activity

In [13]:
# For each activity (filenumber) and minute, assume that the zone is constant.
# We'll take the first zone value within each minute.
minute_zone = df_all_runs_min_zones.groupby(['filenumber', 'minute' ])['zone'].first().reset_index()

# Now, group by filenumber and zone and count the number of minutes
zone_counts = minute_zone.groupby(['filenumber', 'zone']).size().reset_index(name='minutes')

# Optional: Pivot the data so that each activity is a row and each zone is a column
zone_counts_pivot = zone_counts.pivot(index='filenumber', columns='zone', values='minutes').reset_index()

# Replace NaN with 0 (if an activity didn't have any minutes in a particular zone)
df_runs_min_zones = zone_counts_pivot.fillna(0)

df_runs_min_zones.tail(2)

zone,filenumber,below zones,no hr collected,z1,z2,z3,z4,z5
339,13930619274,1.0,9.0,16.0,46.0,0.0,0.0,0.0
340,14100622155,0.0,5.0,3.0,12.0,13.0,0.0,0.0


##### Count minutes spend in low zones (z1 & z2)

In [14]:
df_runs_min_zones["time low zones"] = df_runs_min_zones["z1"] + df_runs_min_zones["z2"]

##### Count minutes spend in high zones (z3,4 & z5)

In [15]:
df_runs_min_zones["time high zones"] = df_runs_min_zones["z3"] + df_runs_min_zones["z4"] + df_runs_min_zones["z5"]

##### Labelling activities high or low intensity

In [16]:
#  create label for activity ["activity zone"]: >50% time in low = low intensity

# create a list of our conditions
conditions = [
    ((df_runs_min_zones['time high zones'] / (df_runs_min_zones["time high zones"] + df_runs_min_zones["time low zones"] )) >= 0.3    ),
    ((df_runs_min_zones['time low zones'] / (df_runs_min_zones["time high zones"] + df_runs_min_zones["time low zones"] )) > 0.7    )
    ]

# create a list of the values we want to assign for each condition
values = ['high int', 'low int']

# create a new column and use np.select to assign values to it using our lists as arguments
df_runs_min_zones["training mode"] = np.select(conditions, values)


##### saving `df_runs_min_zones`as .csv

In [17]:

df_runs_min_zones.to_csv('../data/df_runs_min_zones.csv', index=False)
'''
'''

'\n'

## Aggregating zones per distance

How much distance spend in which zone per activity (in 100m) in `df_runs_dist_zones`

1. aggregate average hr per 100 meter 
2. label each 100m with zone (1,2,3,4,5)
3. count distance spend per zone
4. combine z3z4

##### aggregate average hr per 100 meter 

In [18]:
def aggregate_activity_by_100m(df_all_runs):
    
    # Create a bucket column: each bucket represents 100m segments.
    # For example, if distance is 251.68, then floor(251.68/100)*100 = 200.
    df_all_runs['distance 100m'] = (df_all_runs['distance'] // 100) * 100
    
    # Group by the bucket and compute the average heart rate in each segment.
    agg_df = df_all_runs.groupby('distance 100m')['heart rate'].mean().reset_index()
    agg_df.rename(columns={'heart rate': 'heart rate 100m'}, inplace=True)
   
    # Assign zones based on the average heart rate.
    agg_df['zone'] = agg_df['heart rate 100m'].apply(assign_zone)
    
    return agg_df

#### label each 100m with zone (z1,z2,z3,z4,z5)

In [19]:
# Create an empty DataFrame to store the aggregated results for all activities.
df_runs_dist_zones = pd.DataFrame(columns=['filenumber', 'distance 100m', 'heart rate 100m', 'zone'])

# Loop over each activity group 'filenumber' distinguishes activities)
for filenumber, group in df_all_runs.groupby('filenumber'):
    agg_activity = aggregate_activity_by_100m(group)
    agg_activity['filenumber'] = filenumber
    df_runs_dist_zones = pd.concat([df_runs_dist_zones, agg_activity], ignore_index=True)
df_runs_dist_zones.tail(100)

df_runs_dist_zones["distance 100m"] = df_runs_dist_zones["distance 100m"]/100

  df_runs_dist_zones = pd.concat([df_runs_dist_zones, agg_activity], ignore_index=True)
  df_runs_dist_zones = pd.concat([df_runs_dist_zones, agg_activity], ignore_index=True)


##### Count distance spend per zone

In [20]:
# For each activity (filenumber) and minute, assume that the zone is constant.
# We'll take the first zone value within each minute.
distance_zone = df_runs_dist_zones.groupby(['filenumber', 'distance 100m' ])['zone'].first().reset_index()

# Now, group by filenumber and zone and count the number of minutes
zone_counts = distance_zone.groupby(['filenumber', 'zone']).size().reset_index(name='distance 100m')

# Optional: Pivot the data so that each activity is a row and each zone is a column
zone_counts_pivot = zone_counts.pivot(index='filenumber', columns='zone', values='distance 100m').reset_index()

# Replace NaN with 0 (if an activity didn't have any minutes in a particular zone)
df_runs_dist_zones = zone_counts_pivot.fillna(0)

df_runs_dist_zones.tail(1)



zone,filenumber,below zones,no hr collected,z1,z2,z3,z4,z5
340,14100622155,0.0,6.0,5.0,18.0,22.0,0.0,0.0


In [21]:
# renaming the distance columns in order to be able to merge them later with other dataframes and to avoid confusion
df_runs_dist_zones = df_runs_dist_zones.rename({"z1": "dist z1"}, axis=1)
df_runs_dist_zones = df_runs_dist_zones.rename({"z2": "dist z2"}, axis=1)
df_runs_dist_zones = df_runs_dist_zones.rename({"z3": "dist z3"}, axis=1)
df_runs_dist_zones = df_runs_dist_zones.rename({"z4": "dist z4"}, axis=1)
df_runs_dist_zones = df_runs_dist_zones.rename({"z5": "dist z5"}, axis=1)
df_runs_dist_zones = df_runs_dist_zones.rename({"below zones": "dist below zones"}, axis=1)
df_runs_dist_zones = df_runs_dist_zones.rename({"no hr collected": "dist no hr collected"}, axis=1)

In [22]:
# formatting the distance spent in zone columns
df_runs_dist_zones["dist z1"] = df_runs_dist_zones["dist z1"]/10
df_runs_dist_zones["dist z2"] = df_runs_dist_zones["dist z2"]/10
df_runs_dist_zones["dist z3"] = df_runs_dist_zones["dist z3"]/10
df_runs_dist_zones["dist z4"] = df_runs_dist_zones["dist z4"]/10
df_runs_dist_zones["dist z5"] = df_runs_dist_zones["dist z5"]/10
df_runs_dist_zones["dist below zones"] = df_runs_dist_zones["dist below zones"]/10
df_runs_dist_zones["dist no hr collected"] = df_runs_dist_zones["dist no hr collected"]/10


##### saving `df_runs_min_zones`as .csv  

In [24]:
df_runs_dist_zones.to_csv('../data/df_runs_dist_zones.csv', index=False)
'''
'''

'\n'

#### Final steps

In [25]:
# Checking if all three dataframes have the same amount of rows/activities. Since they are, we have all the activities in them. 

print("––––– The original file df_runs contains all running activities. 38 activities, however, where not properly recorded on a fitness watch and have therefore no actual detailed data –––––" )

print(f"All runs ever recorded: {len(df_runs)}")

print("–––––  These are the activities that we have actual data for –––––" )
print(f"All runs with detailed data: {df_all_runs['filenumber'].nunique()}")

print(f"All runs in 'df_runs_dist_zones': {len(df_runs_dist_zones) }")
print(f"All runs in 'df_runs_min_zones': {len(df_runs_min_zones) }")


––––– The original file df_runs contains all running activities. 38 activities, however, where not properly recorded on a fitness watch and have therefore no actual detailed data –––––
All runs ever recorded: 379
–––––  These are the activities that we have actual data for –––––
All runs with detailed data: 341
All runs in 'df_runs_dist_zones': 341
All runs in 'df_runs_min_zones': 341
