# Joining Activity Data & Sleep Data

In [1]:
# Setup 

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Display settings: let pandas render up to 6000 rows / columns before truncating
pd.set_option("display.max_rows", 6000)
pd.set_option("display.max_columns", 6000)

# Load the physical-data table and the activity tables from CSV
df_phys = pd.read_csv('../data/df_phys.csv')
df_swims = pd.read_csv('../data/df_swims.csv')
df_bikes = pd.read_csv('../data/df_bikes.csv')
df_runs = pd.read_csv('../data/df_runs.csv')
df_activs = pd.read_csv('../data/df_activs.csv')

# Load the heart rate zone tables for runs (per distance and per minute)
df_runs_dist_zones = pd.read_csv('../data/df_runs_dist_zones.csv')
df_runs_min_zones = pd.read_csv('../data/df_runs_min_zones.csv')

# Store "filenumber" with object dtype in both zone tables
df_runs_dist_zones = df_runs_dist_zones.astype({"filenumber": object})
df_runs_min_zones = df_runs_min_zones.astype({"filenumber": object})

In [2]:
# Derive a "filenumber" column from the first run of digits in each "filename",
# so the activity tables can be merged with the heart rate zone tables that were
# built from the individual activity files (the ".fit" suffix is not appended).
for activity_frame in (df_swims, df_bikes, df_runs, df_activs):
    activity_frame["filenumber"] = activity_frame["filename"].str.extract(r'(\d+)', expand=False)


In [3]:
# Disabled code, kept as a bare string literal so it is not executed:
# it would cast the "filenumber" column of the four activity tables to float.
'''df_swims["filenumber"] = df_swims["filenumber"].astype(float)
df_bikes["filenumber"] = df_bikes["filenumber"].astype(float)
df_runs["filenumber"] = df_runs["filenumber"].astype(float)
df_activs["filenumber"] = df_activs["filenumber"].astype(float)'''

'df_swims["filenumber"] = df_swims["filenumber"].astype(float)\ndf_bikes["filenumber"] = df_bikes["filenumber"].astype(float)\ndf_runs["filenumber"] = df_runs["filenumber"].astype(float)\ndf_activs["filenumber"] = df_activs["filenumber"].astype(float)'

## Joining All Activities with Heart Rate Zones (per minute & per distance)

#### Preparation before merging 

In [4]:
# "training mode" is an empty column at this point; remove it so that the filled
# column of the same name can be brought in by the merge in the next steps
df_activs = df_activs.drop(columns=["training mode"])

In [5]:
# The merge key "filenumber" has to be an integer column, but the cast is not
# possible while NaNs are present, so missing filenumbers get the placeholder 0
# for the time being.
df_activs = df_activs.assign(filenumber=df_activs["filenumber"].fillna(0))

#### Merging `df_activs` with `df_runs_min_zones`

In [6]:
# Cast the merge key to integer on both sides so the two tables can be merged on it
df_activs = df_activs.astype({"filenumber": int})
df_runs_min_zones = df_runs_min_zones.astype({"filenumber": int})

In [7]:
# Left join keeps every activity row. validate="many_to_one" makes pandas raise a
# MergeError if a filenumber occurs more than once in df_runs_min_zones, which
# would otherwise silently duplicate activity rows.
df_activs = pd.merge(
    df_activs,
    df_runs_min_zones,
    on='filenumber',
    how="left",
    validate="many_to_one",
)

In [8]:
# Sanity check of the merge: the number of "high int" rows should be the same
# in the zone table and in the merged activity table
high_int_in_zones = df_runs_min_zones.loc[df_runs_min_zones["training mode"] == "high int"]
high_int_in_activs = df_activs.loc[df_activs["training mode"] == "high int"]
print(len(high_int_in_zones))
print(len(high_int_in_activs))

253
253


#### Merging `df_activs` with `df_runs_dist_zones`

In [9]:
# Make sure the merge key is an integer column in both tables before merging
for keyed_frame in (df_activs, df_runs_dist_zones):
    keyed_frame["filenumber"] = keyed_frame["filenumber"].astype(int)

In [10]:
# Left join: every activity row is kept and gains the per-distance zone columns
df_activs = df_activs.merge(df_runs_dist_zones, how="left", on='filenumber')

In [11]:
# Sanity check of the merge: the number of rows with a positive "dist z3" should
# be the same in the zone table and in the merged activity table
z3_rows_in_zones = df_runs_dist_zones.loc[df_runs_dist_zones["dist z3"] > 0]
z3_rows_in_activs = df_activs.loc[df_activs["dist z3"] > 0]
print(len(z3_rows_in_zones))
print(len(z3_rows_in_activs))

300
300


## Joining Run Activities with Heart Rate Zones (per minute & per distance)

For runs, we created several new columns that label the activities as high or low intensity based on the minutes spent in each heart rate zone. Further, we calculated the distance covered in each heart rate zone. 

These columns are currently stored in the dataframes `df_runs_dist_zones` and `df_runs_min_zones` and need to be merged into `df_runs_plus`.

#### Preparation before merging 

In [12]:
# Start the enriched runs table from an independent copy of df_runs. A plain
# assignment would only create a second name for the same object, so any
# in-place column assignment on df_runs_plus would also change df_runs.
df_runs_plus = df_runs.copy()

In [13]:
# "training mode" is an empty column at this point; remove it so that the filled
# column of the same name can be brought in by the merge in the next steps
df_runs_plus = df_runs_plus.drop(columns=["training mode"])

In [14]:
# The merge key "filenumber" has to be an integer column, but the cast is not
# possible while NaNs are present, so missing filenumbers get the placeholder 0
# for the time being.
df_runs_plus = df_runs_plus.assign(filenumber=df_runs_plus["filenumber"].fillna(0))

In [15]:
# For each table, print the number of distinct filenumbers next to the row count;
# the two numbers are equal only when every filenumber is unique
for keyed_frame in (df_runs_min_zones, df_runs_plus):
    print(keyed_frame["filenumber"].nunique(), len(keyed_frame))

341 341
375 379


As it appears, the dataframe `df_runs_plus` has 5 rows that share the same filenumber (379 rows, but only 375 unique filenumbers).

In [16]:
# Disabled check, kept as a bare string literal so it is not executed: it would
# select the rows of df_runs_plus whose filenumber is null and report their count.
# NOTE(review): the NaNs were already replaced by 0 in the fillna cell above, so
# this would presumably find no rows if re-enabled as is — confirm.
'''
df_runs_plus_NaNs = df_runs_plus[df_runs_plus['filenumber'].isnull()]
print(f"there are {len(df_runs_plus_NaNs)} duplicates in df_runs_plus and they all have NaNs as filename")
'''

'\ndf_runs_plus_NaNs = df_runs_plus[df_runs_plus[\'filenumber\'].isnull()]\nprint(f"there are {len(df_runs_plus_NaNs)} duplicates in df_runs_plus and they all have NaNs as filename")\n'

These duplicates are 5 entries that were not recorded on the fitness watch; they have no file attached and therefore no filenumber, which is why they hinder the merging. 

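Hence, we drop them for now in order to perform the merge, and we will concatenate these entries again later. Note: the cell that performs this drop is currently commented out, so the 5 entries are still part of `df_runs_plus` when it is merged below.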

In [17]:
# Disabled code, kept as a bare string literal so it is not executed: it would
# remove every row of df_runs_plus whose filenumber occurs more than once
# (keep=False drops all members of a duplicate group, not just the repeats).
'''
# dropping NaN filenumbers
df_runs_plus = df_runs_plus.drop_duplicates(subset=["filenumber"], keep=False)
'''

'\n# dropping NaN filenumbers\ndf_runs_plus = df_runs_plus.drop_duplicates(subset=["filenumber"], keep=False)\n'

In [18]:
# Re-check key uniqueness: for each table print a label, then the number of
# distinct filenumbers next to the row count
uniqueness_checks = (
    ("df_runs_min_zones unique filenumbers compared to it's length", df_runs_min_zones),
    ("df_runs_plus unique filenumbers compared to it's length", df_runs_plus),
)
for check_label, keyed_frame in uniqueness_checks:
    print(check_label)
    print(keyed_frame["filenumber"].nunique(), len(keyed_frame))

df_runs_min_zones unique filenumbers compared to it's length
341 341
df_runs_plus unique filenumbers compared to it's length
375 379


In [19]:
# Cast the merge key to integer on both sides so the two tables can be merged on it
df_runs_plus = df_runs_plus.astype({"filenumber": int})
df_runs_min_zones = df_runs_min_zones.astype({"filenumber": int})

#### Merging `df_runs_plus` with `df_runs_min_zones`

In [20]:
# Left join keeps every run row, including runs without a zone record.
# validate="many_to_one" makes pandas raise a MergeError if a filenumber occurs
# more than once in df_runs_min_zones, which would otherwise silently duplicate
# run rows. Repeated filenumbers on the left side are allowed.
df_runs_plus = pd.merge(
    df_runs_plus,
    df_runs_min_zones,
    on='filenumber',
    how="left",
    validate="many_to_one",
)

In [21]:
# Sanity check of the merge: the number of "high int" rows should be the same
# in the zone table and in the merged runs table
high_int_in_zones = df_runs_min_zones.loc[df_runs_min_zones["training mode"] == "high int"]
high_int_in_runs = df_runs_plus.loc[df_runs_plus["training mode"] == "high int"]
print(len(high_int_in_zones))
print(len(high_int_in_runs))

253
253


#### Merging `df_runs_plus` with `df_runs_dist_zones`

In [22]:
# Make sure the merge key is an integer column in both tables before merging
for keyed_frame in (df_runs_plus, df_runs_dist_zones):
    keyed_frame["filenumber"] = keyed_frame["filenumber"].astype(int)

In [23]:
# Left join: every run row is kept and gains the per-distance zone columns
df_runs_plus = df_runs_plus.merge(df_runs_dist_zones, how="left", on='filenumber')

In [24]:
# Sanity check of the merge: the number of rows with a positive "dist z5" should
# be the same in the zone table and in the merged runs table
z5_rows_in_zones = df_runs_dist_zones.loc[df_runs_dist_zones["dist z5"] > 0]
z5_rows_in_runs = df_runs_plus.loc[df_runs_plus["dist z5"] > 0]
print(len(z5_rows_in_zones))
print(len(z5_rows_in_runs))

33
33


## OMITTED: Inserting the enhanced Run Activities into `df_activs_plus`

In [25]:

# Disabled code, kept as a bare string literal so it is not executed: it would
# build df_activs_minus_runs from the rows of df_activs whose sport is not "Run"
# and drop its "training mode" column.
''' 

df_activs_minus_runs = df_activs[df_activs["sport"]!= "Run"]

df_activs_minus_runs = df_activs_minus_runs.drop(["training mode",], axis=1)

''' 

' \n\ndf_activs_minus_runs = df_activs[df_activs["sport"]!= "Run"]\n\ndf_activs_minus_runs = df_activs_minus_runs.drop(["training mode",], axis=1)\n\n'

In [26]:
# Disabled code, kept as a bare string literal so it is not executed: it would add
# all-None placeholder columns to df_runs_plus and df_activs_minus_runs so that
# both tables end up with the same set of columns.
# NOTE(review): the "z3" assignment appears twice inside the string.
''' 
# Adding columns two both dataframes so that they have the same columns and so that I can merge them

df_runs_plus["swim pace"] = None
df_runs_plus["bike pace"] = None
df_runs_plus["pool length"] = None
df_runs_plus["avg watts"] = None


df_activs_minus_runs["run pace"] = None
df_activs_minus_runs["grade adjusted distance"] = None
df_activs_minus_runs["avg grade adjusted pace"] = None
df_activs_minus_runs["below zones"] = None
df_activs_minus_runs["no hr collected"] = None
df_activs_minus_runs["z1"] = None
df_activs_minus_runs["z2"] = None
df_activs_minus_runs["z3"] = None
df_activs_minus_runs["z3"] = None
df_activs_minus_runs["z4"] = None
df_activs_minus_runs["z5"] = None
df_activs_minus_runs["time low zones"] = None
df_activs_minus_runs["training mode"] = None
df_activs_minus_runs["time high zones"] = None
df_activs_minus_runs["dist below zones"] = None
df_activs_minus_runs["dist no hr collected"] = None
df_activs_minus_runs["dist z1"] = None
df_activs_minus_runs["dist z2"] = None
df_activs_minus_runs["dist z3"] = None
df_activs_minus_runs["dist z4"] = None
df_activs_minus_runs["dist z5"] = None
''' 

' \n# Adding columns two both dataframes so that they have the same columns and so that I can merge them\n\ndf_runs_plus["swim pace"] = None\ndf_runs_plus["bike pace"] = None\ndf_runs_plus["pool length"] = None\ndf_runs_plus["avg watts"] = None\n\n\ndf_activs_minus_runs["run pace"] = None\ndf_activs_minus_runs["grade adjusted distance"] = None\ndf_activs_minus_runs["avg grade adjusted pace"] = None\ndf_activs_minus_runs["below zones"] = None\ndf_activs_minus_runs["no hr collected"] = None\ndf_activs_minus_runs["z1"] = None\ndf_activs_minus_runs["z2"] = None\ndf_activs_minus_runs["z3"] = None\ndf_activs_minus_runs["z3"] = None\ndf_activs_minus_runs["z4"] = None\ndf_activs_minus_runs["z5"] = None\ndf_activs_minus_runs["time low zones"] = None\ndf_activs_minus_runs["training mode"] = None\ndf_activs_minus_runs["time high zones"] = None\ndf_activs_minus_runs["dist below zones"] = None\ndf_activs_minus_runs["dist no hr collected"] = None\ndf_activs_minus_runs["dist z1"] = None\ndf_activs_mi

In [27]:
# Disabled code, kept as a bare string literal so it is not executed: it would
# reselect the columns of df_activs_minus_runs in the fixed order listed below.
''' 

# rearranging columns

df_activs_minus_runs = df_activs_minus_runs[[ 'activity id', 'date', 'activity date', 'sport', 'activity type',
       'start time', 'moving time', 'elapsed time', 'finish time', 'swim pace',
       'bike pace', 'run pace', 'distance', 'avg heart rate',
       'avg heart rate zone', 'max heart rate', 'max heart rate yearly',
       'avg speed', 'max speed', 'avg elapsed speed', 'relative effort',
       'calories', 'pool length', 'elevation gain', 'elevation loss',
       'elevation low', 'elevation high', 'max grade', 'avg grade',
       'avg watts', 'avg cadence', 'max cadence', 'activity name',
       'activity description', 'perceived exertion', 'event',
       'injury description', 'filename', 'filenumber',
       'grade adjusted distance', 'avg grade adjusted pace', 'below zones',
       'no hr collected', 'z1', 'z2', 'z3', 'z4', 'z5', 'time low zones',
       'training mode', 'time high zones', 'dist below zones',
       'dist no hr collected', 'dist z1', 'dist z2', 'dist z3', 'dist z4',
       'dist z5'
 ]]

''' 

" \n\n# rearranging columns\n\ndf_activs_minus_runs = df_activs_minus_runs[[ 'activity id', 'date', 'activity date', 'sport', 'activity type',\n       'start time', 'moving time', 'elapsed time', 'finish time', 'swim pace',\n       'bike pace', 'run pace', 'distance', 'avg heart rate',\n       'avg heart rate zone', 'max heart rate', 'max heart rate yearly',\n       'avg speed', 'max speed', 'avg elapsed speed', 'relative effort',\n       'calories', 'pool length', 'elevation gain', 'elevation loss',\n       'elevation low', 'elevation high', 'max grade', 'avg grade',\n       'avg watts', 'avg cadence', 'max cadence', 'activity name',\n       'activity description', 'perceived exertion', 'event',\n       'injury description', 'filename', 'filenumber',\n       'grade adjusted distance', 'avg grade adjusted pace', 'below zones',\n       'no hr collected', 'z1', 'z2', 'z3', 'z4', 'z5', 'time low zones',\n       'training mode', 'time high zones', 'dist below zones',\n       'dist no hr co

In [28]:
# Disabled code, kept as a bare string literal so it is not executed: it would
# reselect the columns of df_runs_plus in the fixed order listed below.
''' 
df_runs_plus = df_runs_plus[[ 'activity id', 'date', 'activity date', 'sport', 'activity type',
       'start time', 'moving time', 'elapsed time', 'finish time', 'swim pace',
       'bike pace', 'run pace', 'distance', 'avg heart rate',
       'avg heart rate zone', 'max heart rate', 'max heart rate yearly',
       'avg speed', 'max speed', 'avg elapsed speed', 'relative effort',
       'calories', 'pool length', 'elevation gain', 'elevation loss',
       'elevation low', 'elevation high', 'max grade', 'avg grade',
       'avg watts', 'avg cadence', 'max cadence', 'activity name',
       'activity description', 'perceived exertion', 'event',
       'injury description', 'filename', 'filenumber',
       'grade adjusted distance', 'avg grade adjusted pace', 'below zones',
       'no hr collected', 'z1', 'z2', 'z3', 'z4', 'z5', 'time low zones',
       'training mode', 'time high zones', 'dist below zones',
       'dist no hr collected', 'dist z1', 'dist z2', 'dist z3', 'dist z4',
       'dist z5'
 ]]

 ''' 

" \ndf_runs_plus = df_runs_plus[[ 'activity id', 'date', 'activity date', 'sport', 'activity type',\n       'start time', 'moving time', 'elapsed time', 'finish time', 'swim pace',\n       'bike pace', 'run pace', 'distance', 'avg heart rate',\n       'avg heart rate zone', 'max heart rate', 'max heart rate yearly',\n       'avg speed', 'max speed', 'avg elapsed speed', 'relative effort',\n       'calories', 'pool length', 'elevation gain', 'elevation loss',\n       'elevation low', 'elevation high', 'max grade', 'avg grade',\n       'avg watts', 'avg cadence', 'max cadence', 'activity name',\n       'activity description', 'perceived exertion', 'event',\n       'injury description', 'filename', 'filenumber',\n       'grade adjusted distance', 'avg grade adjusted pace', 'below zones',\n       'no hr collected', 'z1', 'z2', 'z3', 'z4', 'z5', 'time low zones',\n       'training mode', 'time high zones', 'dist below zones',\n       'dist no hr collected', 'dist z1', 'dist z2', 'dist z3', 

In [29]:
# Disabled code, kept as a bare string literal so it is not executed: it would
# outer-merge df_activs_minus_runs with df_runs_plus on "activity id" into df_activs2.
''' 

df_activs2 = pd.merge(df_activs_minus_runs, df_runs_plus, on='activity id', how="outer")

''' 

' \n\ndf_activs2 = pd.merge(df_activs_minus_runs, df_runs_plus, on=\'activity id\', how="outer")\n\n'

## Joining Activities with Physical Data

In [30]:
# Attach the physical data to each activity table by matching on "date";
# the left join keeps every activity row even if df_phys has no row for that date
df_swims_plus = df_swims.merge(df_phys, how="left", on='date')
df_bikes_plus = df_bikes.merge(df_phys, how="left", on='date')
df_runs_plus = df_runs_plus.merge(df_phys, how="left", on='date')
df_activs_plus = df_activs.merge(df_phys, how="left", on='date')

# Setting "activity id" as the index is currently disabled:
#df_swims_plus.set_index("activity id", inplace= True)
#df_bikes_plus.set_index("activity id", inplace= True)
#df_runs_plus.set_index("activity id", inplace= True)
#df_activs_plus.set_index("activity id", inplace= True)

In [31]:
# Inspect dtype and non-null count of "moving time" before converting it
moving_time_raw = df_activs_plus["moving time"]
moving_time_raw.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1038 entries, 0 to 1037
Series name: moving time
Non-Null Count  Dtype 
--------------  ----- 
1038 non-null   object
dtypes: object(1)
memory usage: 8.2+ KB


In [32]:
# Convert "moving time" to timedelta values, then confirm the resulting dtype
df_activs_plus["moving time"] = df_activs_plus["moving time"].pipe(pd.to_timedelta)
df_activs_plus["moving time"].info()


<class 'pandas.core.series.Series'>
RangeIndex: 1038 entries, 0 to 1037
Series name: moving time
Non-Null Count  Dtype          
--------------  -----          
1038 non-null   timedelta64[ns]
dtypes: timedelta64[ns](1)
memory usage: 8.2 KB


In [33]:
# Store the moving time additionally as a plain number of seconds
moving_time = df_activs_plus["moving time"]
df_activs_plus["moving time seconds"] = moving_time.dt.total_seconds()

In [35]:
# Preview only the first rows to compare both representations of the moving time.
# With display.max_rows raised to 6000 in the setup cell, displaying the bare
# selection would render the whole table instead of a short preview.
df_activs_plus[["moving time seconds", "moving time"]].head(10)

Unnamed: 0,moving time seconds,moving time
0,1835.0,0 days 00:30:35
1,1770.0,0 days 00:29:30
2,1250.0,0 days 00:20:50
3,1805.0,0 days 00:30:05
4,1205.0,0 days 00:20:05
5,1935.0,0 days 00:32:15
6,1706.0,0 days 00:28:26
7,2384.0,0 days 00:39:44
8,1227.0,0 days 00:20:27
9,1385.0,0 days 00:23:05


In [37]:
# Total moving time summed over all activities
df_activs_plus.loc[:, "moving time"].sum()

Timedelta('45 days 12:07:18')

## Saving to .csv

In [36]:
# Saving to csv — currently disabled: the calls below sit inside a bare string
# literal, so no file is written when this cell runs. Re-enabled, they would write
# the four *_plus tables to ../data/ without the index column.
'''
df_swims_plus.to_csv('../data/df_swims_plus.csv', index=False)  
df_bikes_plus.to_csv('../data/df_bikes_plus.csv', index=False)  
df_runs_plus.to_csv('../data/df_runs_plus.csv', index=False)  
df_activs_plus.to_csv('../data/df_activs_plus.csv', index=False)  


'''

'\n'