In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score as r2 # evaluation metric
# Show every column when a wide DataFrame is displayed (no column truncation)
pd.set_option('display.max_columns', None)

In [2]:
# Explicit timestamp layout (12-hour clock with AM/PM marker) handed to pandas
# for every file read below, so the format is not inferred
date_format = '%m/%d/%Y %I:%M:%S %p'


def _read_with_timestamp(csv_path, timestamp_col):
    """Read one CSV and parse `timestamp_col` as datetime using `date_format`."""
    return pd.read_csv(csv_path, parse_dates=[timestamp_col], date_format=date_format)


# Minute-level tables
df_minutes_calories = _read_with_timestamp('../Data/minuteCaloriesNarrow_merged.csv', 'ActivityMinute')
df_minutes_intensities = _read_with_timestamp('../Data/minuteIntensitiesNarrow_merged.csv', 'ActivityMinute')
df_minutes_METs = _read_with_timestamp('../Data/minuteMETsNarrow_merged.csv', 'ActivityMinute')
df_minutes_sleep = _read_with_timestamp('../Data/minuteSleep_merged.csv', 'date')
df_minutes_step = _read_with_timestamp('../Data/minuteStepsNarrow_merged.csv', 'ActivityMinute')
# Second-level heart-rate readings
df_seconds_heartrate = _read_with_timestamp('../Data/heartrate_seconds_merged.csv', 'Time')

In [3]:
# Key columns shared by the minute-level activity tables
merge_on_cols = ['Id', 'ActivityMinute']

# Build one minute-level frame:
#   1. calories, intensities, METs and steps are joined on the shared keys
#      (default inner join);
#   2. sleep records are attached with a left join, matching the sleep table's
#      'date' column against 'ActivityMinute';
#   3. minutes without a matching sleep record get 0 for 'value' and 'logId';
#   4. the generic 'value' column is renamed to 'Sleep_Value'.
# NOTE(review): a left join repeats a minute row once per matching sleep row,
# so duplicate (Id, date) keys in the sleep table would duplicate minutes —
# confirm the sleep table is unique on those keys.
df_fitbit_data_minute = (
    df_minutes_calories
    .merge(df_minutes_intensities, on=merge_on_cols)
    .merge(df_minutes_METs, on=merge_on_cols)
    .merge(df_minutes_step, on=merge_on_cols)
    .merge(df_minutes_sleep, how='left', left_on=merge_on_cols, right_on=['Id', 'date'])
    .fillna({'value': 0, 'logId': 0})
    .rename(columns={'value': 'Sleep_Value'})
)

In [4]:
# Preview the first five rows of the merged minute-level frame
df_fitbit_data_minute.head(n=5)

Unnamed: 0,Id,ActivityMinute,Calories,Intensity,METs,Steps,date,Sleep_Value,logId
0,1503960366,2016-04-12 00:00:00,0.7865,0,10,0,NaT,0.0,0.0
1,1503960366,2016-04-12 00:01:00,0.7865,0,10,0,NaT,0.0,0.0
2,1503960366,2016-04-12 00:02:00,0.7865,0,10,0,NaT,0.0,0.0
3,1503960366,2016-04-12 00:03:00,0.7865,0,10,0,NaT,0.0,0.0
4,1503960366,2016-04-12 00:04:00,0.7865,0,10,0,NaT,0.0,0.0


In [5]:
# Truncate each reading's timestamp to the start of its minute so the
# per-second samples can later be grouped by minute.
# 'min' is the minute frequency alias; the older 'T' spelling is deprecated
# in recent pandas releases and emits a FutureWarning there.
df_seconds_heartrate['Date_Time'] = df_seconds_heartrate['Time'].dt.floor('min')

In [6]:
# Average the heart-rate readings within each (Id, minute) group;
# as_index=False keeps the grouping keys as ordinary columns
df_heartrate = (
    df_seconds_heartrate
    .groupby(['Id', 'Date_Time'], as_index=False)['Value']
    .mean()
)

In [7]:
# Display the per-minute average heart rate (pandas elides the middle rows of a long frame)
df_heartrate

Unnamed: 0,Id,Date_Time,Value
0,2022484408,2016-04-12 07:21:00,101.600000
1,2022484408,2016-04-12 07:22:00,87.888889
2,2022484408,2016-04-12 07:23:00,58.000000
3,2022484408,2016-04-12 07:24:00,58.000000
4,2022484408,2016-04-12 07:25:00,56.777778
...,...,...,...
333415,8877689391,2016-05-12 14:40:00,56.222222
333416,8877689391,2016-05-12 14:41:00,57.857143
333417,8877689391,2016-05-12 14:42:00,56.000000
333418,8877689391,2016-05-12 14:43:00,57.500000


In [8]:
# Keep only the minutes present in both tables (inner join): per-minute heart
# rate on the left, minute-level activity/sleep columns on the right
df_heartrate = pd.merge(
    df_heartrate,
    df_fitbit_data_minute,
    how='inner',
    left_on=['Id', 'Date_Time'],
    right_on=['Id', 'ActivityMinute'],
)

In [9]:
# Display the heart-rate rows joined with the minute-level activity columns
df_heartrate

Unnamed: 0,Id,Date_Time,Value,ActivityMinute,Calories,Intensity,METs,Steps,date,Sleep_Value,logId
0,2022484408,2016-04-12 07:21:00,101.600000,2016-04-12 07:21:00,3.32064,1,32,17,NaT,0.0,0.0
1,2022484408,2016-04-12 07:22:00,87.888889,2016-04-12 07:22:00,3.94326,1,38,9,NaT,0.0,0.0
2,2022484408,2016-04-12 07:23:00,58.000000,2016-04-12 07:23:00,1.34901,0,13,0,NaT,0.0,0.0
3,2022484408,2016-04-12 07:24:00,58.000000,2016-04-12 07:24:00,1.03770,0,10,0,NaT,0.0,0.0
4,2022484408,2016-04-12 07:25:00,56.777778,2016-04-12 07:25:00,1.03770,0,10,0,NaT,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
333141,8877689391,2016-05-12 13:55:00,60.666667,2016-05-12 13:55:00,1.33353,0,11,0,NaT,0.0,0.0
333142,8877689391,2016-05-12 13:56:00,61.875000,2016-05-12 13:56:00,1.33353,0,11,0,NaT,0.0,0.0
333143,8877689391,2016-05-12 13:57:00,58.142857,2016-05-12 13:57:00,1.33353,0,11,0,NaT,0.0,0.0
333144,8877689391,2016-05-12 13:58:00,61.200000,2016-05-12 13:58:00,1.33353,0,11,0,NaT,0.0,0.0


In [10]:
# Remove the two timestamp columns that came from the merges ('Date_Time' is
# kept) and give the averaged reading a specific name
df_heartrate = (
    df_heartrate
    .drop(columns=['date', 'ActivityMinute'])
    .rename(columns={'Value': 'HeartRate'})
)

In [11]:
# Predictor matrix (an independent copy of the selected columns)
# NOTE(review): 'Id' and 'logId' look like identifiers rather than
# measurements, yet they enter the model as numeric predictors — confirm
# this is intended.
X = df_heartrate.loc[:, ['Id', 'Calories', 'Intensity', 'METs', 'Steps', 'Sleep_Value', 'logId']].copy()
# Target: the per-minute average heart rate
y = df_heartrate['HeartRate']

In [12]:
# Fit a linear regression of heart rate on the predictors
# (fit() returns the fitted estimator itself, so it can be chained)
model = LinearRegression().fit(X, y)
# Predictions for the same rows the model was fitted on
predicted_heartrate = model.predict(X)

In [13]:
# Coefficient of determination (R^2) of the predictions against the observed
# heart rate; it is computed on the rows used for fitting, so it is an
# in-sample figure
score = r2(y_true=y, y_pred=predicted_heartrate)
# Shown as a percentage, rounded to two decimals before scaling
round(score, 2) * 100

62.0

In [14]:
# Same predictor columns, in the same order as the training matrix, taken
# from the full minute-level frame
X_actual = df_fitbit_data_minute.loc[:, ['Id', 'Calories', 'Intensity', 'METs', 'Steps', 'Sleep_Value', 'logId']].copy()

In [15]:
# Preview the first five rows of the predictor matrix
X_actual.head(n=5)

Unnamed: 0,Id,Calories,Intensity,METs,Steps,Sleep_Value,logId
0,1503960366,0.7865,0,10,0,0.0,0.0
1,1503960366,0.7865,0,10,0,0.0,0.0
2,1503960366,0.7865,0,10,0,0.0,0.0
3,1503960366,0.7865,0,10,0,0.0,0.0
4,1503960366,0.7865,0,10,0,0.0,0.0


In [16]:
# Estimate a heart rate for every row of the minute-level frame with the
# fitted model
predicted_heartrate_actual = model.predict(X_actual)
# Store the estimates in a new HeartRate column. These values are model
# outputs for all rows, not measured readings.
df_fitbit_data_minute['HeartRate'] = predicted_heartrate_actual
# Round the estimates to two decimals
df_fitbit_data_minute['HeartRate'] = df_fitbit_data_minute['HeartRate'].round(2)
# Drop the sleep-table 'date' column brought in by the left join.
# A list (not a set literal) is passed, matching the other drop() calls.
df_fitbit_data_minute.drop(columns=['date'], inplace=True)

In [17]:
# Inspect the column dtypes of the minute-level frame
df_fitbit_data_minute.dtypes

Id                         int64
ActivityMinute    datetime64[ns]
Calories                 float64
Intensity                  int64
METs                       int64
Steps                      int64
Sleep_Value              float64
logId                    float64
HeartRate                float64
dtype: object

In [18]:
# Split the timestamp into separate string columns, drop the original
# datetime column, and put the columns in their final order
df_fitbit_data_minute = (
    df_fitbit_data_minute
    .assign(
        # Calendar date part of the timestamp, as text
        Date=lambda frame: frame['ActivityMinute'].dt.date.astype(str),
        # Time-of-day part of the timestamp, as text
        Time=lambda frame: frame['ActivityMinute'].dt.time.astype(str),
    )
    .drop(columns=['ActivityMinute'])
    [['Id', 'Date', 'Time', 'Calories', 'Intensity', 'METs', 'Steps', 'Sleep_Value', 'logId', 'HeartRate']]
)

In [19]:
# Preview the first five rows of the final frame
df_fitbit_data_minute.head(n=5)

Unnamed: 0,Id,Date,Time,Calories,Intensity,METs,Steps,Sleep_Value,logId,HeartRate
0,1503960366,2016-04-12,00:00:00,0.7865,0,10,0,0.0,0.0,68.41
1,1503960366,2016-04-12,00:01:00,0.7865,0,10,0,0.0,0.0,68.41
2,1503960366,2016-04-12,00:02:00,0.7865,0,10,0,0.0,0.0,68.41
3,1503960366,2016-04-12,00:03:00,0.7865,0,10,0,0.0,0.0,68.41
4,1503960366,2016-04-12,00:04:00,0.7865,0,10,0,0.0,0.0,68.41


In [20]:
# Save the final frame as CSV.
# index=False leaves out the default integer row index, which carries no
# information and would otherwise be written as an unnamed first column.
# NOTE(review): if an existing consumer of this file relies on that extra
# first column, update it accordingly.
df_fitbit_data_minute.to_csv('../Data/FitbitdataMinute.csv', index=False)