---
* title: "Fitbit Data Analysis Processing"
* author: "Alvin Aw Yong"
* date: "2024-11-04"
* output: html_document

--- 
### Process

#### Setup
----

In [6]:
# Install any required package that is missing.
# requireNamespace() only tests whether a package is available (it does not
# attach it), so the check has no side effects on the search path; attaching
# is left to the library() calls below, which stop with an error if a package
# is still unavailable after the install attempt.
required_packages <- c("dplyr", "ggplot2", "tidyverse", "scales")
is_installed <- vapply(
  required_packages, requireNamespace, logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

# Load libraries
library(dplyr)
library(ggplot2)
library(tidyr)
library(readr) # provides write_csv(), used to export the merged data
library(tidyverse) # keeps the remaining core tidyverse packages attached
library(scales)

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: ggplot2

Loading required package: tidyverse

── Attaching core tidyverse packages ──────────────────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
✔ readr     2.1.5     
── Conflicts ────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

----
#### Import Data
----

In [1]:
# Read each Fitbit export (looked up in the working directory) into its own
# data frame.
daily_activity <- read.csv(file = "dailyActivity_merged.csv")
daily_calorie <- read.csv(file = "dailyCalories_merged.csv")
sleep_data <- read.csv(file = "sleepDay_merged.csv")
daily_step <- read.csv(file = "dailySteps_merged.csv")
heart_rate <- read.csv(file = "heartrate_seconds_merged.csv")
daily_intensity <- read.csv(file = "dailyIntensities_merged.csv")

# Preview the first rows of every table, as one named list, to confirm that
# each import succeeded.
lapply(
  list(
    daily_activity = daily_activity,
    daily_calorie = daily_calorie,
    sleep_data = sleep_data,
    daily_step = daily_step,
    heart_rate = heart_rate,
    daily_intensity = daily_intensity
  ),
  head
)

Unnamed: 0_level_0,Id,ActivityDate,TotalSteps,TotalDistance,TrackerDistance,LoggedActivitiesDistance,VeryActiveDistance,ModeratelyActiveDistance,LightActiveDistance,SedentaryActiveDistance,VeryActiveMinutes,FairlyActiveMinutes,LightlyActiveMinutes,SedentaryMinutes,Calories
Unnamed: 0_level_1,<dbl>,<chr>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>
1,1503960366,4/12/2016,13162,8.5,8.5,0,1.88,0.55,6.06,0,25,13,328,728,1985
2,1503960366,4/13/2016,10735,6.97,6.97,0,1.57,0.69,4.71,0,21,19,217,776,1797
3,1503960366,4/14/2016,10460,6.74,6.74,0,2.44,0.4,3.91,0,30,11,181,1218,1776
4,1503960366,4/15/2016,9762,6.28,6.28,0,2.14,1.26,2.83,0,29,34,209,726,1745
5,1503960366,4/16/2016,12669,8.16,8.16,0,2.71,0.41,5.04,0,36,10,221,773,1863
6,1503960366,4/17/2016,9705,6.48,6.48,0,3.19,0.78,2.51,0,38,20,164,539,1728

Unnamed: 0_level_0,Id,ActivityDay,Calories
Unnamed: 0_level_1,<dbl>,<chr>,<int>
1,1503960366,4/12/2016,1985
2,1503960366,4/13/2016,1797
3,1503960366,4/14/2016,1776
4,1503960366,4/15/2016,1745
5,1503960366,4/16/2016,1863
6,1503960366,4/17/2016,1728

Unnamed: 0_level_0,Id,SleepDay,TotalSleepRecords,TotalMinutesAsleep,TotalTimeInBed
Unnamed: 0_level_1,<dbl>,<chr>,<int>,<int>,<int>
1,1503960366,4/12/2016 12:00:00 AM,1,327,346
2,1503960366,4/13/2016 12:00:00 AM,2,384,407
3,1503960366,4/15/2016 12:00:00 AM,1,412,442
4,1503960366,4/16/2016 12:00:00 AM,2,340,367
5,1503960366,4/17/2016 12:00:00 AM,1,700,712
6,1503960366,4/19/2016 12:00:00 AM,1,304,320

Unnamed: 0_level_0,Id,ActivityDay,StepTotal
Unnamed: 0_level_1,<dbl>,<chr>,<int>
1,1503960366,4/12/2016,13162
2,1503960366,4/13/2016,10735
3,1503960366,4/14/2016,10460
4,1503960366,4/15/2016,9762
5,1503960366,4/16/2016,12669
6,1503960366,4/17/2016,9705

Unnamed: 0_level_0,Id,Time,Value
Unnamed: 0_level_1,<dbl>,<chr>,<int>
1,2022484408,4/12/2016 7:21:00 AM,97
2,2022484408,4/12/2016 7:21:05 AM,102
3,2022484408,4/12/2016 7:21:10 AM,105
4,2022484408,4/12/2016 7:21:20 AM,103
5,2022484408,4/12/2016 7:21:25 AM,101
6,2022484408,4/12/2016 7:22:05 AM,95

Unnamed: 0_level_0,Id,ActivityDay,SedentaryMinutes,LightlyActiveMinutes,FairlyActiveMinutes,VeryActiveMinutes,SedentaryActiveDistance,LightActiveDistance,ModeratelyActiveDistance,VeryActiveDistance
Unnamed: 0_level_1,<dbl>,<chr>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>
1,1503960366,4/12/2016,728,328,13,25,0,6.06,0.55,1.88
2,1503960366,4/13/2016,776,217,19,21,0,4.71,0.69,1.57
3,1503960366,4/14/2016,1218,181,11,30,0,3.91,0.4,2.44
4,1503960366,4/15/2016,726,209,34,29,0,2.83,1.26,2.14
5,1503960366,4/16/2016,773,221,10,36,0,5.04,0.41,2.71
6,1503960366,4/17/2016,539,164,20,38,0,2.51,0.78,3.19


---
#### Inspect Data
---

In [3]:
# Inspect every imported table: for each one, print a tab-indented title,
# let the top-level summary() call display the per-column summary, then print
# an underscore rule to separate it from the next table.

# daily_activity
cat("\t\tDaily Activity\n\n")
summary(daily_activity)
cat("_____________________________________________________________\n\n")

# sleep_data
cat("\t\tSleep Data\n\n")
summary(sleep_data)
cat("_____________________________________________________________\n\n")

# daily_calorie
cat("\t\tDaily Calorie\n\n")
summary(daily_calorie)
cat("_____________________________________________________________\n\n")

# daily_intensity
cat("\t\tDaily Intensity\n\n")
summary(daily_intensity)
cat("_____________________________________________________________\n\n")

# heart_rate
cat("\t\tHeart Rate\n\n")
summary(heart_rate)
cat("_____________________________________________________________\n\n")

# daily_step
cat("\t\tDaily Step\n\n")
summary(daily_step)
cat("_____________________________________________________________\n\n")


		Daily Activity



       Id            ActivityDate         TotalSteps    TotalDistance   
 Min.   :1.504e+09   Length:940         Min.   :    0   Min.   : 0.000  
 1st Qu.:2.320e+09   Class :character   1st Qu.: 3790   1st Qu.: 2.620  
 Median :4.445e+09   Mode  :character   Median : 7406   Median : 5.245  
 Mean   :4.855e+09                      Mean   : 7638   Mean   : 5.490  
 3rd Qu.:6.962e+09                      3rd Qu.:10727   3rd Qu.: 7.713  
 Max.   :8.878e+09                      Max.   :36019   Max.   :28.030  
 TrackerDistance  LoggedActivitiesDistance VeryActiveDistance
 Min.   : 0.000   Min.   :0.0000           Min.   : 0.000    
 1st Qu.: 2.620   1st Qu.:0.0000           1st Qu.: 0.000    
 Median : 5.245   Median :0.0000           Median : 0.210    
 Mean   : 5.475   Mean   :0.1082           Mean   : 1.503    
 3rd Qu.: 7.710   3rd Qu.:0.0000           3rd Qu.: 2.053    
 Max.   :28.030   Max.   :4.9421           Max.   :21.920    
 ModeratelyActiveDistance LightActiveDistance Sedentary

_____________________________________________________________

		Sleep Data



       Id              SleepDay         TotalSleepRecords TotalMinutesAsleep
 Min.   :1.504e+09   Length:413         Min.   :1.000     Min.   : 58.0     
 1st Qu.:3.977e+09   Class :character   1st Qu.:1.000     1st Qu.:361.0     
 Median :4.703e+09   Mode  :character   Median :1.000     Median :433.0     
 Mean   :5.001e+09                      Mean   :1.119     Mean   :419.5     
 3rd Qu.:6.962e+09                      3rd Qu.:1.000     3rd Qu.:490.0     
 Max.   :8.792e+09                      Max.   :3.000     Max.   :796.0     
 TotalTimeInBed 
 Min.   : 61.0  
 1st Qu.:403.0  
 Median :463.0  
 Mean   :458.6  
 3rd Qu.:526.0  
 Max.   :961.0  

_____________________________________________________________

		Daily Calorie



       Id            ActivityDay           Calories   
 Min.   :1.504e+09   Length:940         Min.   :   0  
 1st Qu.:2.320e+09   Class :character   1st Qu.:1828  
 Median :4.445e+09   Mode  :character   Median :2134  
 Mean   :4.855e+09                      Mean   :2304  
 3rd Qu.:6.962e+09                      3rd Qu.:2793  
 Max.   :8.878e+09                      Max.   :4900  

_____________________________________________________________

		Daily Intensity



       Id            ActivityDay        SedentaryMinutes LightlyActiveMinutes
 Min.   :1.504e+09   Length:940         Min.   :   0.0   Min.   :  0.0       
 1st Qu.:2.320e+09   Class :character   1st Qu.: 729.8   1st Qu.:127.0       
 Median :4.445e+09   Mode  :character   Median :1057.5   Median :199.0       
 Mean   :4.855e+09                      Mean   : 991.2   Mean   :192.8       
 3rd Qu.:6.962e+09                      3rd Qu.:1229.5   3rd Qu.:264.0       
 Max.   :8.878e+09                      Max.   :1440.0   Max.   :518.0       
 FairlyActiveMinutes VeryActiveMinutes SedentaryActiveDistance
 Min.   :  0.00      Min.   :  0.00    Min.   :0.000000       
 1st Qu.:  0.00      1st Qu.:  0.00    1st Qu.:0.000000       
 Median :  6.00      Median :  4.00    Median :0.000000       
 Mean   : 13.56      Mean   : 21.16    Mean   :0.001606       
 3rd Qu.: 19.00      3rd Qu.: 32.00    3rd Qu.:0.000000       
 Max.   :143.00      Max.   :210.00    Max.   :0.110000       
 LightActiveD

_____________________________________________________________

		Heart Rate



       Id                Time               Value       
 Min.   :2.022e+09   Length:2483658     Min.   : 36.00  
 1st Qu.:4.388e+09   Class :character   1st Qu.: 63.00  
 Median :5.554e+09   Mode  :character   Median : 73.00  
 Mean   :5.514e+09                      Mean   : 77.33  
 3rd Qu.:6.962e+09                      3rd Qu.: 88.00  
 Max.   :8.878e+09                      Max.   :203.00  

_____________________________________________________________

		Daily Step



       Id            ActivityDay          StepTotal    
 Min.   :1.504e+09   Length:940         Min.   :    0  
 1st Qu.:2.320e+09   Class :character   1st Qu.: 3790  
 Median :4.445e+09   Mode  :character   Median : 7406  
 Mean   :4.855e+09                      Mean   : 7638  
 3rd Qu.:6.962e+09                      3rd Qu.:10727  
 Max.   :8.878e+09                      Max.   :36019  

_____________________________________________________________



---
#### Data cleaning
----

In [10]:
# Discard every row that contains at least one NA, table by table.
daily_activity <- drop_na(daily_activity)
daily_calorie <- drop_na(daily_calorie)
daily_intensity <- drop_na(daily_intensity)
daily_step <- drop_na(daily_step)
heart_rate <- drop_na(heart_rate)
sleep_data <- drop_na(sleep_data)

In [11]:
# Give every table the same column-name format: spaces become underscores and
# all letters are lower-cased.
colnames(daily_activity) <- tolower(chartr(" ", "_", colnames(daily_activity)))
colnames(daily_calorie) <- tolower(chartr(" ", "_", colnames(daily_calorie)))
colnames(daily_intensity) <- tolower(chartr(" ", "_", colnames(daily_intensity)))
colnames(daily_step) <- tolower(chartr(" ", "_", colnames(daily_step)))
colnames(heart_rate) <- tolower(chartr(" ", "_", colnames(heart_rate)))
colnames(sleep_data) <- tolower(chartr(" ", "_", colnames(sleep_data)))

# Show the resulting names of each table
colnames(daily_activity)
colnames(daily_calorie)
colnames(daily_intensity)
colnames(daily_step)
colnames(heart_rate)
colnames(sleep_data)

In [12]:
# Standardize date format
#
# Every raw date column is a character string that begins with a
# month/day/year date. as.Date() with "%m/%d/%Y" parses that leading date and
# ignores any trailing characters, so the timestamp columns of sleep_data and
# heart_rate yield their calendar day. Date input is returned unchanged, which
# makes this cell safe to re-run.
parse_mdy_date <- function(x) {
  as.Date(x, format = "%m/%d/%Y")
}

# daily_activity calls its date column "activitydate" (the other daily tables
# use "activityday"), so look for either name and convert it in place.
activity_date_cols <- intersect(
  c("activitydate", "activityday"),
  colnames(daily_activity)
)
for (date_col in activity_date_cols) {
  daily_activity[[date_col]] <- parse_mdy_date(daily_activity[[date_col]])
}

if ("activityday" %in% colnames(daily_calorie)) {
  daily_calorie[["activityday"]] <- parse_mdy_date(daily_calorie[["activityday"]])
}
if ("activityday" %in% colnames(daily_intensity)) {
  daily_intensity[["activityday"]] <- parse_mdy_date(daily_intensity[["activityday"]])
}
if ("activityday" %in% colnames(daily_step)) {
  daily_step[["activityday"]] <- parse_mdy_date(daily_step[["activityday"]])
}

# sleep_data and heart_rate keep their original timestamp column and gain a
# new Date column named "activityday" that serves as the join key.
if ("sleepday" %in% colnames(sleep_data)) {
  sleep_data[["activityday"]] <- parse_mdy_date(sleep_data[["sleepday"]])
}
if ("time" %in% colnames(heart_rate)) {
  heart_rate[["activityday"]] <- parse_mdy_date(heart_rate[["time"]])
}

In [13]:
# Keep only one copy of any row that appears more than once in a table.
daily_activity <- distinct(daily_activity)
daily_calorie <- distinct(daily_calorie)
daily_intensity <- distinct(daily_intensity)
daily_step <- distinct(daily_step)
heart_rate <- distinct(heart_rate)
sleep_data <- distinct(sleep_data)

---
#### Data Integration
---

In [14]:
# Give every table the same join-key name, "activityday".
#
# Only daily_activity needs a rename: its date column is "activitydate".
# daily_calorie, daily_intensity and daily_step already use "activityday", and
# sleep_data and heart_rate received an "activityday" column when the dates
# were standardised. heart_rate's "time" column is deliberately left alone:
# renaming it to "activityday" would give the table two columns with the same
# name.
colnames(daily_activity)[colnames(daily_activity) == "activitydate"] <- "activityday"

# Display output
colnames(daily_activity)
colnames(daily_calorie)
colnames(daily_intensity)
colnames(daily_step)
colnames(heart_rate)
colnames(sleep_data)

In [15]:
# Last Minute Clean up

# Make sure the join key `activityday` is a Date in daily_activity.
# as.Date() returns Date input unchanged, so this is harmless when the column
# has already been converted.
daily_activity$activityday <- as.Date(
  daily_activity$activityday,
  format = "%m/%d/%Y"
)

# heart_rate may contain two columns named "activityday" at this point.
# Address them by position so it is explicit which one is converted (the
# first) and which one is renamed to "time" (the second, if present).
hr_key_cols <- which(colnames(heart_rate) == "activityday")
stopifnot(length(hr_key_cols) >= 1)
heart_rate[[hr_key_cols[1]]] <- as.Date(
  heart_rate[[hr_key_cols[1]]],
  format = "%m/%d/%Y"
)
if (length(hr_key_cols) >= 2) {
  colnames(heart_rate)[hr_key_cols[2]] <- "time"
}

# sleep_data: rename a second "activityday" column only when one exists,
# instead of indexing with an NA position when there is no duplicate.
sleep_key_cols <- which(colnames(sleep_data) == "activityday")
if (length(sleep_key_cols) >= 2) {
  colnames(sleep_data)[sleep_key_cols[2]] <- "activityday_sleep"
}

---
#### Merge data incrementally
---

In [16]:
# Start the integrated table from daily_activity and attach the matching
# daily_calorie row for each person (id) and day (activityday). Every
# daily_activity row is kept.
merged_data <- left_join(
  daily_activity,
  daily_calorie,
  by = c("id", "activityday")
)

In [17]:
# Attach the daily_intensity columns, matched on person (id) and day
# (activityday); rows of merged_data without a match are kept.
merged_data <- left_join(
  merged_data,
  daily_intensity,
  by = c("id", "activityday")
)

In [18]:
# Attach the daily_step columns, matched on person (id) and day
# (activityday); rows of merged_data without a match are kept.
merged_data <- left_join(
  merged_data,
  daily_step,
  by = c("id", "activityday")
)

In [19]:
# Merge Heart Rate
#
# heart_rate holds many readings per person per day, so it is first collapsed
# to one average per (id, activityday); the per-reading `time` column is not
# needed for a daily figure. `.groups = "drop"` returns an ungrouped result
# directly, without the "grouped output" message.
heart_rate_daily <- heart_rate %>%
  filter(!is.na(value)) %>% # Remove NA values if there are any
  group_by(id, activityday) %>%
  summarize(avg_heart_rate = mean(value, na.rm = TRUE), .groups = "drop")

# Join heart_rate after aggregation
merged_data <- merged_data %>%
  left_join(heart_rate_daily, by = c("id", "activityday"))

# Safety precaution: rows with no heart-rate average after the join are
# filled with the mean of the averages that are available.
# NOTE(review): after this step imputed values cannot be told apart from
# measured ones; confirm that this is acceptable for the later analysis.
merged_data <- merged_data %>%
  mutate(
    avg_heart_rate = ifelse(
      is.na(avg_heart_rate),
      mean(avg_heart_rate, na.rm = TRUE),
      avg_heart_rate
    )
  )

`summarise()` has grouped output by 'id'. You can override using the `.groups` argument.


In [20]:
# Attach the sleep_data columns, matched on person (id) and day
# (activityday); rows of merged_data without a sleep record are kept.
merged_data <- left_join(
  merged_data,
  sleep_data,
  by = c("id", "activityday")
)

In [21]:
# Display the structure of the final integrated dataset (one line per column:
# name, type and the first few values) to check the result of the merges.
str(merged_data)

'data.frame':	940 obs. of  30 variables:
 $ id                        : num  1.5e+09 1.5e+09 1.5e+09 1.5e+09 1.5e+09 ...
 $ activityday               : Date, format: "2016-04-12" "2016-04-13" ...
 $ totalsteps                : int  13162 10735 10460 9762 12669 9705 13019 15506 10544 9819 ...
 $ totaldistance             : num  8.5 6.97 6.74 6.28 8.16 ...
 $ trackerdistance           : num  8.5 6.97 6.74 6.28 8.16 ...
 $ loggedactivitiesdistance  : num  0 0 0 0 0 0 0 0 0 0 ...
 $ veryactivedistance.x      : num  1.88 1.57 2.44 2.14 2.71 ...
 $ moderatelyactivedistance.x: num  0.55 0.69 0.4 1.26 0.41 ...
 $ lightactivedistance.x     : num  6.06 4.71 3.91 2.83 5.04 ...
 $ sedentaryactivedistance.x : num  0 0 0 0 0 0 0 0 0 0 ...
 $ veryactiveminutes.x       : int  25 21 30 29 36 38 42 50 28 19 ...
 $ fairlyactiveminutes.x     : int  13 19 11 34 10 20 16 31 12 8 ...
 $ lightlyactiveminutes.x    : int  328 217 181 209 221 164 233 264 205 211 ...
 $ sedentaryminutes.x        : int  728 776 12

---
#### Write the data to a new CSV file for further analysis
---

In [22]:
# Save the integrated dataset to the working directory for the analysis step.
# write_csv() is called through the readr namespace so the export does not
# depend on readr (or tidyverse) having been attached earlier in the session.
readr::write_csv(merged_data, "cleaned_integrated_data.csv")
cat("Data cleaning and integration complete. Cleaned data saved as 'cleaned_integrated_data.csv'.")

Data cleaning and integration complete. Cleaned data saved as 'cleaned_integrated_data.csv'.