In [1]:
from os import environ

In [2]:
pyspark_args_str = ""
pyspark_args_str += '--packages "io.delta:delta-core_2.12:1.0.0" '
pyspark_args_str += '--conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" '
pyspark_args_str += '--conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" '
pyspark_args_str += 'pyspark-shell'

In [3]:
pyspark_args_str

'--packages "io.delta:delta-core_2.12:1.0.0" --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" pyspark-shell'

In [4]:
environ['PYSPARK_SUBMIT_ARGS'] = pyspark_args_str
  
from pyspark import sql

spark = sql.SparkSession.builder \
        .master("local[8]") \
        .getOrCreate()


def display(dataframe):
    return dataframe.show()

In [5]:
spark

#### 1. Import Jan 2020 - May 2020 & late Feb 2020 data from the S3 bucket

In [6]:
from pyspark.sql.session import SparkSession
from urllib.request import urlretrieve
import time

BASE_URL = "https://hadoop-and-big-data.s3-us-west-2.amazonaws.com/fitness-tracker/"

In [7]:
def _generate_file_handles(year: int, month: int, filePath: str, is_late: bool):
    """
    if the file is late, add late to the end of the file path name, if not, return the file & filepath
    """
    late = ""
    if is_late:
        late = "_late"
    file = f"health_tracker_data_{year}_{month}{late}.json"
    
    dbfsPath = ""
    if is_late:
        dbfsPath += "late/"
    filePath += file

    return file, filePath

In [8]:
def retrieve_data(year: int, month: int, filePath: str, is_late: bool = False) -> bool:
    file, filePath = _generate_file_handles(year, month, filePath, is_late)
    uri = BASE_URL + file

    urlretrieve(uri, filePath)
    return True

In [9]:
# import january 2020 - may 2020 data
retrieve_data(2020, 1, "data/")
retrieve_data(2020, 2, "data/")
retrieve_data(2020, 3, "data/")
retrieve_data(2020, 4, "data/")
retrieve_data(2020, 5, "data/")

True

In [10]:
# import late February 2020 data
retrieve_data(2020, 2, "data/", True)

True

In [13]:
jan_df = spark.read.json("data/health_tracker_data_2020_1.json")
feb_df = spark.read.json("data/health_tracker_data_2020_2.json")
mar_df = spark.read.json("data/health_tracker_data_2020_3.json")
apr_df = spark.read.json("data/health_tracker_data_2020_4.json")
may_df = spark.read.json("data/health_tracker_data_2020_5.json")
feb_late_df = spark.read.json("data/health_tracker_data_2020_2.json")

In [14]:
from pyspark.sql.functions import lit
# add device_type column to jan 2020 data
jan_new_df = jan_df.withColumn('device_type', lit('null'))
jan_new_df = jan_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb 2020 data 
feb_new_df = feb_df.withColumn('device_type', lit('null'))
feb_new_df = feb_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb_late 2020 data
feb_late_new_df = feb_late_df.withColumn('device_type', lit('null'))
feb_late_new_df = feb_late_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

#### 2. Merge all datasets into one dataframe

In [20]:
df = jan_new_df.unionByName(feb_new_df).unionByName(feb_late_new_df).unionByName(mar_df).unionByName(apr_df).unionByName(may_df)
df.show()

+---------+-----------+-------------+--------------+-----------+
|device_id|device_type|    heartrate|          name|       time|
+---------+-----------+-------------+--------------+-----------+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|
|        0|       null|52.7129593616|Deborah Powell| 1.577844E9|
|        0|       null|52.2880422685|Deborah Powell|1.5778476E9|
|        0|       null|52.5156095386|Deborah Powell|1.5778512E9|
|        0|       null|53.6280743846|Deborah Powell|1.5778548E9|
|        0|       null|52.1760037066|Deborah Powell|1.5778584E9|
|        0|       null|90.0456721836|Deborah Powell| 1.577862E9|
|        0|       null|89.4695644522|Deborah Powell|1.5778656E9|
|        0|       null|88.1490304138|Deborah Powell|1.5778692E9|
|        0|       null|86.3092976213|Deborah Powell|1.5778728E9|
|        0|       null|86.6672980008|Deborah Powell|1.5778764E9|
|        0|       null|89

In [15]:
df.count()

21576

In [21]:
from pyspark.sql import functions as f
from pyspark.sql import types as t
import datetime


df = df.withColumn('timestamp', f.to_timestamp(df['time']))
df.show(2)

+---------+-----------+-------------+--------------+-----------+-------------------+
|device_id|device_type|    heartrate|          name|       time|          timestamp|
+---------+-----------+-------------+--------------+-----------+-------------------+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|2020-01-01 00:00:00|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|2020-01-01 01:00:00|
+---------+-----------+-------------+--------------+-----------+-------------------+
only showing top 2 rows



In [22]:
df = df.withColumn('month', f.month(df.timestamp))
df.show(2)

+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|device_id|device_type|    heartrate|          name|       time|          timestamp|month|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|2020-01-01 00:00:00|    1|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|2020-01-01 01:00:00|    1|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
only showing top 2 rows



#### 3. Taking a closer look at February's data, James Hou is missing data from device 4

In [24]:
# merge February data with February late data
feb_df = feb_new_df.unionByName(feb_late_new_df)
feb_df.show()

+---------+-----------+--------------+--------------+-----------+
|device_id|device_type|     heartrate|          name|       time|
+---------+-----------+--------------+--------------+-----------+
|        0|       null| 62.2867126811|Deborah Powell|1.5805152E9|
|        0|       null| 63.9665968885|Deborah Powell|1.5805188E9|
|        0|       null| 63.2519915361|Deborah Powell|1.5805224E9|
|        0|       null| 63.3466351105|Deborah Powell| 1.580526E9|
|        0|       null| 63.1314275489|Deborah Powell|1.5805296E9|
|        0|       null| 61.8974468776|Deborah Powell|1.5805332E9|
|        0|       null| 64.4935795659|Deborah Powell|1.5805368E9|
|        0|       null| 61.8257249518|Deborah Powell|1.5805404E9|
|        0|       null| 105.322579252|Deborah Powell| 1.580544E9|
|        0|       null|103.5784363662|Deborah Powell|1.5805476E9|
|        0|       null|103.8897782428|Deborah Powell|1.5805512E9|
|        0|       null|104.3893140115|Deborah Powell|1.5805548E9|
|        0

In [25]:
df.groupby("device_id").count().show()

+---------+-----+
|device_id|count|
+---------+-----+
|        0| 4344|
|        1| 4344|
|        3| 4344|
|        2| 4344|
|        4| 4200|
+---------+-----+



In [26]:
df.groupby(['month','device_id']).count().show()

+-----+---------+-----+
|month|device_id|count|
+-----+---------+-----+
|    3|        3|  744|
|    1|        0|  744|
|    5|        3|  744|
|    5|        4|  744|
|    5|        1|  744|
|    3|        4|  744|
|    4|        2|  720|
|    1|        3|  744|
|    3|        1|  744|
|    2|        2| 1392|
|    4|        0|  720|
|    1|        4|  744|
|    4|        1|  720|
|    3|        2|  744|
|    2|        4| 1248|
|    1|        1|  744|
|    2|        1| 1392|
|    4|        4|  720|
|    3|        0|  744|
|    2|        3| 1392|
+-----+---------+-----+
only showing top 20 rows



In [27]:
feb_df.groupBy("device_id").count().show()

+---------+-----+
|device_id|count|
+---------+-----+
|        0| 1392|
|        1| 1392|
|        3| 1392|
|        2| 1392|
|        4| 1248|
+---------+-----+



In [28]:
feb_df.groupBy("name").count().show()

+--------------+-----+
|          name|count|
+--------------+-----+
|     Sam Knopp| 1392|
|     James Hou| 1248|
|   Minh Nguyen| 1392|
|Kristin Vasser| 1392|
|Deborah Powell| 1392|
+--------------+-----+



#### Heart Rate Analysis: I wanted to look at the heart rate ranges to see if data was ommitted but feb data is consistent with the remainder of the dataset

In [29]:
no_feb = df.filter(df['month'] != 2)
no_feb.groupBy("device_id") \
    .agg(f.min("heartrate").alias("min_heartrate"), \
         f.max("heartrate").alias("max_heartrate"), \
    ) \
    .show(truncate=False)

+---------+---------------+--------------+
|device_id|min_heartrate  |max_heartrate |
+---------+---------------+--------------+
|0        |-395.1306033635|186.4790827731|
|1        |-226.4962764909|186.2081869555|
|3        |-263.5045292615|218.7730128322|
|2        |-323.6580551676|184.7433209566|
|4        |-310.6998377211|185.2604814742|
+---------+---------------+--------------+



In [30]:
feb_df.groupBy("device_id") \
    .agg(f.min("heartrate").alias("min_heartrate"), \
         f.max("heartrate").alias("max_heartrate"), \
    ) \
    .show(truncate=False)

+---------+---------------+--------------+
|device_id|min_heartrate  |max_heartrate |
+---------+---------------+--------------+
|0        |-218.062685331 |170.8754616576|
|1        |-289.9324513412|165.8129031814|
|3        |-179.6888893893|175.0032148522|
|2        |-190.2847838357|189.2113455089|
|4        |-138.2444236427|199.092971234 |
+---------+---------------+--------------+



#### Timestamp Analysis: After taking a more in-depth look at the timestamp data for February, for device 4, there is missing data for the last 3 days of the mont

In [31]:
no_feb_device_4 = df.filter(df.month == 2)
no_feb_device_4.groupBy("device_id") \
    .agg(f.min("timestamp").alias("min_heartrate"), \
         f.max("timestamp").alias("max_heartrate"), \
    ) \
    .show(truncate=False)

+---------+-------------------+-------------------+
|device_id|min_heartrate      |max_heartrate      |
+---------+-------------------+-------------------+
|0        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|1        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|3        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|2        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|4        |2020-02-01 00:00:00|2020-02-26 23:00:00|
+---------+-------------------+-------------------+



In [45]:
# convert pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [35]:
df1 = df.toPandas()
df1.head()

Unnamed: 0,device_id,device_type,heartrate,name,time,timestamp,month
0,0,,52.813907,Deborah Powell,1577837000.0,2020-01-01 00:00:00,1
1,0,,53.90789,Deborah Powell,1577840000.0,2020-01-01 01:00:00,1
2,0,,52.712959,Deborah Powell,1577844000.0,2020-01-01 02:00:00,1
3,0,,52.288042,Deborah Powell,1577848000.0,2020-01-01 03:00:00,1
4,0,,52.51561,Deborah Powell,1577851000.0,2020-01-01 04:00:00,1


In [39]:
jan_pd_df = jan_new_df.toPandas()