In [1]:
from os import environ

In [2]:
pyspark_args_str = ""
pyspark_args_str += '--packages "io.delta:delta-core_2.12:1.0.0" '
pyspark_args_str += '--conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" '
pyspark_args_str += '--conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" '
pyspark_args_str += 'pyspark-shell'

In [3]:
pyspark_args_str

'--packages "io.delta:delta-core_2.12:1.0.0" --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" pyspark-shell'

In [4]:
environ['PYSPARK_SUBMIT_ARGS'] = pyspark_args_str
  
from pyspark import sql

spark = sql.SparkSession.builder \
        .master("local[8]") \
        .getOrCreate()


def display(dataframe):
    return dataframe.show()

In [5]:
spark

#### 1. Import Jan 2020 - May 2020 & late Feb 2020 data from the S3 bucket

In [6]:
from pyspark.sql.session import SparkSession
from urllib.request import urlretrieve
import time

BASE_URL = "https://hadoop-and-big-data.s3-us-west-2.amazonaws.com/fitness-tracker/"

In [7]:
def _generate_file_handles(year: int, month: int, filePath: str, is_late: bool):
    """
    if the file is late, add late to the end of the file path name, if not, return the file & filepath
    """
    late = ""
    if is_late:
        late = "_late"
    file = f"health_tracker_data_{year}_{month}{late}.json"
    
    dbfsPath = ""
    if is_late:
        dbfsPath += "late/"
    filePath += file

    return file, filePath

In [8]:
def retrieve_data(year: int, month: int, filePath: str, is_late: bool = False) -> bool:
    file, filePath = _generate_file_handles(year, month, filePath, is_late)
    uri = BASE_URL + file

    urlretrieve(uri, filePath)
    return True

In [9]:
# import january 2020 - may 2020 data
retrieve_data(2020, 1, "data/")
retrieve_data(2020, 2, "data/")
retrieve_data(2020, 3, "data/")
retrieve_data(2020, 4, "data/")
retrieve_data(2020, 5, "data/")

True

In [10]:
# import late February 2020 data
retrieve_data(2020, 2, "data/", True)

True

In [11]:
jan_df = spark.read.json("data/health_tracker_data_2020_1.json")
feb_df = spark.read.json("data/health_tracker_data_2020_2.json")
mar_df = spark.read.json("data/health_tracker_data_2020_3.json")
apr_df = spark.read.json("data/health_tracker_data_2020_4.json")
may_df = spark.read.json("data/health_tracker_data_2020_5.json")

In [12]:
feb_late_df = spark.read.json("data/health_tracker_data_2020_2.json")

In [13]:
from pyspark.sql.functions import lit
# add device_type column to jan 2020 data
jan_new_df = jan_df.withColumn('device_type', lit('null'))
jan_new_df = jan_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb 2020 data 
feb_new_df = feb_df.withColumn('device_type', lit('null'))
feb_new_df = feb_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb_late 2020 data
feb_late_new_df = feb_late_df.withColumn('device_type', lit('null'))
feb_late_new_df = feb_late_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

#### 2. Merge all datasets into one dataframe

In [14]:
from functools import reduce
from pyspark.sql import DataFrame
df_list = [jan_new_df, feb_new_df, feb_late_new_df, mar_df, apr_df, may_df]
df = reduce(DataFrame.unionAll, df_list)

In [15]:
df.count()

21576

In [47]:
from pyspark.sql import functions as f
from pyspark.sql import types as t
import datetime


df = df.withColumn('timestamp', f.to_timestamp(df['time']))
df.show(2)

+---------+-----------+-------------+--------------+-----------+-------------------+
|device_id|device_type|    heartrate|          name|       time|          timestamp|
+---------+-----------+-------------+--------------+-----------+-------------------+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|2020-01-01 00:00:00|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|2020-01-01 01:00:00|
+---------+-----------+-------------+--------------+-----------+-------------------+
only showing top 2 rows



In [52]:
df = df.withColumn('month', f.month(df.timestamp))
df.show(2)

+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|device_id|device_type|    heartrate|          name|       time|          timestamp|month|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|2020-01-01 00:00:00|    1|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|2020-01-01 01:00:00|    1|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
only showing top 2 rows



#### 3. Taking a closer look at February's data, James Hou is missing data from device 4

In [82]:
all_feb_list = [feb_new_df, feb_late_new_df]
all_feb = reduce(DataFrame.unionAll, all_feb_list)

In [64]:
df.groupby("device_id").count().show()

+---------+-----+
|device_id|count|
+---------+-----+
|        0| 4344|
|        1| 4344|
|        3| 4344|
|        2| 4344|
|        4| 4200|
+---------+-----+



In [81]:
df.groupby(['month','device_id']).count().show()

+-----+---------+-----+
|month|device_id|count|
+-----+---------+-----+
|    3|        3|  744|
|    1|        0|  744|
|    5|        3|  744|
|    5|        4|  744|
|    5|        1|  744|
|    3|        4|  744|
|    4|        2|  720|
|    1|        3|  744|
|    3|        1|  744|
|    2|        2| 1392|
|    4|        0|  720|
|    1|        4|  744|
|    4|        1|  720|
|    3|        2|  744|
|    2|        4| 1248|
|    1|        1|  744|
|    2|        1| 1392|
|    4|        4|  720|
|    3|        0|  744|
|    2|        3| 1392|
+-----+---------+-----+
only showing top 20 rows



In [83]:
all_feb.groupBy("device_id").count().show()

+---------+-----+
|device_id|count|
+---------+-----+
|        0| 1392|
|        1| 1392|
|        3| 1392|
|        2| 1392|
|        4| 1248|
+---------+-----+



In [84]:
all_feb.groupBy("name").count().show()

+--------------+-----+
|          name|count|
+--------------+-----+
|     Sam Knopp| 1392|
|     James Hou| 1248|
|   Minh Nguyen| 1392|
|Kristin Vasser| 1392|
|Deborah Powell| 1392|
+--------------+-----+



#### Heartrate Analysis: I wanted to look at the heartrate ranges to see if data was ommitted but feb data is consistent with the remainder of the dataset

In [88]:
no_feb = df.filter(df['month'] != 2)
no_feb.groupBy("device_id") \
    .agg(f.min("heartrate").alias("min_heartrate"), \
         f.max("heartrate").alias("max_heartrate"), \
    ) \
    .show(truncate=False)

+---------+---------------+--------------+
|device_id|min_heartrate  |max_heartrate |
+---------+---------------+--------------+
|0        |-395.1306033635|186.4790827731|
|1        |-226.4962764909|186.2081869555|
|3        |-263.5045292615|218.7730128322|
|2        |-323.6580551676|184.7433209566|
|4        |-310.6998377211|185.2604814742|
+---------+---------------+--------------+



In [89]:
all_feb.groupBy("device_id") \
    .agg(f.min("heartrate").alias("min_heartrate"), \
         f.max("heartrate").alias("max_heartrate"), \
    ) \
    .show(truncate=False)

+---------+---------------+--------------+
|device_id|min_heartrate  |max_heartrate |
+---------+---------------+--------------+
|0        |-218.062685331 |170.8754616576|
|1        |-289.9324513412|165.8129031814|
|3        |-179.6888893893|175.0032148522|
|2        |-190.2847838357|189.2113455089|
|4        |-138.2444236427|199.092971234 |
+---------+---------------+--------------+



#### Timestamp Analysis: After taking a more in-depth look at the timestamp data for February, for device 4, there is missing data for the last 3 days of the mont

In [108]:
no_feb_device_4 = df.filter(df.month == 2)
no_feb_device_4.groupBy("device_id") \
    .agg(f.min("timestamp").alias("min_heartrate"), \
         f.max("timestamp").alias("max_heartrate"), \
    ) \
    .show(truncate=False)

+---------+-------------------+-------------------+
|device_id|min_heartrate      |max_heartrate      |
+---------+-------------------+-------------------+
|0        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|1        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|3        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|2        |2020-02-01 00:00:00|2020-02-29 23:00:00|
|4        |2020-02-01 00:00:00|2020-02-26 23:00:00|
+---------+-------------------+-------------------+

