In [1]:
from os import environ

In [2]:
pyspark_args_str = ""
pyspark_args_str += '--packages "io.delta:delta-core_2.12:1.0.0" '
pyspark_args_str += '--conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" '
pyspark_args_str += '--conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" '
pyspark_args_str += 'pyspark-shell'

In [3]:
pyspark_args_str

'--packages "io.delta:delta-core_2.12:1.0.0" --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" pyspark-shell'

In [4]:
environ['PYSPARK_SUBMIT_ARGS'] = pyspark_args_str
  
from pyspark import sql

spark = sql.SparkSession.builder \
        .master("local[8]") \
        .getOrCreate()


def display(dataframe):
    return dataframe.show()

In [5]:
spark

#### 1. Import Jan 2020 - May 2020 & late Feb 2020 data from S3 bucket

In [6]:
from pyspark.sql.session import SparkSession
from urllib.request import urlretrieve
import time

BASE_URL = "https://hadoop-and-big-data.s3-us-west-2.amazonaws.com/fitness-tracker/"

In [33]:
def _generate_file_handles(year: int, month: int, filePath: str, is_late: bool):
    """
    if the file is late, add late to the end of the file path name, if not, return the file & filepath
    """
    late = ""
    if is_late:
        late = "_late"
    file = f"health_tracker_data_{year}_{month}{late}.json"
    
    dbfsPath = ""
    if is_late:
        dbfsPath += "late/"
    filePath += file

    return file, filePath

In [35]:
def retrieve_data(year: int, month: int, filePath: str, is_late: bool = False) -> bool:
    file, filePath = _generate_file_handles(year, month, filePath, is_late)
    uri = BASE_URL + file

    urlretrieve(uri, filePath)
    return True

In [29]:
# import january 2020 - may 2020 data
retrieve_data(2020, 1, "data/")
retrieve_data(2020, 2, "data/")
retrieve_data(2020, 3, "data/")
retrieve_data(2020, 4, "data/")
retrieve_data(2020, 5, "data/")

True

In [39]:
# import late February 2020 data
retrieve_data(2020, 2, "data/", True)

True

In [43]:
jan_df = spark.read.json("data/health_tracker_data_2020_1.json")
feb_df = spark.read.json("data/health_tracker_data_2020_2.json")
mar_df = spark.read.json("data/health_tracker_data_2020_3.json")
apr_df = spark.read.json("data/health_tracker_data_2020_4.json")
may_df = spark.read.json("data/health_tracker_data_2020_5.json")

In [38]:
feb_late_df = spark.read.json("data/health_tracker_data_2020_2.json")

In [65]:
from pyspark.sql.functions import lit
# add device_type column to jan 2020 data
jan_new_df = jan_df.withColumn('device_type', lit('null'))
jan_new_df = jan_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb 2020 data 
feb_new_df = feb_df.withColumn('device_type', lit('null'))
feb_new_df = feb_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb_late 2020 data
feb_late_new_df = feb_late_df.withColumn('device_type', lit('null'))
feb_late_new_df = feb_late_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

In [92]:
from functools import reduce
from pyspark.sql import DataFrame
df_list = [jan_new_df, feb_new_df, feb_late_new_df, mar_df, apr_df, may_df]
df = reduce(DataFrame.unionAll, df_list)

In [93]:
df.count()

21576

In [114]:
feb_df.groupBy("device_id").count().show()

+---------+-----+
|device_id|count|
+---------+-----+
|        0|  696|
|        1|  696|
|        3|  696|
|        2|  696|
|        4|  624|
+---------+-----+



In [113]:
feb_df.groupBy("name").count().show()

+--------------+-----+
|          name|count|
+--------------+-----+
|     Sam Knopp|  696|
|     James Hou|  624|
|   Minh Nguyen|  696|
|Kristin Vasser|  696|
|Deborah Powell|  696|
+--------------+-----+



In [118]:
df.show()

+---------+-----------+-------------+--------------+-----------+
|device_id|device_type|    heartrate|          name|       time|
+---------+-----------+-------------+--------------+-----------+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|
|        0|       null|52.7129593616|Deborah Powell| 1.577844E9|
|        0|       null|52.2880422685|Deborah Powell|1.5778476E9|
|        0|       null|52.5156095386|Deborah Powell|1.5778512E9|
|        0|       null|53.6280743846|Deborah Powell|1.5778548E9|
|        0|       null|52.1760037066|Deborah Powell|1.5778584E9|
|        0|       null|90.0456721836|Deborah Powell| 1.577862E9|
|        0|       null|89.4695644522|Deborah Powell|1.5778656E9|
|        0|       null|88.1490304138|Deborah Powell|1.5778692E9|
|        0|       null|86.3092976213|Deborah Powell|1.5778728E9|
|        0|       null|86.6672980008|Deborah Powell|1.5778764E9|
|        0|       null|89