In [2]:
from os import environ

In [3]:
pyspark_args_str = ""
pyspark_args_str += '--packages "io.delta:delta-core_2.12:1.0.0" '
pyspark_args_str += '--conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" '
pyspark_args_str += '--conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" '
pyspark_args_str += 'pyspark-shell'

In [4]:
pyspark_args_str

'--packages "io.delta:delta-core_2.12:1.0.0" --conf "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension" --conf "spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog" pyspark-shell'

In [5]:
environ['PYSPARK_SUBMIT_ARGS'] = pyspark_args_str
  
from pyspark import sql

spark = sql.SparkSession.builder \
        .master("local[8]") \
        .getOrCreate()


def display(dataframe):
    return dataframe.show()

In [6]:
spark

#### 1. Import Jan 2020 - May 2020 & late Feb 2020 data from S3 bucket

In [7]:
from pyspark.sql.session import SparkSession
from urllib.request import urlretrieve
import time

BASE_URL = "https://hadoop-and-big-data.s3-us-west-2.amazonaws.com/fitness-tracker/"

In [9]:
def _generate_file_handles(year: int, month: int, filePath: str, is_late: bool):
    """
    if the file is late, add late to the end of the file path name, if not, return the file & filepath
    """
    late = ""
    if is_late:
        late = "_late"
    file = f"health_tracker_data_{year}_{month}{late}.json"
    
    dbfsPath = ""
    if is_late:
        dbfsPath += "late/"
    filePath += file

    return file, filePath

In [10]:
def retrieve_data(year: int, month: int, filePath: str, is_late: bool = False) -> bool:
    file, filePath = _generate_file_handles(year, month, filePath, is_late)
    uri = BASE_URL + file

    urlretrieve(uri, filePath)
    return True

In [51]:
ls

Fitness_Tracker.ipynb                 health_tracker_data_2020_4.json
health_tracker_data_2020_1.json       health_tracker_data_2020_5.json
health_tracker_data_2020_2.json       [0m[01;34mlate[0m/
health_tracker_data_2020_2_late.json  latehealth_tracker_data_2020_2_late.json
health_tracker_data_2020_3.json       S3_Spark_Notes.ipynb


In [53]:
jan_df = spark.read.json("health_tracker_data_2020_1.json")
feb_df = spark.read.json("health_tracker_data_2020_2.json")
mar_df = spark.read.json("health_tracker_data_2020_3.json")
apr_df = spark.read.json("health_tracker_data_2020_4.json")
may_df = spark.read.json("health_tracker_data_2020_5.json")

In [69]:
# read late json file
feb_late_df = spark.read.json("latehealth_tracker_data_2020_2_late.json")

In [71]:
# view column names of dataframes
jan_df.show(1)
feb_df.show(1)
mar_df.show(1)
apr_df.show(1)
may_df.show(1)
feb_late_df.show(1)

+---------+-------------+--------------+-----------+
|device_id|    heartrate|          name|       time|
+---------+-------------+--------------+-----------+
|        0|52.8139067501|Deborah Powell|1.5778368E9|
+---------+-------------+--------------+-----------+
only showing top 1 row

+---------+-------------+--------------+-----------+
|device_id|    heartrate|          name|       time|
+---------+-------------+--------------+-----------+
|        0|62.2867126811|Deborah Powell|1.5805152E9|
+---------+-------------+--------------+-----------+
only showing top 1 row

+---------+-----------+-------------+--------------+-----------+
|device_id|device_type|    heartrate|          name|       time|
+---------+-----------+-------------+--------------+-----------+
|        0|  version 2|57.6447293596|Deborah Powell|1.5830208E9|
+---------+-----------+-------------+--------------+-----------+
only showing top 1 row

+---------+-----------+------------+--------------+-----------+
|device_i

In [73]:
from pyspark.sql.functions import lit
# add device_type column to jan 2020 data
jan_new_df = jan_df.withColumn('device_type', lit('null'))
jan_new_df = jan_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb 2020 data 
feb_new_df = feb_df.withColumn('device_type', lit('null'))
feb_new_df = feb_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

# add device_type column to feb_late 2020 data
feb_late_new_df = feb_late_df.withColumn('device_type', lit('null'))
feb_late_new_df = feb_late_new_df.select('device_id','device_type','heartrate', 'name', 'time' )

In [75]:
from functools import reduce
from pyspark.sql import DataFrame
df_list = [jan_new_df, feb_new_df, feb_late_new_df, mar_df, apr_df, may_df]
df = reduce(DataFrame.unionAll, df_list)

In [154]:
from pyspark.sql import functions as f
from pyspark.sql import types as t
import datetime

In [181]:
df = df.withColumn('timestamp', f.to_timestamp(df['time']))
df.show(2)

+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|device_id|device_type|    heartrate|          name|       time|          timestamp|month|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|2020-01-01 00:00:00|    1|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|2020-01-01 01:00:00|    1|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
only showing top 2 rows



In [182]:
df = df.withColumn('month', month(df.timestamp))
df.show(2)

+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|device_id|device_type|    heartrate|          name|       time|          timestamp|month|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
|        0|       null|52.8139067501|Deborah Powell|1.5778368E9|2020-01-01 00:00:00|    1|
|        0|       null|53.9078900098|Deborah Powell|1.5778404E9|2020-01-01 01:00:00|    1|
+---------+-----------+-------------+--------------+-----------+-------------------+-----+
only showing top 2 rows



In [200]:
df.groupBy('month').count('device_id').show()

TypeError: _api() takes 1 positional argument but 2 were given