In [None]:
from delta.table import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, min, asc, desc
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType, DoubleType, ArrayType

In [3]:
spark = (
    SparkSession.builder
    .appName("spark-s3")
    .getOrCreate()
)

hadoop_conf = spark._jsc.hadoopConfiguration()

hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set(
    "fs.s3a.aws.credentials.provider",
    "com.amazonaws.auth.DefaultAWSCredentialsProviderChain"
)

hadoop_conf.set("fs.s3a.path.style.access", "true")

In [None]:
raw_aws_path = 's3a://vd-airflow-docker-bucket/covid/Canada'

curated_aws_path = 's3a://vd-airflow-docker-bucket/covid/Canada/curated'

In [97]:
# Schema definition

region_schema = StructType([
    StructField('cities', ArrayType(StringType()), nullable = True),
    StructField('iso', StringType(), nullable = True),
    StructField('lat', StringType(), nullable = True),
    StructField('long', StringType(), nullable = True),
    StructField('name', StringType(), nullable = True),
    StructField('province', StringType(), nullable = True)
])

raw_schema = StructType([
    StructField('active', IntegerType(), nullable = True),
    StructField('active_diff', IntegerType(), nullable = True),
    StructField('confirmed', IntegerType(), nullable = True),
    StructField('confirmed_diff', IntegerType(), nullable = True),
    StructField('date', DateType(), nullable = True),
    StructField('deaths', IntegerType(), nullable = True),
    StructField('deaths_diff', IntegerType(), nullable = True),
    StructField('fatality_rate', DoubleType(), nullable = True),
    StructField('last_updated', IntegerType(), nullable = True),
    StructField('recovered', IntegerType(), nullable = True),
    StructField('recovered_diff', IntegerType(), nullable = True),
    StructField('region', region_schema, nullable = True)
])

target_schema = StructType([
    StructField('active', IntegerType(), nullable = True),
    StructField('active_diff', IntegerType(), nullable = True),
    StructField('confirmed', IntegerType(), nullable = True),
    StructField('confirmed_diff', IntegerType(), nullable = True),
    StructField('date', DateType(), nullable = True),
    StructField('deaths', IntegerType(), nullable = True),
    StructField('deaths_diff', IntegerType(), nullable = True),
    StructField('fatality_rate', DoubleType(), nullable = True),
    StructField('last_updated', IntegerType(), nullable = True),
    StructField('recovered', IntegerType(), nullable = True),
    StructField('recovered_diff', IntegerType(), nullable = True),
    StructField('cities', ArrayType(StringType()), nullable = True),
    StructField('iso', DoubleType(), nullable = True),
    StructField('lat', DoubleType(), nullable = True),
    StructField('long', StringType(), nullable = True),
    StructField('name', StringType(), nullable = True),
    StructField('province', StringType(), nullable = True)
])


In [87]:
raw_df = (spark
        .read
        .schema(raw_schema)
        .json(raw_aws_path)
)

In [98]:
# This will be used for full or incremental loading

def is_curated_exists(curated_aws_path: str) -> bool:
    try:
        target_df = spark.read.json(curated_aws_path)
        return True
    except Exception:
        return False
    
print(is_curated_exists(curated_aws_path))

False


In [99]:
# Set date for incremental loading

if is_curated_exists(curated_aws_path) == False:
    date = '2020-01-01'

    # Create table if not exists
    empty_target_df = spark.createDataFrame([], schema=target_schema)
    empty_target_df.write.mode('overwrite').save(curated_aws_path)

else:
    df = spark.read.json(curated_aws_path)
    date = df.select('date').agg(max('date')).collect()[0][0]

print(date)
    

2020-01-01


In [101]:
empty_target_df.show()

+------+-----------+---------+--------------+----+------+-----------+-------------+------------+---------+--------------+------+---+---+----+----+--------+
|active|active_diff|confirmed|confirmed_diff|date|deaths|deaths_diff|fatality_rate|last_updated|recovered|recovered_diff|cities|iso|lat|long|name|province|
+------+-----------+---------+--------------+----+------+-----------+-------------+------------+---------+--------------+------+---+---+----+----+--------+
+------+-----------+---------+--------------+----+------+-----------+-------------+------------+---------+--------------+------+---+---+----+----+--------+



In [102]:
# Flattening raw data. Region --> cities, iso, lat, long, name, province

df_flattened = raw_df                                               \
                .select(
                col('active').alias('active'),
                col('active_diff').alias('active_diff'),
                col('confirmed').alias('confirmed'),
                col('confirmed_diff').alias('confirmed_diff'),
                col('date').alias('date'),
                col('deaths').alias('deaths'),
                col('deaths_diff').alias('deaths_diff'),
                col('fatality_rate').alias('fatality_rate'),
                # col('last_update').alias('last_update'),
                col('recovered').alias('recovered'),
                col('recovered_diff').alias('recovered_diff'),
                col('region.cities').alias('cities'), 
                col('region.iso').alias('iso').cast('double'),              # Cast to correct type
                col('region.lat').alias('lat').cast('double'), 
                col('region.long').alias('long'), 
                col('region.name').alias('name'), 
                col('region.province').alias('province')
                )                                                   \
                .filter(col('date')>date)

In [104]:
df_flattened \
.mergeInto(empty_target_df, 'date') \
.whenMatched().update({ 'date': df_flattened.date}) \
.whenNotMatched().insertAll() \
.merge()

AttributeError: 'DataFrame' object has no attribute 'mergeInto'