In [77]:
from delta.tables import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, min, asc, desc, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DateType, DoubleType, ArrayType

In [78]:
# Session-level settings: the Maven packages to pull in (Delta Lake, hadoop-aws
# and the AWS Java SDK bundle) plus the Delta SQL extension and catalog classes.
spark_settings = {
    "spark.jars.packages": ",".join([
        "io.delta:delta-spark_2.12:3.1.0",
        "org.apache.hadoop:hadoop-aws:3.3.4",
        "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    ]),
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("spark-s3-delta")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

spark = session_builder.getOrCreate()

# S3A options are applied to the Hadoop configuration of the running session.
# No access keys are set here: the credentials provider is the AWS SDK
# default provider chain.
hadoop_conf = spark._jsc.hadoopConfiguration()

s3a_settings = {
    "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.DefaultAWSCredentialsProviderChain",
    "fs.s3a.path.style.access": "true",
}
for setting_name, setting_value in s3a_settings.items():
    hadoop_conf.set(setting_name, setting_value)

In [79]:
# Schema definition

def _struct(field_specs):
    """Build a StructType from (name, data_type, nullable) triples, keeping their order."""
    return StructType([
        StructField(field_name, data_type, nullable=is_nullable)
        for field_name, data_type, is_nullable in field_specs
    ])


# Nested `region` object of a raw record; lat/long are declared as strings here.
region_schema = _struct([
    ('cities', ArrayType(StringType()), True),
    ('iso', StringType(), True),
    ('lat', StringType(), True),
    ('long', StringType(), True),
    ('name', StringType(), True),
    ('province', StringType(), True),
])

# Raw layer: the measures, the nested region struct and two ingestion metadata fields.
raw_schema = _struct([
    ('active', IntegerType(), True),
    ('active_diff', IntegerType(), True),
    ('confirmed', IntegerType(), True),
    ('confirmed_diff', IntegerType(), True),
    ('date', DateType(), True),
    ('deaths', IntegerType(), True),
    ('deaths_diff', IntegerType(), True),
    ('fatality_rate', DoubleType(), True),
    ('last_update', TimestampType(), True),
    ('recovered', IntegerType(), True),
    ('recovered_diff', IntegerType(), True),
    ('region', region_schema, True),
    ('ingested_ts', TimestampType(), False),
    ('source_file_index', StringType(), False),
])

# Flattened layout: region fields promoted to top level (lat/long as doubles)
# plus created/updated audit timestamps.
target_schema = _struct([
    ('active', IntegerType(), True),
    ('active_diff', IntegerType(), True),
    ('confirmed', IntegerType(), True),
    ('confirmed_diff', IntegerType(), True),
    ('date', DateType(), True),
    ('deaths', IntegerType(), True),
    ('deaths_diff', IntegerType(), True),
    ('fatality_rate', DoubleType(), True),
    ('last_update', TimestampType(), True),
    ('recovered', IntegerType(), True),
    ('recovered_diff', IntegerType(), True),
    ('cities', ArrayType(StringType()), True),
    ('iso', StringType(), True),
    ('lat', DoubleType(), True),
    ('long', DoubleType(), True),
    ('name', StringType(), True),
    ('province', StringType(), True),
    ('ingested_ts', TimestampType(), False),
    ('source_file_index', StringType(), False),
    ('created_ts', TimestampType(), False),
    ('updated_ts', TimestampType(), False),
])


In [80]:
# S3 locations of the Canada COVID dataset; both layers live under one prefix.
covid_canada_root = 's3a://vd-airflow-docker-bucket/covid/Canada'

raw_aws_path = f'{covid_canada_root}/raw'

curated_aws_path = f'{covid_canada_root}/curated'

In [81]:
# Flattening raw data. Region --> cities, iso, lat, long, name, province

# Exclusive upper bound on `date` for the full load; only rows dated strictly
# before this day are kept.
FULL_LOAD_CUTOFF_DATE = '2020-02-01'

raw_df = spark.read.schema(raw_schema).json(raw_aws_path)

df_flattened = (
    raw_df
    # Restrict to the full-load window before reshaping.
    .filter(col('date') < FULL_LOAD_CUTOFF_DATE)
    .select(
        # Top-level columns are carried over under their existing names.
        'active',
        'active_diff',
        'confirmed',
        'confirmed_diff',
        'date',
        'deaths',
        'deaths_diff',
        'fatality_rate',
        'last_update',
        'recovered',
        'recovered_diff',
        # Fields of the nested `region` struct are promoted to top level.
        col('region.cities').alias('cities'),
        col('region.iso').alias('iso'),
        # Cast first, then alias: the alias is applied to the final expression
        # instead of relying on Spark carrying an inner alias through the cast.
        col('region.lat').cast('double').alias('lat'),
        col('region.long').cast('double').alias('long'),
        col('region.name').alias('name'),
        col('region.province').alias('province'),
        'ingested_ts',
        'source_file_index',
    )
    # Add metadata timestamps
    .withColumn('created_ts', current_timestamp())
    .withColumn('updated_ts', current_timestamp())
)

# Preview the most recently ingested rows, newest dates first.
df_flattened.orderBy(desc('ingested_ts'), desc('date')).show(10, truncate=False)

+------+-----------+---------+--------------+----------+------+-----------+-------------+-------------------+---------+--------------+------+---+-------+---------+------+----------------+-------------------+----------------------------------------------+--------------------------+--------------------------+
|active|active_diff|confirmed|confirmed_diff|date      |deaths|deaths_diff|fatality_rate|last_update        |recovered|recovered_diff|cities|iso|lat    |long     |name  |province        |ingested_ts        |source_file_index                             |created_ts                |updated_ts                |
+------+-----------+---------+--------------+----------+------+-----------+-------------+-------------------+---------+--------------+------+---+-------+---------+------+----------------+-------------------+----------------------------------------------+--------------------------+--------------------------+
|2     |0          |2        |0             |2020-01-31|0     |0         

In [83]:
# Write to Silver Layer (curated) as a Delta Table

# Overwrite mode with overwriteSchema enabled: existing data at the target
# path is replaced and the table schema may be replaced along with it.
curated_writer = (
    df_flattened.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
)
curated_writer.save(curated_aws_path)

In [84]:
# Read the curated Delta table back from S3 and preview the first rows.
df = spark.read.load(curated_aws_path, format='delta')

df.show(n=10)

+------+-----------+---------+--------------+----------+------+-----------+-------------+-------------------+---------+--------------+------+---+-------+---------+------+----------------+-------------------+--------------------+--------------------+--------------------+
|active|active_diff|confirmed|confirmed_diff|      date|deaths|deaths_diff|fatality_rate|        last_update|recovered|recovered_diff|cities|iso|    lat|     long|  name|        province|        ingested_ts|   source_file_index|          created_ts|          updated_ts|
+------+-----------+---------+--------------+----------+------+-----------+-------------+-------------------+---------+--------------+------+---+-------+---------+------+----------------+-------------------+--------------------+--------------------+--------------------+
|     1|          0|        1|             0|2020-01-28|     0|          0|          0.0|2020-01-28 23:00:00|        0|             0|    []|CAN|51.2538| -85.3232|Canada|         Ontario|