### 1. Set up the configs using Azure Key Vault and a Databricks secret scope
### 2. Transform testing data
#### Please update the following values before running:
- client_id
- tenant_id
- client_secret

In [0]:
# Fetch the three service-principal values from the "covid19-scope" secret
# scope in a single pass; the order of the keys matches the order of the
# names on the left-hand side.
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope="covid19-scope", key=secret_key)
    for secret_key in (
        "covid-19-client-id",
        "covid-19-tenant-id",
        "covid-19-client-secrets",
    )
)

In [0]:
# OAuth client-credentials settings for the Azure storage filesystem driver,
# built from the secrets retrieved above.
configs = {
    "fs.azure.account.auth.type": "OAuth",
    "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id": client_id,
    "fs.azure.account.oauth2.client.secret": client_secret,
    "fs.azure.account.oauth2.client.endpoint": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# Apply every setting to the current Spark session configuration.
for conf_key, conf_value in configs.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
from pyspark.sql.functions import *

In [0]:
# Load the raw ECDC testing CSV; the first row of the file supplies the
# column names.
df_raw_testing = spark.read.csv(
    "abfss://raw@covidreportdl712.dfs.core.windows.net/ecdc/testing.csv",
    header="true",
)

# Make the raw data addressable from Spark SQL under the same name.
df_raw_testing.createOrReplaceTempView("df_raw_testing")

In [0]:
# Country lookup table: comma-separated CSV whose first row is the header.
df_dim_country = (
    spark.read
    .option("sep", r',')
    .option("header", True)
    .csv("abfss://lookup@covidreportdl712.dfs.core.windows.net/country_lookup.csv")
)

# Registered as "dim_country" so the SQL cells can join against it.
df_dim_country.createOrReplaceTempView("dim_country")

In [0]:
# Enrich the raw testing rows with country attributes from the lookup table.
#
# - Joins the "df_raw_testing" view to the "dim_country" view on
#   t.country_code = c.country_code_2_digit. A plain JOIN is an inner join,
#   so testing rows without a matching lookup entry are not returned.
# - testing_rate and positivity_rate are cast to FLOAT; every other selected
#   column is passed through with whatever type the CSV reader gave it.
# - The result is ordered by country name.
df_testing_country_lookup = spark.sql("""
SELECT 
    c.country,
    c.country_code_2_digit,
    c.country_code_3_digit,
    t.year_week,
    t.new_cases,
    t.tests_done,
    t.population,
    cast(t.testing_rate AS FLOAT) as testing_rate ,
    cast(t.positivity_rate AS FLOAT) as positivity_rate,
    t.testing_data_source
FROM df_raw_testing t
JOIN dim_country c 
    ON t.country_code = c.country_code_2_digit
ORDER BY c.country
""")
# Register the enriched result so later SQL can reference it by name.
df_testing_country_lookup.createOrReplaceTempView("df_testing_country_lookup")

In [0]:
# Date dimension lookup: comma-separated CSV whose first row is the header.
df_dim_date = (
    spark.read
    .options(sep=r',', header=True)
    .csv("abfss://lookup@covidreportdl712.dfs.core.windows.net/dim_date.csv")
)


In [0]:
# Build the week key as "<year>-W<week_of_year>", with the week number
# left-padded with "0" to two characters.
year_week_expr = concat(
    col("year"),
    lit("-W"),
    lpad(col("week_of_year").cast("string"), 2, "0"),
)

# Keep only the calendar date and its derived week key.
df = df_dim_date.select(col("date"), year_week_expr.alias("year_week"))


In [0]:
# Step 1: Aggregate to get min and max date per week
#
# The aggregates are referenced through an explicit module alias. The bare
# names `min` / `max` only resolve to Spark column functions because of the
# wildcard `from pyspark.sql.functions import *` earlier in this notebook,
# which shadows Python's builtins of the same name; qualifying them keeps
# this cell correct regardless of what is in the global namespace.
from pyspark.sql import functions as F

# NOTE(review): `date` is aggregated as read from the CSV. If it is a string
# column, min/max compare lexicographically, which only matches calendar
# order for ISO-style (yyyy-MM-dd) values - confirm the file's date format.
df_week_bounds = (
    df.groupBy("year_week")
      .agg(
          F.min("date").alias("week_start_date"),
          F.max("date").alias("week_end_date"),
      )
)

# Register the per-week bounds so the SQL cells can join against them.
df_week_bounds.createOrReplaceTempView("df_week_bounds")


In [0]:
# Attach the week start/end dates to every enriched testing row.
#
# Joins the "df_testing_country_lookup" view to the "df_week_bounds" view on
# year_week (a plain JOIN is an inner join, so rows whose year_week has no
# entry in the week bounds are not returned) and orders the result by country.
#
# NOTE(review): the variable name is spelled with three s's
# ("processsed"); the write cell refers to it by this exact name.
df_date_processsed = spark.sql("""SELECT t.*,
                                   w.week_start_date,
                                   w.week_end_date
  FROM df_testing_country_lookup t
  JOIN df_week_bounds w ON t.year_week = w.year_week
 ORDER BY country""")

In [0]:
# Persist the final result as comma-delimited CSV with a header row,
# replacing any output already present at the target path.
(
    df_date_processsed.write
    .format("com.databricks.spark.csv")
    .options(header="true", delimiter=",")
    .mode("overwrite")
    .save("abfss://processed@covidreportdl712.dfs.core.windows.net/ecdc/testing")
)