In [0]:
from pyspark.sql.types import StringType,IntegerType,DoubleType,TimestampType,DateType,StructType,StructField
from pyspark.sql.functions import col,lit
from pyspark.sql import functions as sf
import json


In [0]:
# Fetch the client id, tenant id and client secret from the "vsarthi-scope"
# Databricks secret scope; they feed the OAuth settings configured below.
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope="vsarthi-scope", key=secret_key)
    for secret_key in (
        "vsarthista-client-id",
        "vsarthista-tenant-id",
        "vsarthista-client-secret",
    )
)

In [0]:
# Session-level settings that make Spark authenticate against the
# vsarthista.dfs.core.windows.net endpoint with OAuth client credentials.
# The mapping keeps insertion order, so settings are applied top to bottom.
_adls_oauth_settings = {
    "fs.azure.account.auth.type.vsarthista.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.vsarthista.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.vsarthista.dfs.core.windows.net": client_id,
    "fs.azure.account.oauth2.client.secret.vsarthista.dfs.core.windows.net": client_secret,
    # The token endpoint is tenant-specific, hence the interpolated tenant id.
    "fs.azure.account.oauth2.client.endpoint.vsarthista.dfs.core.windows.net": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
for _setting_name, _setting_value in _adls_oauth_settings.items():
    spark.conf.set(_setting_name, _setting_value)

In [0]:
def path_gen(container: str, storage_acc: str) -> str:
    """Build the abfss:// root URI for a container in a storage account.

    Args:
        container: Name of the container (placed before the ``@``).
        storage_acc: Name of the storage account (placed before
            ``.dfs.core.windows.net``).

    Returns:
        A string of the form
        ``abfss://<container>@<storage_acc>.dfs.core.windows.net`` with no
        trailing slash; callers append the relative file path themselves.
    """
    authority = f"{container}@{storage_acc}.dfs.core.windows.net"
    return f"abfss://{authority}"

In [0]:
# Explicit schema applied when reading drivers.json. `name` is a nested
# struct holding the forename and surname strings.
# NOTE(review): driverId is declared non-nullable; confirm whether the JSON
# reader actually enforces that or relaxes it to nullable.
new_schema = (
    StructType()
    .add("code", StringType(), True)
    .add("dob", DateType(), True)
    .add("driverId", IntegerType(), False)
    .add(
        "name",
        StructType(
            [
                StructField("forename", StringType(), True),
                StructField("surname", StringType(), True),
            ]
        ),
        True,
    )
    .add("nationality", StringType(), True)
    .add("number", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load drivers.json from the "raw" container using the explicit schema
# instead of letting Spark infer one.
raw_df = spark.read.json(
    path_gen("raw", "vsarthista") + "/drivers.json",
    schema=new_schema,
)

In [0]:
# Preview a handful of rows to sanity-check that the schema was applied.
display(raw_df.limit(5))

In [0]:
# Replace the nested `name` struct with a single "forename surname" string.
# Note: concat yields NULL when either part is NULL.
# NOTE(review): the variable name is misspelled ("trasnformed"); it is kept
# as-is because the select in the following cell references it.
trasnformed_df = raw_df.withColumn(
    "name",
    sf.concat(col("name.forename"), lit(" "), col("name.surname")),
)

In [0]:
# Keep only the columns that are carried forward (url is dropped) and rename
# dob / driverId to snake_case.
trimmed_df = trasnformed_df.select(
    "code",
    col("dob").alias("date_of_birth"),
    col("driverId").alias("driver_id"),
    "name",
    "nationality",
    "number",
)
# Show a few rows to verify the projection and renames.
display(trimmed_df.limit(5))

In [0]:
# Append an audit column with the timestamp at which the rows were ingested.
audited_df = trimmed_df.select(
    "*",
    sf.current_timestamp().alias("ingestion_timestamp"),
)

In [0]:
# Persist the audited drivers data as Parquet in the "processed" container,
# replacing any previous output at that location.
# NOTE(review): the target path ends in ".json" although the data is written
# as Parquet; the read-back cell uses the same path, so rename both together.
audited_df.write.parquet(
    path_gen("processed", "vsarthista") + "/drivers.json",
    mode="overwrite",
)

In [0]:
# Read the Parquet output back from the "processed" container to verify the write.
df = (
    spark.read
    .format("parquet")
    .load(path_gen("processed", "vsarthista") + "/drivers.json")
)

In [0]:
# Render the re-read data for a final visual check.
display(df)