In [0]:
from pyspark.sql.types import StringType,IntegerType,DoubleType,TimestampType,StructType,StructField
from pyspark.sql.functions import col
from pyspark.sql import functions as sf

In [0]:
# Pull the service-principal credentials from the "vsarthi-scope" secret
# scope. The three values are fetched in the same order as the names on the
# left-hand side and are consumed by the OAuth configuration further down.
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope="vsarthi-scope", key=secret_key)
    for secret_key in (
        "vsarthista-client-id",
        "vsarthista-tenant-id",
        "vsarthista-client-secret",
    )
)

In [0]:
# Configure OAuth (client-credentials) access to the `vsarthista` storage
# account. Every setting is applied to the active Spark session in the order
# listed below.
for conf_key, conf_value in (
    ("fs.azure.account.auth.type.vsarthista.dfs.core.windows.net", "OAuth"),
    (
        "fs.azure.account.oauth.provider.type.vsarthista.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    ("fs.azure.account.oauth2.client.id.vsarthista.dfs.core.windows.net", client_id),
    ("fs.azure.account.oauth2.client.secret.vsarthista.dfs.core.windows.net", client_secret),
    (
        "fs.azure.account.oauth2.client.endpoint.vsarthista.dfs.core.windows.net",
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    ),
):
    spark.conf.set(conf_key, conf_value)

In [0]:
def path_gen(container, storage_acc):
    """Build the root `abfss://` URI for a container in a storage account.

    Args:
        container: Name of the container (e.g. "raw").
        storage_acc: Name of the storage account.

    Returns:
        The URI string "abfss://<container>@<storage_acc>.dfs.core.windows.net"
        with no trailing slash; callers append their own sub-path.
    """
    uri_template = "abfss://{}@{}.dfs.core.windows.net"
    return uri_template.format(container, storage_acc)

In [0]:
# Explicit schema applied when reading the qualifying JSON files.
# Field names have to match the JSON keys character-for-character: a schema
# field that matches no key in the input is read back as null for every row,
# so the names must not carry stray whitespace.
# NOTE(review): the third session key is presumed to be "q3" (by analogy with
# "q1"/"q2") -- confirm against the raw files.
new_schema = StructType(fields=[
    StructField("qualifyId", IntegerType(), False),
    StructField("raceId", IntegerType(), True),
    StructField("driverId", IntegerType(), True),
    StructField("constructorId", IntegerType(), True),
    StructField("number", IntegerType(), True),
    StructField("position", IntegerType(), True),
    StructField("q1", StringType(), True),
    StructField("q2", StringType(), True),
    StructField("q3", StringType(), True),
])

In [0]:
# Load the qualifying JSON from the "raw" container using the explicit schema.
# The "multiline" option is enabled so records spanning several lines parse.
raw_df = (
    spark.read
    .format("json")
    .schema(new_schema)
    .option("multiline", True)
    .load(path_gen("raw", "vsarthista") + "/qualifying")
)

In [0]:
# Render the freshly loaded rows for a visual check of the parsed columns.
raw_df.display()

In [0]:
# Convert the camelCase id columns to snake_case; all other columns keep
# their names.
# NOTE(review): "constructorId" is not renamed here, unlike the other three
# id columns -- confirm whether downstream readers expect "constructor_id".
renamed_df = (
    raw_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
)

In [0]:
# Add an audit column holding the timestamp at which this ingestion ran.
audited_df = renamed_df.withColumn(
    colName="ingestion_time",
    col=sf.current_timestamp(),
)

In [0]:
# Persist the audited frame as parquet under the "processed" container,
# replacing whatever an earlier run left at that location.
(
    audited_df.write
    .mode("overwrite")
    .parquet(path_gen("processed", "vsarthista") + "/qualifying")
)

In [0]:
# Read the just-written parquet back and render it to verify the output.
df = (
    spark.read
    .format("parquet")
    .load(path_gen("processed", "vsarthista") + "/qualifying")
)
df.display()