In [0]:
from pyspark.sql.types import StringType,IntegerType,DoubleType,TimestampType,DateType,StructType,StructField,DoubleType
from pyspark.sql.functions import col,lit
from pyspark.sql import functions as sf
import json


In [0]:
# Service-principal credentials are pulled from a Databricks secret scope so
# that no secret value is ever written into the notebook itself.
secret_scope = "vsarthi-scope"

client_id = dbutils.secrets.get(scope=secret_scope, key="vsarthista-client-id")
tenant_id = dbutils.secrets.get(scope=secret_scope, key="vsarthista-tenant-id")
client_secret = dbutils.secrets.get(scope=secret_scope, key="vsarthista-client-secret")

In [0]:
# Session-level OAuth (client-credentials) settings for the `vsarthista`
# storage account. Kept as a table so the five keys can be compared at a
# glance; they are applied in the order listed.
oauth_settings = {
    "fs.azure.account.auth.type.vsarthista.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.vsarthista.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.vsarthista.dfs.core.windows.net": client_id,
    "fs.azure.account.oauth2.client.secret.vsarthista.dfs.core.windows.net": client_secret,
    "fs.azure.account.oauth2.client.endpoint.vsarthista.dfs.core.windows.net": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

for conf_key, conf_value in oauth_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
def path_gen(container,storage_acc):
    """Return the `abfss://` root URI for a container in a storage account.

    Args:
        container: Name of the container (placed before the `@`).
        storage_acc: Name of the storage account (prefix of the DFS host).

    Returns:
        A string of the form
        `abfss://<container>@<storage_acc>.dfs.core.windows.net` with no
        trailing slash; callers append the relative path themselves.
    """
    dfs_host = f"{storage_acc}.dfs.core.windows.net"
    return f"abfss://{container}@{dfs_host}"

In [0]:
# Explicit schema applied when reading the raw results JSON, so column types do
# not depend on Spark's schema inference.
#
# Two columns are deliberately NOT integers:
#   * positionText - a text field (presumably holds non-numeric codes as well as
#     finishing positions - confirm against the raw file). Values that are not
#     JSON integers cannot be loaded into an IntegerType column, so it is read
#     as a string.
#   * points       - scores may be fractional and/or serialized as floats
#     (e.g. 10.0), which an IntegerType column cannot hold; DoubleType accepts
#     both integer and floating-point JSON numbers.
new_schema = StructType(fields=[
    StructField("constructorId", IntegerType(), False),
    StructField("driverId", IntegerType(), False),
    StructField("fastestLap", IntegerType(), True),
    StructField("fastestLapSpeed", DoubleType(), True),
    StructField("fastestLapTime", StringType(), True),
    StructField("grid", IntegerType(), True),
    StructField("laps", IntegerType(), True),
    StructField("milliseconds", IntegerType(), True),
    StructField("number", IntegerType(), True),
    StructField("points", DoubleType(), True),
    StructField("position", IntegerType(), True),
    StructField("positionOrder", IntegerType(), True),
    StructField("positionText", StringType(), True),
    StructField("raceId", IntegerType(), True),
    StructField("rank", IntegerType(), True),
    StructField("resultId", IntegerType(), True),
    StructField("statusId", IntegerType(), True),
    StructField("time", StringType(), True),
])

In [0]:
# Load the raw results file using the explicit schema defined above instead of
# letting Spark infer one from the data.
raw_results_path = path_gen("raw","vsarthista")+"/results.json"
raw_df = spark.read.json(raw_results_path, schema=new_schema)

In [0]:
# camelCase source columns -> snake_case output columns. Columns that are not
# listed here keep their original names.
column_renames = {
    "constructorId": "constructor_id",
    "driverId": "driver_id",
    "fastestLap": "fastest_lap",
    "fastestLapSpeed": "fastest_lap_speed",
    "fastestLapTime": "fastest_lap_time",
    "positionOrder": "position_order",
    "positionText": "position_text",
    "raceId": "race_id",
    "resultId": "result_id",
}

# Always start from raw_df so re-running this cell gives the same result.
renamed_df = raw_df
for source_name, target_name in column_renames.items():
    renamed_df = renamed_df.withColumnRenamed(source_name, target_name)

In [0]:
# statusId is removed before the data is written out.
columns_to_drop = ("statusId",)
trimmed_df = renamed_df.drop(*columns_to_drop)

In [0]:
# Audit column: stamp every row with the timestamp of this ingestion run.
ingestion_ts = sf.current_timestamp()
audited_df = trimmed_df.withColumn("ingestion_timestamp", ingestion_ts)

In [0]:
# Persist the result as parquet, one sub-directory per race_id. "overwrite"
# replaces whatever is already stored at the target path.
# NOTE(review): the target is named "results.json" even though the content is
# parquet; left unchanged here because other readers may depend on this path.
processed_results_path = path_gen("processed","vsarthista")+"/results.json"
(
    audited_df.write
    .mode("overwrite")
    .partitionBy("race_id")
    .parquet(processed_results_path)
)

In [0]:
# Read the processed dataset back from storage to verify what was written.
df = spark.read.format("parquet").load(path_gen("processed","vsarthista")+"/results.json")

In [0]:
# Render the data that was read back from the processed location.
display(df)

In [0]:
# Small preview of the in-memory frame (pre-write), capped at five rows.
audited_preview_df = audited_df.limit(5)
audited_preview_df.display()