In [0]:
%run ./configurations/paths_config

In [0]:
from pyspark.sql.types import StringType,IntegerType,DoubleType,TimestampType,StructType,StructField
from pyspark.sql.functions import col
from pyspark.sql import functions as sf

In [0]:
# Fetch the OAuth client-credential values (client id, tenant id, client
# secret) from the "vsarthi-scope" secret scope so that none of them is
# hard-coded in the notebook. The lookups run in the order id -> tenant -> secret.
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope="vsarthi-scope", key=secret_key)
    for secret_key in (
        "vsarthista-client-id",
        "vsarthista-tenant-id",
        "vsarthista-client-secret",
    )
)

In [0]:
# Session-level Spark settings that make the `vsarthista` storage account
# authenticate via OAuth client credentials (ClientCredsTokenProvider),
# using the client id / secret / tenant id fetched earlier in the notebook.
_oauth_settings = {
    "fs.azure.account.auth.type.vsarthista.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.vsarthista.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.vsarthista.dfs.core.windows.net": client_id,
    "fs.azure.account.oauth2.client.secret.vsarthista.dfs.core.windows.net": client_secret,
    # The token endpoint is tenant-specific, hence the interpolated tenant id.
    "fs.azure.account.oauth2.client.endpoint.vsarthista.dfs.core.windows.net": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}
# dicts preserve insertion order, so the settings are applied in the order listed.
for _conf_key, _conf_value in _oauth_settings.items():
    spark.conf.set(_conf_key, _conf_value)

In [0]:
def path_gen(container, storage_acc):
    """Return the ``abfss://`` root URI for a container in a storage account.

    Args:
        container: Name of the storage container (placed before the ``@``).
        storage_acc: Name of the storage account (placed before
            ``.dfs.core.windows.net``).

    Returns:
        A string of the form
        ``abfss://<container>@<storage_acc>.dfs.core.windows.net`` with no
        trailing slash.
    """
    uri_template = "abfss://{}@{}.dfs.core.windows.net"
    return uri_template.format(container, storage_acc)

In [0]:
# Explicit schema applied when reading circuits.csv, so column types are
# declared here rather than inferred. Column order matches the order below.
# NOTE(review): circuitId is declared non-nullable; verify whether the CSV
# reader actually enforces that on load.
new_schema = (
    StructType()
    .add("circuitId", IntegerType(), False)
    .add("circuitRef", StringType(), True)
    .add("name", StringType(), True)
    .add("location", StringType(), True)
    .add("country", StringType(), True)
    .add("lat", DoubleType(), True)
    .add("lng", DoubleType(), True)
    .add("alt", IntegerType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load circuits.csv with the explicit schema, treating the first line as a
# header row. `raw_path` is not defined in this notebook's visible cells --
# presumably it comes from the %run of paths_config; confirm.
raw_df = (
    spark.read
    .schema(new_schema)
    .option("header", "true")
    .csv(raw_path + "/circuits.csv")
)

In [0]:
# The `url` column is not carried forward into the output.
trimmed_df = raw_df.drop(
    "url",
)

In [0]:
# Rename the camelCase / abbreviated source columns to snake_case,
# spelled-out names. One rename per line keeps the mapping easy to scan.
renamed_df = (
    trimmed_df
    .withColumnRenamed("circuitId", "circuit_id")
    .withColumnRenamed("circuitRef", "circuit_ref")
    .withColumnRenamed("lat", "latitude")
    .withColumnRenamed("lng", "longitude")
    .withColumnRenamed("alt", "altitude")
)

In [0]:
# Audit column: stamp every row with the timestamp at which this load is evaluated.
audited_df = renamed_df.withColumn(
    "ingestion_time",
    sf.current_timestamp(),
)

In [0]:
# Persist the result as Parquet, replacing any previous output at the target.
# `processed_path` is not defined in this notebook's visible cells --
# presumably it comes from the %run of paths_config; confirm.
# NOTE(review): the target is named "circuits.csv" although the data written
# is Parquet. The name is kept as-is because downstream readers may depend on
# it; consider renaming (e.g. "/circuits") together with those readers.
audited_df.write.mode("overwrite").parquet(processed_path + "/circuits.csv")