## Developing dataset for BBC Formula 1 race records

In [0]:
from pyspark.sql.functions import col

In [0]:
%run ./configurations/paths_config

In [0]:
# Pull the three service-principal values from the Databricks secret scope so
# that none of them is ever written into the notebook. The lookups run in the
# order the target names are listed on the left-hand side.
client_id, tenant_id, client_secret = (
    dbutils.secrets.get(scope="vsarthi-scope", key=secret_key)
    for secret_key in (
        "vsarthista-client-id",
        "vsarthista-tenant-id",
        "vsarthista-client-secret",
    )
)

In [0]:
# Session-level ABFS settings for the vsarthista storage account: authenticate
# with OAuth through the client-credentials token provider, using the
# client id / secret / tenant id fetched from the secret scope.
adls_oauth_settings = {
    "fs.azure.account.auth.type.vsarthista.dfs.core.windows.net": "OAuth",
    "fs.azure.account.oauth.provider.type.vsarthista.dfs.core.windows.net": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    "fs.azure.account.oauth2.client.id.vsarthista.dfs.core.windows.net": client_id,
    "fs.azure.account.oauth2.client.secret.vsarthista.dfs.core.windows.net": client_secret,
    "fs.azure.account.oauth2.client.endpoint.vsarthista.dfs.core.windows.net": f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
}

# dicts preserve insertion order, so the settings are applied top to bottom.
for conf_key, conf_value in adls_oauth_settings.items():
    spark.conf.set(conf_key, conf_value)

In [0]:
# Driver attributes needed for the race-records dataset. `name` and `number`
# are renamed so they stay unambiguous once joined with the other tables.
driver_df = (
    spark.read.parquet(processed_path + "/drivers.json")
    .select(
        col("driver_id"),
        col("name").alias("driver_name"),
        col("number").alias("driver_number"),
        col("nationality"),
    )
)
display(driver_df)

In [0]:
# Circuit attributes needed for the race-records dataset.
# `circuit_id` is selected exactly once: selecting it twice produces two
# identically named columns, and any later `circuits_df.circuit_id` reference
# (e.g. as a join key) then fails as an ambiguous column reference.
circuits_df = (
    spark.read.parquet(processed_path + "/circuits.csv")
    .select(
        col("name").alias("circuit_name"),
        col("circuit_id"),
        col("location"),
    )
)
circuits_df.display()

In [0]:
# Race attributes needed for the race-records dataset; `year` is exposed as
# `race_year`, and `race_id` / `circuit_id` are kept as join keys.
race_df = (
    spark.read.parquet(processed_path + "/races.csv")
    .select(
        col("year").alias("race_year"),
        col("race_timestamp"),
        col("race_id"),
        col("circuit_id"),
    )
)
display(race_df)

### Data to be picked

- race_year->races
- circuit_name->circuits
- driver_id->drivers
- driver_name->drivers
- driver_number->drivers
- grid->results
- count(pits)->pitStops
- fastest_lap->results
- race_time->results
- points->results


In [0]:
# Per-driver race results: the measures to report plus the `race_id` and
# `driver_id` keys. `time` is exposed as `race_time`.
results_df = (
    spark.read.parquet(processed_path + "/results.json")
    .select(
        col("grid"),
        col("fastest_lap"),
        col("points"),
        col("time").alias("race_time"),
        col("race_id"),
        col("driver_id"),
    )
)
display(results_df)

In [0]:
# Pit-stop records, one row per stop (presumably - TODO confirm against the
# processed schema). An empty `.select()` would return a frame with no columns
# at all, so the keys that the pit-stop count relies on (`race_id`,
# `driver_id`) are selected explicitly.
pit_stops_df = (
    spark.read.parquet(processed_path + "/pit_stops.json")
    .select(col("race_id"), col("driver_id"))
)
pit_stops_df.display()

In [0]:
count_pit_stops_df = driver_df.join(pit_stops_df,driver_df.driver_id==pit_stops_df.driver_id,"inner").groupBy(pit_stops_df.race_id).count()

In [0]:
display(count_pit_stops)

In [0]:
joint_df = results_df.join(driver_df,driver_df.driver_id == results_df.driver_id,"inner").join(race_df, race_df.race_id == results_df.race_id,"inner").join(circuits_df, circuits_df.circuit_id == race_df.circuit_id,"inner").join(count_pit_stops_df, count_pit_stops_df.race_id == race_df.race_id,"inner").drop("driver_id","circuit_id","race_id")
joint_df.display()