In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare a text widget named "p_data_source" (empty default) and read its
# current value; the value is stamped onto every row as `data_source` below.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare a text widget named "p_file_date" (default "2021-03-28") and read its
# current value; it selects the dated sub-folder that pit_stops.json is read
# from and is also stamped onto every row as `file_date`.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [0]:
# Explicit schema applied when reading pit_stops.json, so Spark does not have
# to infer column types. Built field by field: (name, type, nullable).
pit_stops_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), True)
    .add("stop", StringType(), True)
    .add("lap", IntegerType(), True)
    .add("time", StringType(), True)
    .add("duration", StringType(), True)
    .add("milliseconds", IntegerType(), True)
)

In [0]:
# Load the pit stops file for the requested file date. The file is read in
# multi-line mode with the explicit schema defined above.
pit_stops_df = spark.read.json(
    f"{incraw_folder_path}/{v_file_date}/pit_stops.json",
    schema=pit_stops_schema,
    multiLine=True,
)

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it is brought
# in by the `%run "../includes/common_functions"` cell — TODO confirm.
# NOTE(review): judging by its name it adds an ingestion timestamp column to
# the DataFrame; verify against the helper's definition.
pit_stops_with_ingestion_date_df = add_ingestion_date(pit_stops_df)

In [0]:
from pyspark.sql.functions import lit

In [0]:
# current_timestamp is imported explicitly here: the notebook otherwise only
# imports `lit`, so this cell previously relied on the name leaking in from a
# %run include, which breaks as soon as that include stops importing it.
from pyspark.sql.functions import current_timestamp

# Shape the output table: rename the camelCase id columns to snake_case and
# stamp each row with audit columns (ingestion time, data source, file date).
final_df = (
    pit_stops_with_ingestion_date_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("raceId", "race_id")
    # NOTE(review): add_ingestion_date (applied upstream) presumably already
    # sets this column; the re-stamp is kept to preserve existing behavior —
    # confirm against the helper and drop whichever one is redundant.
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)


In [0]:
# Render a preview of the transformed DataFrame in the notebook output for a
# visual sanity check before it is written.
display(final_df)

race_id,driver_id,stop,lap,time,duration,milliseconds,ingestion_date,data_source,file_date
1053,839,1,1,15:05:16,30.866,30866,2024-12-22T06:57:57.392Z,,2021-04-18
1053,20,1,3,15:10:09,32.024,32024,2024-12-22T06:57:57.392Z,,2021-04-18
1053,854,1,5,15:15:11,51.007,51007,2024-12-22T06:57:57.392Z,,2021-04-18
1053,853,1,12,15:27:20,31.168,31168,2024-12-22T06:57:57.392Z,,2021-04-18
1053,842,1,14,15:30:10,31.068,31068,2024-12-22T06:57:57.392Z,,2021-04-18
1053,20,2,20,15:39:11,31.184,31184,2024-12-22T06:57:57.392Z,,2021-04-18
1053,854,2,21,15:41:24,32.479,32479,2024-12-22T06:57:57.392Z,,2021-04-18
1053,20,3,22,15:42:52,39.502,39502,2024-12-22T06:57:57.392Z,,2021-04-18
1053,853,2,23,15:45:20,31.5,31500,2024-12-22T06:57:57.392Z,,2021-04-18
1053,852,1,25,15:46:39,30.696,30696,2024-12-22T06:57:57.392Z,,2021-04-18


In [0]:
# overwrite_partition is not defined in this notebook; presumably it comes
# from the `%run "../includes/common_functions"` cell — TODO confirm.
# NOTE(review): the arguments look like (DataFrame, database name, table name,
# partition column), i.e. write final_df to f1_inc_processed.pit_stops
# partitioned by race_id — verify against the helper's definition.
overwrite_partition(final_df, 'f1_inc_processed', 'pit_stops', 'race_id')

In [0]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a calling notebook or job can check the outcome.
dbutils.notebook.exit("Success")