In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare the "p_data_source" text widget (default: empty string) and read its
# current value. The value is written into the data_source column further down.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare the "p_file_date" text widget (default "2021-03-28") and read its
# current value. The value selects the dated sub-folder that results.json is
# read from and is also written into the file_date column.
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType

In [0]:
# Explicit schema for results.json, one .add() per source field
# (name, type, nullable). Only resultId is declared non-nullable.
results_schema = (
    StructType()
    .add("resultId", IntegerType(), False)
    .add("raceId", IntegerType(), True)
    .add("driverId", IntegerType(), True)
    .add("constructorId", IntegerType(), True)
    .add("number", IntegerType(), True)
    .add("grid", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("positionText", StringType(), True)
    .add("positionOrder", IntegerType(), True)
    .add("points", FloatType(), True)
    .add("laps", IntegerType(), True)
    .add("time", StringType(), True)
    .add("milliseconds", IntegerType(), True)
    .add("fastestLap", IntegerType(), True)
    .add("rank", IntegerType(), True)
    .add("fastestLapTime", StringType(), True)
    .add("fastestLapSpeed", FloatType(), True)
    .add("statusId", StringType(), True)
)

In [0]:
# Read the results file for the requested file date, applying the explicit
# schema instead of letting Spark infer one.
results_source_path = f"{incraw_folder_path}/{v_file_date}/results.json"
results_df = spark.read.json(results_source_path, schema=results_schema)

In [0]:

from pyspark.sql.functions import lit

In [0]:
# camelCase source column -> snake_case target column.
# statusId is intentionally absent: it keeps its source name and is dropped later.
renamed_columns = {
    "resultId": "result_id",
    "raceId": "race_id",
    "driverId": "driver_id",
    "constructorId": "constructor_id",
    "positionText": "position_text",
    "positionOrder": "position_order",
    "fastestLap": "fastest_lap",
    "fastestLapTime": "fastest_lap_time",
    "fastestLapSpeed": "fastest_lap_speed",
}

# Apply the renames one by one, always starting from results_df so the cell
# can be re-run safely.
results_with_columns_df = results_df
for source_name, target_name in renamed_columns.items():
    results_with_columns_df = results_with_columns_df.withColumnRenamed(source_name, target_name)

# Tag every row with the widget values that identify this load.
results_with_columns_df = (
    results_with_columns_df
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Pass the renamed DataFrame through add_ingestion_date. The helper is not
# defined in this notebook; presumably it comes from the %run of
# ../includes/common_functions and adds the ingestion_date column that shows
# up in the output table -- TODO confirm.
results_with_ingestion_date_df = add_ingestion_date(results_with_columns_df)

In [0]:
from pyspark.sql.functions import col

In [0]:
# statusId is not carried into the processed table; it still has its source
# (camelCase) name because it was never renamed.
results_final_df = results_with_ingestion_date_df.drop("statusId")

In [0]:
# Keep a single row per (race_id, driver_id) pair before merging.
dedupe_keys = ["race_id", "driver_id"]
results_deduped_df = results_final_df.dropDuplicates(dedupe_keys)

In [0]:
# Merge the de-duplicated rows into f1_inc_processed.results_dt.
# merge_delta_data is not defined in this notebook (presumably it comes from
# ../includes/common_functions); the "tgt"/"src" aliases in the condition are
# assumed to be the ones it assigns to the target table and the incoming
# DataFrame, and the last argument is assumed to be the partition column --
# TODO confirm against the helper.
merge_condition = "tgt.result_id = src.result_id AND tgt.race_id = src.race_id"
merge_delta_data(
    results_deduped_df,
    "f1_inc_processed",
    "results_dt",
    incprocessed_folder_path,
    merge_condition,
    "race_id",
)

In [0]:
%sql
-- Preview a handful of merged rows. The query is bounded with LIMIT so that
-- every run of the notebook does not pull the whole table back for display.
-- No ORDER BY is given, so which rows are shown is not guaranteed.
SELECT *
FROM f1_inc_processed.results_dt
LIMIT 10

result_id,race_id,driver_id,constructor_id,number,grid,position,position_text,position_order,points,laps,time,milliseconds,fastest_lap,rank,fastest_lap_time,fastest_lap_speed,data_source,file_date,ingestion_date
20677,351,1,1,2.0,3,,R,19,0.0,35,\N,,31.0,10.0,1:50.750,164.901,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20676,351,2,15,22.0,14,,R,18,0.0,36,\N,,29.0,16.0,1:52.475,162.372,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20663,351,3,131,4.0,7,5.0,5,5,10.0,61,+49.394,7122973.0,55.0,8.0,1:50.125,165.837,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20659,351,4,6,8.0,1,1.0,1,1,25.0,61,1:57:53.579,7073579.0,58.0,1.0,1:47.976,169.137,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20674,351,5,205,19.0,19,16.0,16,16,0.0,58,\N,,53.0,18.0,1:53.051,161.544,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20665,351,9,4,11.0,8,7.0,7,7,6.0,61,+1:26.559,7160138.0,56.0,3.0,1:49.255,167.157,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20675,351,10,166,24.0,18,,R,17,0.0,49,\N,,38.0,19.0,1:53.559,160.822,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20666,351,13,6,7.0,24,8.0,8,8,4.0,61,+1:53.297,7186876.0,45.0,12.0,1:52.079,162.945,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20681,351,15,205,18.0,21,,R,23,0.0,27,\N,,20.0,21.0,1:56.386,156.915,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z
20667,351,16,10,14.0,15,9.0,9,9,2.0,61,+2:02.416,7195995.0,45.0,15.0,1:52.473,162.374,ergest_api,2021-03-21,2024-12-21T02:01:22.457Z


In [0]:
%sql
-- Total number of rows in the processed results table after the merge.
SELECT COUNT(*)
FROM f1_inc_processed.results_dt

count(1)
24909


In [0]:
# Hand the string "Success" back to whatever invoked this notebook.
# NOTE(review): the validation queries are placed after this exit call, so
# presumably they do not execute in a job / Run All run and are meant for
# manual checks only -- confirm that this is intended.
dbutils.notebook.exit("Success")

In [0]:
 %sql
-- Sanity check: list every (race_id, driver_id) pair that occurs more than
-- once in the processed table. An empty result means no duplicates.
-- Note that DESC applies to driver_id only; race_id sorts ascending.
SELECT
  race_id,
  driver_id,
  COUNT(1)
FROM f1_inc_processed.results_dt
GROUP BY
  race_id,
  driver_id
HAVING COUNT(1) > 1
ORDER BY
  race_id,
  driver_id DESC;

race_id,driver_id,count(1)


In [0]:
%sql
-- Number of result rows per race, most recent race_id first.
SELECT
  race_id,
  COUNT(1)
FROM f1_inc_processed.results_dt
GROUP BY
  race_id
ORDER BY
  race_id DESC;

race_id,count(1)
1053,20
1052,20
1047,20
1046,20
1045,20
1044,20
1043,20
1042,20
1041,20
1040,20
