
## Ingest qualifying_split.json

%fs ls /mnt/formula1dlsaga/raw/qualifying

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter "p_data_source" (defaults to an empty string). Its value
# is later written to every row as the data_source column.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter "p_file_date" (defaults to "2021-03-21"). The value selects
# the dated sub-folder that is read from the raw container and is also written
# to every row as the file_date column.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")


In [0]:
# Explicit schema applied when reading the qualifying JSON, so column types are
# declared up front rather than inferred. The four id columns are declared
# non-nullable; every other column is nullable.
qualifying_schema = (
    StructType()
    .add("qualifyId", IntegerType(), False)
    .add("raceId", IntegerType(), False)
    .add("driverId", IntegerType(), False)
    .add("constructorId", IntegerType(), False)
    .add("number", IntegerType(), True)
    .add("position", IntegerType(), True)
    .add("q1", StringType(), True)
    .add("q2", StringType(), True)
    .add("q3", StringType(), True)
)

In [0]:
# Load the qualifying folder for the requested file date using the explicit
# schema. multiLine is switched on so a single JSON record may span several
# lines of a file.
qualifying_df = (
    spark.read
    .schema(qualifying_schema)
    .option("multiLine", True)
    .json(f"{raw_folder_path}/{v_file_date}/qualifying")
)

In [0]:
# Rename the camelCase id columns to snake_case, then tag every row with the
# two notebook parameters (data source and file date) as literal columns.
qualifying_final_df_func = (
    qualifying_df
    .withColumnRenamed("qualifyId", "qualify_id")
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("constructorId", "constructor_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# Pass the renamed DataFrame through the add_ingestion_date helper.
# NOTE(review): the helper is not defined in this notebook; presumably it is
# loaded from ../includes/common_functions and appends an ingestion timestamp
# column -- confirm against that notebook.
qualifying_final_df = add_ingestion_date(qualifying_final_df_func)

In [0]:
# Re-order the columns through the move_column_to_last helper, naming "race_id".
# NOTE(review): the helper is not defined in this notebook; presumably race_id
# is moved last because it is also the column passed as the final argument to
# merge_delta_data -- confirm against ../includes/common_functions.
qualifying_final_df = move_column_to_last(qualifying_final_df, "race_id")

In [0]:
# Match rows on qualify_id and race_id when merging into f1_processed.qualifying.
# NOTE(review): the tgt/src aliases are presumably bound to the target table and
# the incoming DataFrame inside merge_delta_data -- confirm in common_functions.
merge_condition = (
    "tgt.qualify_id = src.qualify_id "
    "AND tgt.race_id = src.race_id"
)
merge_delta_data(
    qualifying_final_df,
    "f1_processed",
    "qualifying",
    processed_folder_path,
    merge_condition,
    "race_id",
)

In [0]:
# qualifying_final_df.write.parquet(f"{processed_folder_path}/qualifying", mode="overwrite")

In [0]:
# write_to_database(qualifying_final_df, "f1_processed", "qualifying", "race_id")

In [0]:
# qualifying_final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.qualifying")

In [0]:
%sql
-- Sanity check: row counts per race for the five highest race_id values
-- in the processed qualifying table.
SELECT race_id, COUNT(1)
FROM f1_processed.qualifying
GROUP BY race_id
ORDER BY race_id DESC
LIMIT 5;

race_id,count(1)
1053,20
1052,20
1047,20
1046,20
1045,20


In [0]:
# End the notebook run and hand the string "Success" back as its exit value,
# so a calling notebook or job can check the outcome.
dbutils.notebook.exit("Success")