
## Ingest multiple lap_times CSV files into the processed lap_times table

###%fs ls /mnt/formula1dlsaga/raw/lap_times

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, TimestampType
from pyspark.sql.functions import col, current_timestamp, lit

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Declare a text widget named "p_data_source" (default: empty string) and read
# its current value. The value is written to every row as the "data_source"
# column further down in this notebook.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Declare a text widget named "p_file_date" (default: "2021-03-21") and read its
# current value. The value picks the sub-folder of raw_folder_path that is read
# and is also written to every row as the "file_date" column.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date= dbutils.widgets.get("p_file_date")

In [0]:
# Explicit schema for the lap_times CSV files, listed in column order.
# Each tuple is (column name, Spark type, nullable); only raceId is declared
# non-nullable.
laptimes_schema = StructType(
    fields=[
        StructField(column_name, column_type, is_nullable)
        for column_name, column_type, is_nullable in (
            ("raceId", IntegerType(), False),
            ("driverId", IntegerType(), True),
            ("lap", IntegerType(), True),
            ("position", IntegerType(), True),
            ("time", StringType(), True),
            ("milliseconds", IntegerType(), True),
        )
    ]
)

In [0]:
# Read the lap_times folder for the selected file date using the explicit
# schema. The path names a folder, not a single file.
laptimes_df = spark.read.csv(
    f"{raw_folder_path}/{v_file_date}/lap_times",
    schema=laptimes_schema,
)

In [0]:
# Rename the camelCase id columns to snake_case and tag every row with the
# run parameters (data source and file date) as literal columns.
laptimes_final_df_func = (
    laptimes_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

In [0]:
# add_ingestion_date is not defined in this notebook; presumably it is provided
# by the "../includes/common_functions" notebook loaded via %run above.
# NOTE(review): the name suggests it appends an ingestion timestamp column -
# confirm against the helper's definition.
laptimes_final_df = add_ingestion_date(laptimes_final_df_func)

In [0]:
# move_column_to_last is not defined in this notebook; presumably it is provided
# by the "../includes/common_functions" notebook loaded via %run above.
# NOTE(review): the name suggests it reorders the columns so that "race_id"
# (the column passed as the last argument to merge_delta_data below) comes
# last - confirm against the helper's definition.
laptimes_final_df = move_column_to_last(laptimes_final_df, "race_id")

In [0]:
# Match target (tgt) and source (src) rows on race, driver and lap.
# The original condition repeated "tgt.race_id = src.race_id" a second time at
# the end; the duplicate predicate was redundant and has been dropped.
merge_condition = (
    "tgt.race_id = src.race_id "
    "AND tgt.driver_id = src.driver_id "
    "AND tgt.lap = src.lap"
)

# merge_delta_data is not defined in this notebook; presumably it is provided by
# the "../includes/common_functions" notebook loaded via %run above.
# NOTE(review): arguments look like (dataframe, database, table, base folder,
# merge condition, partition column) - confirm against the helper's signature.
merge_delta_data(laptimes_final_df, 'f1_processed', 'lap_times', processed_folder_path, merge_condition, 'race_id')

In [0]:
%sql
--DROP TABLE f1_processed.lap_times;

In [0]:
# laptimes_final_df.write.parquet(f"{processed_folder_path}/lap_times", mode="overwrite")

In [0]:
# write_to_database(laptimes_final_df, "f1_processed", "lap_times", "race_id")

In [0]:
# laptimes_final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_processed.lap_times")

In [0]:
%sql
-- Sanity check after the load: row counts for the five highest race_id values.
SELECT race_id, COUNT(1) FROM f1_processed.lap_times GROUP BY race_id ORDER BY race_id DESC LIMIT 5;

race_id,count(1)
1053,1124
1052,1026
1047,1043
1046,1531
1045,1016


In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")