### Ingest the lap_times folder

##### Read the multiple CSV files from the folder using the Spark DataFrame reader

In [0]:
# Notebook parameter: label of the data source, later stamped onto every row.
# Defaults to an empty string when the caller supplies nothing.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter: name of the dated sub-folder to ingest; it is also
# stamped onto every row as file_date. Defaults to "2021-03-28".
dbutils.widgets.text("p_file_date", "2021-03-28")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType, FloatType, TimestampType
from pyspark.sql.functions import col, current_timestamp, lit, to_timestamp,concat

In [0]:
# Explicit schema for the lap-times CSV files: five integer columns plus the
# lap time kept as a string. Every field is declared nullable.
laptimes_schema = StructType(
    fields=[
        StructField("raceId", IntegerType(), True),
        StructField("driverId", IntegerType(), True),
        StructField("lap", IntegerType(), True),
        StructField("position", IntegerType(), True),
        StructField("time", StringType(), True),
        StructField("milliseconds", IntegerType(), True),
    ]
)

In [0]:
# Read everything under this run's dated lap_times folder as CSV, applying the
# explicit schema rather than inferring one.
# Another way: .csv("dbfs:/mnt/formula1dlstudy/raw/lap_times/lap_times_split*.csv")
laptimes_df = (
    spark.read
    .schema(laptimes_schema)
    .csv(f"{raw_inc_folder_path}/{v_file_date}/lap_times")
)

##### Rename the id columns and add the ingestion_date, datasource and file_date columns

In [0]:
# Rename the camelCase id columns to snake_case, then stamp each row with the
# ingestion timestamp and the two notebook parameters (data source, file date).
laptimes_final_df = (
    laptimes_df
    .withColumnRenamed("raceId", "race_id")
    .withColumnRenamed("driverId", "driver_id")
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("datasource", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
)

##### Write the result to the processed layer as a Delta table (merge)

In [0]:
# Disabled alternative write path (overwrite_partition), kept for reference:
#overwrite_partition(laptimes_final_df, "f1_processed_incremental", "laptimes", "race_id")

In [0]:
# Match rows on (race_id, driver_id, lap). Each key column appears exactly once
# in the condition; race_id is also passed as the final argument.
# NOTE(review): merge_delta_data comes from ../includes/common_functions and is
# presumably a Delta MERGE using the tgt/src aliases, with race_id as the
# partition column - confirm there.
merge_condition = "tgt.race_id = src.race_id AND tgt.driver_id = src.driver_id AND tgt.lap = src.lap"
merge_delta_data(laptimes_final_df, "f1_processed_incremental", "laptimes", processed_inc_folder_path, merge_condition, "race_id")

In [0]:
%sql
-- Inspect the full contents of the processed lap-times table.
SELECT *
FROM f1_processed_incremental.laptimes

In [0]:
# Preview the same data by loading the Delta files straight from their path
# instead of going through the table name.
display(
    spark.read
    .format("delta")
    .load(f"{processed_inc_folder_path}/laptimes")
)

In [0]:
%sql
-- Count of non-null race_id values for each race in the processed table.
SELECT race_id, count(race_id)
FROM f1_processed_incremental.laptimes
GROUP BY race_id


In [0]:
# End the notebook run, passing the string "Success" as its exit value.
# NOTE(review): presumably read by an orchestrating notebook as the run status - confirm against the caller.
dbutils.notebook.exit("Success")