In [0]:
dbutils.widgets.text("p_data_source","")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
%run "/Formula1/includes/configuration"

In [0]:
schema = "race_id int, race_year int, round int, circuit_id int, name String, date String, time String"

file_path = f'{raw_folder_path}/races.csv'
race_df = spark.read.format('csv').schema(schema).option('header',True).load(file_path)

race_df.show(3)

+-------+---------+-----+----------+--------------------+----------+--------+
|race_id|race_year|round|circuit_id|                name|      date|    time|
+-------+---------+-----+----------+--------------------+----------+--------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29|06:00:00|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05|09:00:00|
|      3|     2009|    3|        17|  Chinese Grand Prix|2009-04-19|07:00:00|
+-------+---------+-----+----------+--------------------+----------+--------+
only showing top 3 rows



Out[13]: [FileInfo(path='dbfs:/mnt/formularacedata/presentation/', name='presentation/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/formularacedata/processed/', name='processed/', size=0, modificationTime=0),
 FileInfo(path='dbfs:/mnt/formularacedata/raw/', name='raw/', size=0, modificationTime=0)]

Transformation of race data

In [0]:
from pyspark.sql.functions import to_timestamp,concat, current_timestamp,concat_ws
race_df_transformed = race_df.select('race_id','race_year', 'round', 'circuit_id', 'name', to_timestamp(concat_ws(" ",race_df.date,race_df.time)).alias('race_timestamp'), current_timestamp().alias('ingestion_date') )
race_df_transformed.show(3)

+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|race_id|race_year|round|circuit_id|                name|     race_timestamp|      ingestion_date|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+
|      1|     2009|    1|         1|Australian Grand ...|2009-03-29 06:00:00|2023-06-29 19:13:...|
|      2|     2009|    2|         2|Malaysian Grand Prix|2009-04-05 09:00:00|2023-06-29 19:13:...|
|      3|     2009|    3|        17|  Chinese Grand Prix|2009-04-19 07:00:00|2023-06-29 19:13:...|
+-------+---------+-----+----------+--------------------+-------------------+--------------------+
only showing top 3 rows



ingest race data in process folder with partition on race_year and in parquet format

In [0]:
display(dbutils.fs.ls('/mnt/formularacedata/'))

path,name,size,modificationTime
dbfs:/mnt/formularacedata/presentation/,presentation/,0,0
dbfs:/mnt/formularacedata/processed/,processed/,0,0
dbfs:/mnt/formularacedata/raw/,raw/,0,0


In [0]:
race_df_transformed.write.mode("overwrite").partitionBy('race_year').format("delta").saveAsTable("f1_processed.races")

In [0]:
%sql

SELECT * FROM f1_processed.races LIMIT 5

race_id,race_year,round,circuit_id,name,race_timestamp,ingestion_date
833,1950,1,9,British Grand Prix,,2023-06-29T19:13:02.063+0000
834,1950,2,6,Monaco Grand Prix,,2023-06-29T19:13:02.063+0000
835,1950,3,19,Indianapolis 500,,2023-06-29T19:13:02.063+0000
836,1950,4,66,Swiss Grand Prix,,2023-06-29T19:13:02.063+0000
837,1950,5,13,Belgian Grand Prix,,2023-06-29T19:13:02.063+0000


In [0]:
dbutils.notebook.exit("success")

success