### Ingest races.csv file

##### Read the CSV file using the Spark DataFrame reader

In [0]:
# Declare a text widget so a caller can pass in the data-source label
# (blank when not supplied), then read its current value.
data_source_widget = "p_data_source"
dbutils.widgets.text(data_source_widget, "")
v_data_source = dbutils.widgets.get(data_source_widget)

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType
from pyspark.sql.functions import col, current_timestamp, lit, to_timestamp,concat

In [0]:
# Explicit schema for races.csv as (column name, Spark type, nullable) triples.
# "date" and "time" are read as plain strings; they are combined and parsed
# into a timestamp further down.
race_columns = [
    ("raceId", IntegerType(), False),
    ("year", IntegerType(), True),
    ("round", IntegerType(), True),
    ("circuitId", IntegerType(), True),
    ("name", StringType(), True),
    ("date", StringType(), True),
    ("time", StringType(), True),
    ("url", StringType(), True),
]

races_schema = StructType(
    fields=[StructField(name, data_type, nullable) for name, data_type, nullable in race_columns]
)

In [0]:
# Load races.csv from the raw folder, treating the first row as a header and
# applying the explicit schema instead of inferring one.
races_csv_path = f"{raw_folder_path}/races.csv"

races_df = (
    spark.read
    .schema(races_schema)
    .option("header", "true")
    .csv(races_csv_path)
)

In [0]:
# "<date> <time>" text assembled from the two string columns, ready for parsing.
race_datetime_text = concat(col("date"), lit(' '), col("time"))

# Add audit/derived columns: load time, the parsed race timestamp, and the
# data-source label supplied through the notebook widget.
races_with_timestamp_df = (
    races_df
    .withColumn("ingestion_date", current_timestamp())
    .withColumn("race_timestamp", to_timestamp(race_datetime_text, 'yyyy-MM-dd HH:mm:ss'))
    .withColumn("datasource", lit(v_data_source))
)
                                

##### Select the columns

In [0]:
# Keep only the columns needed downstream; the raw date, time and url columns
# are dropped by not being listed here.
selected_columns = [
    "raceId",
    "year",
    "round",
    "circuitId",
    "name",
    "race_timestamp",
    "ingestion_date",
    "datasource",
]
races_selected_df = races_with_timestamp_df.select(*selected_columns)

In [0]:
# Source column name -> snake_case name used in the output table.
column_renames = {
    "raceId": "race_id",
    "year": "race_year",
    "circuitId": "circuit_id",
}

# Start from the selected frame each time so re-running the cell is idempotent.
races_final_df = races_selected_df
for old_name, new_name in column_renames.items():
    races_final_df = races_final_df.withColumnRenamed(old_name, new_name)
                                          

In [0]:
# Preview the final DataFrame (renamed columns plus audit fields) before writing it out.
display(races_final_df)

##### Write the result as a parquet file

In [0]:
# Disabled: path-based parquet write straight into the processed folder.
#races_final_df.write.mode("overwrite").parquet(f"{processed_folder_path}/races")

In [0]:
# Persist the final DataFrame as the f1_processed.races table in parquet
# format, replacing any existing contents.
(
    races_final_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("f1_processed.races")
)

> In the result we see part-00000 below, because we didn't partition the data.

In [0]:
# Disabled: list the files under the processed races folder (uncomment both lines to inspect).
#%fs
#ls /mnt/formula1dlstudy/processed/races

In [0]:
# Read the parquet files back from the processed folder and show them.
# NOTE(review): this assumes the f1_processed.races table is stored under
# {processed_folder_path}/races — confirm against the database location.
races_written_df = spark.read.parquet(f"{processed_folder_path}/races")
display(races_written_df)

In [0]:
%sql
-- Query the f1_processed.races table to check what the notebook wrote.
SELECT * FROM f1_processed.races

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")