#### Ingest races.csv file

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

#### Step 1 - Read the CSV file using the Spark DataFrame reader

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, DateType

# Explicit schema applied when reading races.csv, so column types are declared
# up front instead of inferred. raceId and circuitId are declared non-nullable;
# every other column is declared nullable.
races_schema = (
    StructType()
    .add("raceId", IntegerType(), False)
    .add("year", IntegerType(), True)
    .add("round", IntegerType(), True)
    .add("circuitId", IntegerType(), False)
    .add("name", StringType(), True)
    .add("date", DateType(), True)
    .add("time", StringType(), True)  # kept as text; combined with "date" later
    .add("url", StringType(), True)
)

In [0]:
# Load races.csv from the raw folder: the first row is treated as a header
# and the explicit races_schema is applied to the columns.
races_df = spark.read.csv(
    f"{raw_folder_path}/races.csv",
    schema=races_schema,
    header=True,
)

In [0]:
# Render the freshly read races DataFrame for a visual check of the parsed columns.
display(races_df)

#### Step 2 - Add ingestion date and race_timestamp to the DataFrame

In [0]:
from pyspark.sql.functions import to_timestamp, concat, lit, col

# add_ingestion_date comes from the included common_functions notebook;
# presumably it appends the ingestion_date column selected later - TODO confirm.
races_with_ingestion = add_ingestion_date(races_df)

# Join the date and time columns into "<date> <time>" text, then parse that
# text into a single timestamp column named race_timestamp.
race_datetime_text = concat(col("date"), lit(" "), col("time"))
races_with_timestamp = races_with_ingestion.withColumn(
    "race_timestamp",
    to_timestamp(race_datetime_text, 'yyyy-MM-dd HH:mm:ss'),
)

In [0]:
# Render the DataFrame after the race_timestamp column has been added.
display(races_with_timestamp)

In [0]:
# Final projection: keep only the columns written out below, renaming the
# camelCase source columns to snake_case. The url, date and time columns are
# not carried forward (date/time are represented by race_timestamp).
output_columns = [
    col("raceId").alias("race_id"),
    col("year").alias("race_year"),
    col("round"),
    col("circuitId").alias("circuit_id"),
    col("name"),
    col("ingestion_date"),
    col("race_timestamp"),
]
races_selected_df = races_with_timestamp.select(*output_columns)
display(races_selected_df)

In [0]:
# Persist the selected races data as the Parquet table f1_processed.races,
# partitioned by race_year. "overwrite" replaces any existing table contents,
# so re-running the notebook does not append duplicate rows.
(
    races_selected_df.write
    .mode('overwrite')
    .partitionBy('race_year')
    .format("parquet")
    .saveAsTable("f1_processed.races")
)

# Read the result back through the table name instead of a hard-coded mount
# path, so this check always targets what saveAsTable just wrote, regardless
# of where the f1_processed database is stored.
display(spark.read.table("f1_processed.races"))

In [0]:
%fs
ls /mnt/formulaonedatalk/processed/races

In [0]:
# Read the Parquet files directly from the processed mount path (the same
# directory listed by the %fs cell above) and render them as a file-level check.
# NOTE(review): the path is hard-coded; it assumes the f1_processed database
# is located under /mnt/formulaonedatalk/processed - confirm.
display(spark.read.parquet("/mnt/formulaonedatalk/processed/races"))

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")