
## Ingest drivers.json file

Path : dbfs:/mnt/formula1dlsaga/raw/drivers.json

In [0]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType, DoubleType, TimestampType
from pyspark.sql.functions import col, lit, to_timestamp, current_timestamp, concat

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter "p_data_source": a free-text widget defaulting to an empty string.
# Its value is written to every row as the data_source column later in this notebook.
dbutils.widgets.text("p_data_source", "")
v_data_source = dbutils.widgets.get("p_data_source")

In [0]:
# Notebook parameter "p_file_date" (default "2021-03-21"). The value is used below both as the
# sub-folder of the raw path that is read and as the literal stored in the file_date column.
dbutils.widgets.text("p_file_date", "2021-03-21")
v_file_date= dbutils.widgets.get("p_file_date")


In [0]:
# Schema of the nested "name" object of a driver record: two nullable string members.
name_schema = (
    StructType()
    .add("forename", StringType(), True)
    .add("surname", StringType(), True)
)

In [0]:
# Explicit schema for drivers.json. Only driverId is declared non-nullable;
# "name" is a nested struct described by name_schema.
drivers_schema = (
    StructType()
    .add("driverId", IntegerType(), False)
    .add("driverRef", StringType(), True)
    .add("number", IntegerType(), True)
    .add("code", StringType(), True)
    .add("name", name_schema, True)
    .add("dob", DateType(), True)
    .add("nationality", StringType(), True)
    .add("url", StringType(), True)
)

In [0]:
# Load the drivers file from the folder of the requested file date, applying the
# explicit schema rather than letting Spark infer one.
drivers_df = (
    spark.read
    .schema(drivers_schema)
    .json(f"{raw_folder_path}/{v_file_date}/drivers.json")
)

In [0]:
# Shape the raw frame for the processed layer:
#   - rename the camelCase id/ref columns to snake_case,
#   - collapse the nested name struct into a single "forename surname" string,
#   - tag each row with the data source and file date notebook parameters,
#   - drop the url column.
drivers_with_conlumns_df_func = (
    drivers_df
    .withColumnRenamed("driverId", "driver_id")
    .withColumnRenamed("driverRef", "driver_ref")
    .withColumn(
        "name",
        concat(
            col("name").getField("forename"),
            lit(" "),
            col("name").getField("surname"),
        ),
    )
    .withColumn("data_source", lit(v_data_source))
    .withColumn("file_date", lit(v_file_date))
    .drop("url")
)

In [0]:
# Pass the transformed frame through add_ingestion_date. That helper is not defined in this
# notebook — presumably it comes from the %run of ../includes/common_functions and adds an
# ingestion timestamp column; confirm against that notebook.
drivers_with_conlumns_df = add_ingestion_date(drivers_with_conlumns_df_func)

In [0]:
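# Legacy Parquet write, kept for reference only; the Delta table write in the next cell is the active sink.
# drivers_with_conlumns_df.write.parquet(f"{processed_folder_path}/drivers", mode="overwrite")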

In [0]:
# Persist the result as the Delta table f1_processed.drivers, overwriting any existing data.
(
    drivers_with_conlumns_df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("f1_processed.drivers")
)

In [0]:
%sql
-- Sanity check: query the table written by the previous cell.
SELECT * FROM f1_processed.drivers;

In [0]:
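# Legacy read-back of the Parquet output, kept for reference only (the Parquet write above is disabled).
# df = spark.read.parquet(f"{processed_folder_path}/drivers")
# display(df)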

In [0]:
# End the notebook run and hand the string "Success" back as its exit value.
dbutils.notebook.exit("Success")