## Developing dataset for BBC Formula 1 race records

In [0]:
from pyspark.sql.functions import col,expr,current_timestamp,lit

In [0]:
%run ../configurations/paths_config

In [0]:
%run ../utils/etl_support_functions

In [0]:
# Declare the "p_file_date" text widget (default "2021-03-21") and read its
# current value into v_file_date for use further down the notebook.
file_date_widget = "p_file_date"
dbutils.widgets.text(file_date_widget, "2021-03-21")
v_file_date = dbutils.widgets.get(file_date_widget)

In [0]:
%sql
-- Create the gold schema inside the vsarthicat catalog, which is where this
-- notebook reads and writes its tables. An unqualified name would be created
-- in whatever catalog the session currently defaults to.
CREATE DATABASE IF NOT EXISTS vsarthicat.formula1_gold;

In [0]:
%run ../configurations/paths_config

In [0]:
# Load the drivers table from the formula1_silver schema.
driver_df = spark.table("vsarthicat.formula1_silver.drivers")

In [0]:
# Load the circuits table from the formula1_silver schema.
circuits_df = spark.table("vsarthicat.formula1_silver.circuits")

In [0]:
# Load the races table and add race_date: the first 10 characters of
# race_timestamp (presumably the yyyy-MM-dd part -- confirm the format).
race_date_expr = expr("substring(race_timestamp, 1, 10)")
race_df = spark.table("vsarthicat.formula1_silver.races").withColumn("race_date", race_date_expr)

In [0]:
# Load the results table from the formula1_silver schema.
results_df = spark.table("vsarthicat.formula1_silver.results")

In [0]:
# Load the constructors table from the formula1_silver schema.
constructors_df = spark.table("vsarthicat.formula1_silver.constructors")

### Data to be picked

- race_year->races
- race_name->races
- race_date->races
- circuit_name->circuits
- driver_name->drivers
- driver_number->drivers
- driver_nationality->drivers
- grid->results
- team->constructors
- fastest_lap->results
- race_time->results
- points->results


In [0]:
# Columns kept from the joined data. Name clashes between the source tables
# (e.g. "name") are resolved by referencing the owning DataFrame and aliasing.
selected_columns = [
    race_df.year.alias("race_year"),
    race_df.name.alias("race_name"),
    race_df.race_date,
    circuits_df.name.alias("circuit_name"),
    driver_df.name.alias("driver_name"),
    driver_df.number.alias("driver_number"),
    driver_df.nationality.alias("driver_nationality"),
    constructors_df.name.alias("team"),
    col("grid"),
    col("fastest_lap"),
    results_df.time.alias("race_time"),
    col("position"),
    col("points"),
    col("result_id"),
    col("race_id"),
]

# Start from results and join races, circuits, drivers and constructors on
# their id columns, then project down to the columns listed above.
joint_df = (
    results_df
    .join(race_df, "race_id")
    .join(circuits_df, "circuit_id")
    .join(driver_df, "driver_id")
    .join(constructors_df, "constructor_id")
    .select(*selected_columns)
)

In [0]:
# Keep only Nico Rosberg's rows for the 2012 season.
# NOTE(review): this hard-coded filter looks like a leftover from testing the
# SCD2 merge on a small sample -- confirm whether it should remain.
is_rosberg = col("driver_name") == "Nico Rosberg"
is_2012_season = col("race_year") == 2012
joint_df = joint_df.filter(is_rosberg & is_2012_season)

## SCD2 Implementation

In [0]:
# Build the merge source: the joined data plus bookkeeping columns.
#   file_date         - the p_file_date widget value for this run
#   current_timestamp - the time this row version was produced
#   ending_timestamp  - left NULL here; presumably set when a row version is
#                       closed out by the SCD2 logic (confirm)
merge_src_df = (
    joint_df
    .withColumn("file_date", lit(v_file_date))
    .withColumn("current_timestamp", current_timestamp())
    # Use a real, typed NULL. lit("NULL") would store the four-character
    # string "NULL", which IS NULL / isNull() checks treat as a value.
    .withColumn("ending_timestamp", lit(None).cast("timestamp"))
)

In [0]:
# isTableExists = spark.catalog.tableExists("vsarthicat.formula1_gold.race_results")

In [0]:
# if not isTableExists:


#     merge_src_df.write.mode("overwrite").format("delta").saveAsTable("vsarthicat.formula1_gold.race_results")

# else:
#     joint_df.createOrReplaceGlobalTempView("joint_race_results")
#     dbutils.notebook.run("../utils/store_df_in_scd2_format",600,
#                 {"p_target_table_path": "vsarthicat.formula1_gold.race_results",
#                  "p_src_view": "global_temp.joint_race_results",
#                  "p_merge_key_cols": "result_id,race_id"})
                 

In [0]:
# Merge the prepared rows into vsarthicat.formula1_gold.race_results.
#
# result_id is part of the match so that each source row pairs with at most
# one target row. race_id and race_year alone are shared by every result of a
# race, which would make several source rows match the same target row as soon
# as more than one driver's results are loaded. race_year is kept in the
# condition; it is also the last argument to merge_delta_data (presumably the
# partition column -- confirm against the helper).
merge_condition = (
    "tgt.result_id = src.result_id "
    "AND tgt.race_id = src.race_id "
    "AND tgt.race_year = src.race_year"
)
merge_delta_data(merge_src_df, "vsarthicat.formula1_gold", 'race_results', merge_condition, 'race_year')

In [0]:
# %sql
# DROP TABLE vsarthicat.formula1_gold.race_results;