In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
# Notebook parameter: the file date of the load being processed.
# The widget is declared with a default value and then read back, so the
# notebook works both interactively and when a caller supplies p_file_date.
_file_date_widget = "p_file_date"
dbutils.widgets.text(_file_date_widget, "2021-03-21")
v_file_date = dbutils.widgets.get(_file_date_widget)

In [0]:
from pyspark.sql.functions import current_timestamp

In [0]:
from pyspark.sql.functions import sum, countDistinct 

In [0]:
# Processed drivers (Delta), with the generic column names renamed to the
# driver_* names that the final select uses.
drivers_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/drivers")
    .withColumnRenamed("name", "driver_name")
    .withColumnRenamed("number", "driver_number")
    .withColumnRenamed("nationality", "driver_nationality")
)

In [0]:
# Processed circuits (Delta); "location" becomes "circuit_location" for the final select.
circuits_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/circuits")
    .withColumnRenamed("location", "circuit_location")
)

In [0]:
# Processed constructors (Delta); the constructor "name" is exposed as "team".
# NOTE(review): the variable name is misspelled ("contructors") but is kept
# because the join cell further down refers to it by this exact name.
contructors_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/constructors")
    .withColumnRenamed("name", "team")
)

In [0]:
# Processed races (Delta), renaming "name" -> "race_name" and
# "race_timestamp" -> "race_date" to match the final select.
races_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/races")
    .withColumnRenamed("name", "race_name")
    .withColumnRenamed("race_timestamp", "race_date")
)

In [0]:
# Processed results (Delta), restricted to the file date being processed and
# renamed to the names used by the joins and the final select below.
#
# The date filter is expressed as a Column comparison instead of an f-string
# SQL expression: the widget value is then always treated as a literal, so a
# quote (or any other SQL syntax) in p_file_date can no longer break or alter
# the predicate.
# NOTE(review): this compares file_date to the raw widget string, exactly as
# the previous SQL literal did — confirm the stored format matches.
results_source_df = spark.read.format("delta").load(f"{processed_folder_path}/results")

results_df = (
    results_source_df
    .filter(results_source_df["file_date"] == v_file_date)
    .withColumnRenamed("time", "race_time")
    .withColumnRenamed("race_id", "result_race_id")
    .withColumnRenamed("file_date", "result_file_date")
)


## Join circuits to races

In [0]:
# Inner-join races to their circuit on circuit_id and keep only the race
# attributes plus the circuit location.
race_circuits_df = (
    races_df
    .join(circuits_df, races_df["circuit_id"] == circuits_df["circuit_id"], "inner")
    .select(
        races_df["race_id"],
        races_df["race_year"],
        races_df["race_name"],
        races_df["race_date"],
        circuits_df["circuit_location"],
    )
)

## Join results to race/circuit data, drivers and constructors

In [0]:
# Enrich each result row with its race/circuit, driver and constructor.
# No join type is given, so all three joins use Spark's default (inner).
race_results_df = (
    results_df
    .join(race_circuits_df, results_df["result_race_id"] == race_circuits_df["race_id"])
    .join(drivers_df, results_df["driver_id"] == drivers_df["driver_id"])
    .join(contructors_df, results_df["constructor_id"] == contructors_df["constructor_id"])
)

In [0]:
# Columns of the presentation table, in output order. "result_file_date" is
# selected under its joined name and renamed back to "file_date" afterwards.
race_result_columns = [
    "race_year",
    "race_name",
    "race_id",
    "race_date",
    "circuit_location",
    "driver_name",
    "driver_number",
    "driver_nationality",
    "team",
    "grid",
    "points",
    "fastest_lap",
    "race_time",
    "position",
    "result_file_date",
]

# Project the joined data, stamp each row with the load time, and restore the
# file_date column name.
final_df = (
    race_results_df
    .select(race_result_columns)
    .withColumn("created_date", current_timestamp())
    .withColumnRenamed("result_file_date", "file_date")
)


In [0]:
%sql
--DROP TABLE f1_presentation.race_results;


## Write to the database

In [0]:
# final_df.write.mode("overwrite").parquet(f"{presentation_folder_path}/race_results")

In [0]:
# final_df.write.mode("overwrite").format("parquet").saveAsTable("f1_presentation.race_results")

In [0]:
# Reorder the columns so that "race_id" comes last before writing.
# move_column_to_last is not defined in this notebook; presumably it comes from
# the %run of ../includes/common_functions — TODO confirm.
# NOTE(review): race_id is also the last argument passed to merge_delta_data
# below, so this looks like it prepares the partition column — verify.
final_df = move_column_to_last(final_df, "race_id")

In [0]:
# Match key between the existing table rows (tgt) and the incoming rows (src):
# same driver name within the same race.
merge_condition = (
    "tgt.driver_name = src.driver_name"
    " AND tgt.race_id = src.race_id"
)

# merge_delta_data is defined outside this notebook; presumably it upserts
# final_df into f1_presentation.race_results under presentation_folder_path,
# using race_id as the partition column — confirm against the helper.
merge_delta_data(
    final_df,
    'f1_presentation',
    'race_results',
    presentation_folder_path,
    merge_condition,
    'race_id',
)

In [0]:
# write_to_database(final_df, "f1_presentation", "race_results", "race_id")

In [0]:
%sql
-- Inspect the presentation table after the merge, newest file_date first.
SELECT * FROM f1_presentation.race_results ORDER BY file_date DESC;