In [0]:
# Notebook parameter: declare a text widget named "p_file_date" (default
# "2021-03-21") and read its current value. v_file_date is used further down
# to restrict the results data to a single file date.
dbutils.widgets.text("p_file_date","2021-03-21")
v_file_date = dbutils.widgets.get("p_file_date")

In [0]:
%run "../includes/configuration"

In [0]:
%run "../includes/common_functions"

In [0]:
from pyspark.sql.functions import col,lit
from pyspark.sql.functions import current_timestamp

In [0]:
# Races: keep only the columns needed downstream and give them join-safe names
# (race_id -> races_race_id so it cannot be confused with the results-side id).
races_df = (
    spark.read.format("delta")
    .load(f"{processed_inc_folder_path}/races")
    .select(
        col('race_id').alias('races_race_id'),
        'race_year',
        col('name').alias('race_name'),
        col('race_timestamp').alias('race_date'),
        'circuit_id',
    )
)

In [0]:
# Circuits: only the join key and the location, exposed as circuit_location.
circuits_df = (
    spark.read.format("delta")
    .load(f"{processed_inc_folder_path}/circuits")
    .select(
        'circuit_id',
        col('location').alias('circuit_location'),
    )
)

In [0]:
# Drivers: join key plus descriptive attributes, prefixed with "driver_" so the
# generic column names (name, number, nationality) stay unambiguous after joins.
drivers_df = (
    spark.read.format("delta")
    .load(f"{processed_inc_folder_path}/drivers")
    .select(
        'driver_id',
        col('name').alias('driver_name'),
        col('number').alias('driver_number'),
        col('nationality').alias('driver_nationality'),
    )
)

In [0]:
# Constructors: join key plus the constructor name, exposed as "team".
constructors_df = (
    spark.read.format("delta")
    .load(f"{processed_inc_folder_path}/constructors")
    .select(
        'constructor_id',
        col('name').alias('team'),
    )
)

In [0]:
# Results for the file date being processed (incremental slice).
# The widget value is compared as a literal through the Column API instead of
# being interpolated into a SQL expression string, so a quote or other SQL
# syntax inside the parameter can neither break nor alter the predicate.
results_df = (
    spark.read.format("delta")
    .load(f"{processed_inc_folder_path}/results")
    .filter(col("file_date") == v_file_date)
    .select(
        'driver_id', 'constructor_id', 'race_id', 'result_id', 'grid',
        'fastest_lap', 'time', 'points', 'position',
        # Carried through the joins under a distinct name; renamed back to
        # file_date when the final frame is assembled.
        col('file_date').alias('result_file_date'),
    )
    .withColumnRenamed('time', 'race_time')
    .withColumnRenamed('race_id', 'result_race_id')
)

In [0]:
# Attach the circuit location to every race (inner join on circuit_id), then
# drop the join key, which is no longer needed.
races_circuits_df = (
    races_df
    .join(
        circuits_df,
        on=races_df['circuit_id'] == circuits_df['circuit_id'],
        how='inner',
    )
    .select(
        'races_race_id',
        'race_year',
        'race_name',
        'race_date',
        'circuit_location',
    )
)

In [0]:
# Enrich each result row with its driver's attributes (inner join on driver_id).
# constructor_id is kept because the next join still needs it.
results_drivers_df = (
    results_df
    .join(
        drivers_df,
        on=results_df['driver_id'] == drivers_df['driver_id'],
        how='inner',
    )
    .select(
        'constructor_id',
        'result_race_id',
        'grid',
        'fastest_lap',
        'race_time',
        'points',
        'position',
        'driver_name',
        'driver_number',
        'driver_nationality',
        'result_file_date',
    )
)

In [0]:
# Add the team name to each result row (inner join on constructor_id) and drop
# the constructor key afterwards.
result_constructor_df = (
    results_drivers_df
    .join(
        constructors_df,
        on=results_drivers_df['constructor_id'] == constructors_df['constructor_id'],
        how='inner',
    )
    .select(
        'result_race_id',
        'grid',
        'fastest_lap',
        'race_time',
        'points',
        'position',
        'driver_name',
        'driver_number',
        'driver_nationality',
        'team',
        'result_file_date',
    )
)

In [0]:
# Final presentation frame: race/circuit attributes joined to the enriched
# results on race id. The results-side id is published as race_id and the
# carried-through file date as file_date; created_date stamps the load time.
# Rows are ordered by points, highest first.
races_result_df = (
    races_circuits_df
    .join(
        result_constructor_df,
        on=races_circuits_df['races_race_id'] == result_constructor_df['result_race_id'],
        how='inner',
    )
    .select(
        'race_year',
        'race_name',
        'race_date',
        'circuit_location',
        'driver_name',
        'driver_number',
        'driver_nationality',
        'team',
        'grid',
        'fastest_lap',
        'race_time',
        'points',
        'position',
        result_constructor_df['result_race_id'].alias('race_id'),
        col('result_file_date').alias('file_date'),
    )
    .withColumn('created_date', current_timestamp())
    .orderBy(col('points').desc())
)

In [0]:
# Preview the assembled frame before it is written.
display(races_result_df)

In [0]:
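# Disabled: earlier partition-overwrite call, kept for reference only.
# The next cell calls merge_delta_data for the same table instead.
#overwrite_partition(races_result_df, "f1_presentation_incrementals", "races_result", "race_id")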

In [0]:
# Match predicate handed to merge_delta_data: a row is identified by the pair
# (driver_name, race_id); "tgt" and "src" are the aliases the predicate uses
# for the two sides of the merge.
# NOTE(review): merge_delta_data is not defined in this notebook — presumably it
# comes from ../includes/common_functions and aliases the existing table as
# "tgt" and the incoming frame as "src"; confirm against that helper.
merge_condition =  "tgt.driver_name = src.driver_name AND tgt.race_id = src.race_id"
# Arguments, in order: frame to write, database name, table name, base folder
# path, match predicate, and "race_id" (presumably the partition column — verify
# against the helper's signature).
merge_delta_data(races_result_df, "f1_presentation_incrementals", "races_result", presentation_inc_folder_path, merge_condition, "race_id")

In [0]:
%sql
-- Inspect the full contents of the presentation table after the write.
SELECT * FROM f1_presentation_incrementals.races_result

In [0]:
%sql
-- Row count per race_id in the presentation table.
SELECT
  race_id,
  count(race_id)
FROM f1_presentation_incrementals.races_result
GROUP BY
  race_id

In [0]:
# Read the Delta data directly from its storage path (rather than through the
# table name used in the SQL cells above) and display it.
display(spark.read.format("delta").load(f"{presentation_inc_folder_path}/races_result"))