In [0]:
%run "../includes/configuration"

In [0]:
from pyspark.sql.functions import col
from pyspark.sql.functions import current_timestamp

In [0]:
# Races from the processed layer (Delta), projected to the columns this
# notebook uses and renamed so they stay distinguishable after the joins.
races_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/races")
    .select(
        col("race_id"),
        col("race_year"),
        col("name").alias("race_name"),
        col("race_timestamp").alias("race_date"),
        col("circuit_id"),  # join key to circuits
    )
)

In [0]:
# Circuits from the processed layer (Delta): the join key plus the location,
# exposed under the name `circuit_location`.
circuits_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/circuits")
    .select(
        col("circuit_id"),
        col("location").alias("circuit_location"),
    )
)

In [0]:
# Drivers from the processed layer (Delta). The descriptive columns get a
# `driver_` prefix so they do not collide with other tables' `name` columns.
drivers_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/drivers")
    .select(
        col("driver_id"),
        col("name").alias("driver_name"),
        col("number").alias("driver_number"),
        col("nationality").alias("driver_nationality"),
    )
)

In [0]:
# Constructors from the processed layer (Delta); the constructor name is
# exposed as `team`.
constructors_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/constructors")
    .select(
        col("constructor_id"),
        col("name").alias("team"),
    )
)

In [0]:
# Results from the processed layer (Delta): the three foreign keys used by the
# joins below plus the per-result measures; `time` is exposed as `race_time`.
results_df = (
    spark.read.format("delta")
    .load(f"{processed_folder_path}/results")
    .select(
        col("driver_id"),
        col("constructor_id"),
        col("race_id"),
        col("result_id"),
        col("grid"),
        col("fastest_lap"),
        col("time").alias("race_time"),
        col("points"),
        col("position"),
    )
)

In [0]:
# Attach each race's circuit location. Joining on the shared column name
# (inner equi-join on circuit_id) keeps a single key column; the projection
# then drops it, leaving only race-level attributes.
races_circuits_df = (
    races_df
    .join(circuits_df, on="circuit_id", how="inner")
    .select("race_id", "race_year", "race_name", "race_date", "circuit_location")
)

In [0]:
# Enrich every result row with its driver's details (inner equi-join on
# driver_id). constructor_id and race_id are kept because the next two joins
# need them; driver_id and result_id are dropped here.
results_drivers_df = (
    results_df
    .join(drivers_df, on="driver_id", how="inner")
    .select(
        "constructor_id",
        "race_id",
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
        "driver_name",
        "driver_number",
        "driver_nationality",
    )
)

In [0]:
# Add the team name to each result (inner equi-join on constructor_id) and
# drop the constructor key; race_id stays for the final join with races.
result_constructor_df = (
    results_drivers_df
    .join(constructors_df, on="constructor_id", how="inner")
    .select(
        "race_id",
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
    )
)

In [0]:
# Final presentation frame: race/circuit attributes combined with the
# driver/team/result attributes (inner equi-join on race_id), stamped with the
# load time and sorted by points, highest first.
races_result_df = (
    races_circuits_df
    .join(result_constructor_df, on="race_id", how="inner")
    .select(
        "race_year",
        "race_name",
        "race_date",
        "circuit_location",
        "driver_name",
        "driver_number",
        "driver_nationality",
        "team",
        "grid",
        "fastest_lap",
        "race_time",
        "points",
        "position",
    )
    .withColumn("created_date", current_timestamp())
    .orderBy(col("points").desc())
)

In [0]:
# Preview the joined result in the notebook output before it is persisted.
display(races_result_df)

In [0]:
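# Superseded: earlier path-based Parquet write to the presentation folder,
# kept for reference only (the table write in the next cell is the active one).
# races_result_df.write.mode("overwrite").parquet(f"{presentation_folder_path}/races_result")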

In [0]:
# Persist the result as the table f1_presentation.races_result, replacing any
# existing contents on every run.
# NOTE(review): the inputs in this notebook are read as Delta while this table
# is written as Parquet — confirm the format difference is intentional.
(
    races_result_df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("f1_presentation.races_result")
)

In [0]:
%sql
-- Read the persisted table back to check the write from the previous cell.
SELECT * FROM f1_presentation.races_result