#  F1 Data Engineering Project - Gold Layer
## Business-Level Aggregations & Analytics Tables

**Gold Tables Created:**
1. **fact_race_results** - Denormalized race results with all dimensions
2. **dim_driver_career** - Driver career statistics
3. **dim_constructor_performance** - Constructor/Team performance metrics
4. **agg_season_standings** - Season championship aggregations
5. **agg_circuit_statistics** - Circuit-level analytics
6. **agg_driver_vs_teammate** - Teammate comparison metrics

## Configuration

In [0]:
# CONFIGURATION

CATALOG_NAME = "f1_dev"
SILVER_SCHEMA = "silver"
GOLD_SCHEMA = "gold"

# Full table references
silver_db = f"{CATALOG_NAME}.{SILVER_SCHEMA}"
gold_db = f"{CATALOG_NAME}.{GOLD_SCHEMA}"

print(f"Catalog: {CATALOG_NAME}")
print(f"Silver Schema: {silver_db}")
print(f"Gold Schema: {gold_db}")

Catalog: f1_dev
Silver Schema: f1_dev.silver
Gold Schema: f1_dev.gold


## Setup Unity Catalog

In [0]:
# Set catalog context
spark.sql(f"USE CATALOG {CATALOG_NAME}")

# Create Gold schema if not exists
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {CATALOG_NAME}.{GOLD_SCHEMA}")
spark.sql(f"USE SCHEMA {GOLD_SCHEMA}")

print(f"Using: {CATALOG_NAME}.{GOLD_SCHEMA}")

Using: f1_dev.gold


## Import Libraries

In [0]:
from pyspark.sql.functions import (
    col, sum, avg, count, max, min, first, last,
    when, rank, dense_rank, row_number, lag, lead,
    collect_list, concat_ws, lit, round as spark_round,
    current_timestamp, year, countDistinct
)
from pyspark.sql.window import Window

## 1. Fact Race Results

A fully denormalized fact table joining all dimensions for easy reporting.

In [0]:
# Read Silver tables
results = spark.table(f"{silver_db}.results")
races = spark.table(f"{silver_db}.races")
drivers = spark.table(f"{silver_db}.drivers")
constructors = spark.table(f"{silver_db}.constructors")
circuits = spark.table(f"{silver_db}.circuits")
status = spark.table(f"{silver_db}.status")

# Create denormalized fact table
fact_race_results = results \
    .join(races, "race_id") \
    .join(drivers, "driver_id") \
    .join(constructors, "constructor_id") \
    .join(circuits, "circuit_id") \
    .join(status, "status_id", "left") \
    .select(
        # Result facts
        results.result_id,
        results.race_id,
        results.driver_id,
        results.constructor_id,
        results.grid_position,
        results.finish_position,
        results.position_text,
        results.points_earned,
        results.laps_completed,
        results.finish_time,
        results.time_milliseconds,
        results.fastest_lap_number,
        results.fastest_lap_rank,
        results.fastest_lap_time,
        results.fastest_lap_speed_kph,
        results.is_winner,
        results.is_podium,
        results.is_points_finish,
        results.positions_gained,
        
        # Race dimensions
        races.race_year,
        races.race_round,
        races.race_name,
        races.race_date,
        
        # Driver dimensions
        drivers.full_name.alias("driver_name"),
        drivers.driver_code,
        drivers.driver_nationality,
        
        # Constructor dimensions
        constructors.constructor_name,
        constructors.constructor_nationality,
        
        # Circuit dimensions
        circuits.circuit_id,
        circuits.circuit_name,
        circuits.circuit_location,
        circuits.circuit_country,
        
        # Status dimensions
        status.status_description,
        status.status_category
    ) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
fact_race_results.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .partitionBy("race_year") \
    .saveAsTable(f"{gold_db}.fact_race_results")

print(f"[OK] fact_race_results: {fact_race_results.count():,} records")
display(fact_race_results.limit(10))

[OK] fact_race_results: 27,238 records


result_id,race_id,driver_id,constructor_id,grid_position,finish_position,position_text,points_earned,laps_completed,finish_time,time_milliseconds,fastest_lap_number,fastest_lap_rank,fastest_lap_time,fastest_lap_speed_kph,is_winner,is_podium,is_points_finish,positions_gained,race_year,race_round,race_name,race_date,driver_name,driver_code,driver_nationality,constructor_name,constructor_nationality,circuit_id,circuit_name,circuit_location,circuit_country,status_description,status_category,updated_at
1,18,1,1,1,1.0,1,10.0,58,1:34:50.616,5690616.0,39,2,1:27.452,218.3,True,True,True,0.0,2008,1,Australian Grand Prix,2008-03-16,Lewis Hamilton,HAM,British,McLaren,British,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Finished,Finished,2025-12-29T18:17:31.600179Z
2,18,2,2,5,2.0,2,8.0,58,+5.478,5696094.0,41,3,1:27.739,217.586,False,True,True,3.0,2008,1,Australian Grand Prix,2008-03-16,Nick Heidfeld,HEI,German,BMW Sauber,German,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Finished,Finished,2025-12-29T18:17:31.600179Z
3,18,3,3,7,3.0,3,6.0,58,+8.163,5698779.0,41,5,1:28.090,216.719,False,True,True,4.0,2008,1,Australian Grand Prix,2008-03-16,Nico Rosberg,ROS,German,Williams,British,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Finished,Finished,2025-12-29T18:17:31.600179Z
4,18,4,4,11,4.0,4,5.0,58,+17.181,5707797.0,58,7,1:28.603,215.464,False,False,True,7.0,2008,1,Australian Grand Prix,2008-03-16,Fernando Alonso,ALO,Spanish,Renault,French,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Finished,Finished,2025-12-29T18:17:31.600179Z
5,18,5,1,3,5.0,5,4.0,58,+18.014,5708630.0,43,1,1:27.418,218.385,False,False,True,-2.0,2008,1,Australian Grand Prix,2008-03-16,Heikki Kovalainen,KOV,Finnish,McLaren,British,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Finished,Finished,2025-12-29T18:17:31.600179Z
6,18,6,3,13,6.0,6,3.0,57,,,50,14,1:29.639,212.974,False,False,True,7.0,2008,1,Australian Grand Prix,2008-03-16,Kazuki Nakajima,NAK,Japanese,Williams,British,1,Albert Park Grand Prix Circuit,Melbourne,Australia,+1 Lap,Lapped,2025-12-29T18:17:31.600179Z
7,18,7,5,17,7.0,7,2.0,55,,,54,8,1:29.534,213.224,False,False,True,10.0,2008,1,Australian Grand Prix,2008-03-16,Sébastien Bourdais,BOU,French,Toro Rosso,Italian,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Engine,Mechanical,2025-12-29T18:17:31.600179Z
8,18,8,6,15,8.0,8,1.0,53,,,20,4,1:27.903,217.18,False,False,True,7.0,2008,1,Australian Grand Prix,2008-03-16,Kimi Räikkönen,RAI,Finnish,Ferrari,Italian,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Engine,Mechanical,2025-12-29T18:17:31.600179Z
9,18,9,2,2,,R,0.0,47,,,15,9,1:28.753,215.1,False,False,False,,2008,1,Australian Grand Prix,2008-03-16,Robert Kubica,KUB,Polish,BMW Sauber,German,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Collision,Accident,2025-12-29T18:17:31.600179Z
10,18,10,7,18,,R,0.0,43,,,23,13,1:29.558,213.166,False,False,False,,2008,1,Australian Grand Prix,2008-03-16,Timo Glock,GLO,German,Toyota,Japanese,1,Albert Park Grand Prix Circuit,Melbourne,Australia,Accident,Accident,2025-12-29T18:17:31.600179Z


## 2. Driver Career Statistics

In [0]:
# Aggregate career statistics per driver
dim_driver_career = fact_race_results \
    .groupBy("driver_id", "driver_name", "driver_code", "driver_nationality") \
    .agg(
        count("result_id").alias("total_races"),
        sum(when(col("is_winner"), 1).otherwise(0)).alias("total_wins"),
        sum(when(col("is_podium"), 1).otherwise(0)).alias("total_podiums"),
        sum(when(col("finish_position") == 2, 1).otherwise(0)).alias("total_second_places"),
        sum(when(col("finish_position") == 3, 1).otherwise(0)).alias("total_third_places"),
        sum(when(col("grid_position") == 1, 1).otherwise(0)).alias("total_pole_positions"),
        sum(when(col("fastest_lap_rank") == 1, 1).otherwise(0)).alias("total_fastest_laps"),
        sum("points_earned").alias("total_career_points"),
        spark_round(avg("finish_position"), 2).alias("avg_finish_position"),
        spark_round(avg("grid_position"), 2).alias("avg_grid_position"),
        spark_round(avg("positions_gained"), 2).alias("avg_positions_gained"),
        sum("laps_completed").alias("total_laps_completed"),
        countDistinct("constructor_id").alias("teams_raced_for"),
        min("race_year").alias("debut_year"),
        max("race_year").alias("last_season"),
        countDistinct("race_year").alias("seasons_competed")
    ) \
    .withColumn("win_percentage", spark_round((col("total_wins") / col("total_races")) * 100, 2)) \
    .withColumn("podium_percentage", spark_round((col("total_podiums") / col("total_races")) * 100, 2)) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
dim_driver_career.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{gold_db}.dim_driver_career")

print(f"[OK] dim_driver_career: {dim_driver_career.count():,} records")
display(dim_driver_career.orderBy(col("total_wins").desc()).limit(20))

[OK] dim_driver_career: 864 records


driver_id,driver_name,driver_code,driver_nationality,total_races,total_wins,total_podiums,total_second_places,total_third_places,total_pole_positions,total_fastest_laps,total_career_points,avg_finish_position,avg_grid_position,avg_positions_gained,total_laps_completed,teams_raced_for,debut_year,last_season,seasons_competed,win_percentage,podium_percentage,updated_at
1,Lewis Hamilton,HAM,British,380,105,202,57,40,104,67,4955.5,3.98,4.59,0.65,21598,3,2007,2025,19,27.63,53.16,2025-12-29T18:17:41.076825Z
30,Michael Schumacher,MSC,German,308,91,155,43,21,68,21,1566.0,3.7,4.87,1.12,16824,4,1991,2012,19,29.55,50.32,2025-12-29T18:17:41.076825Z
830,Max Verstappen,VER,Dutch,233,71,127,37,19,48,36,3301.5,3.56,4.82,1.17,12692,2,2015,2025,11,30.47,54.51,2025-12-29T18:17:41.076825Z
20,Sebastian Vettel,VET,German,300,53,122,36,33,57,38,3098.0,5.28,6.27,0.78,16426,5,2007,2022,16,17.67,40.67,2025-12-29T18:17:41.076825Z
117,Alain Prost,,French,202,51,106,35,20,33,0,798.5,2.96,4.14,0.94,10540,4,1980,1993,13,25.25,52.48,2025-12-29T18:17:41.076825Z
102,Ayrton Senna,,Brazilian,162,41,80,23,16,65,0,614.0,3.15,3.13,-0.3,8236,4,1984,1994,11,25.31,49.38,2025-12-29T18:17:41.076825Z
4,Fernando Alonso,ALO,Spanish,428,32,106,40,34,22,25,2380.0,6.83,8.67,1.48,23084,6,2001,2025,22,7.48,24.77,2025-12-29T18:17:41.076825Z
95,Nigel Mansell,,British,192,31,59,17,11,32,0,482.0,3.92,6.28,1.72,8750,4,1980,1995,15,16.15,30.73,2025-12-29T18:17:41.076825Z
328,Jackie Stewart,,British,100,27,43,11,5,17,0,360.0,2.95,4.5,1.02,5225,4,1965,1973,9,27.0,43.0,2025-12-29T18:17:41.076825Z
182,Niki Lauda,,Austrian,174,25,54,20,9,24,0,420.5,4.26,8.33,3.18,8217,8,1971,1985,13,14.37,31.03,2025-12-29T18:17:41.076825Z


## 3. Constructor Performance Metrics

In [0]:
# Aggregate performance per constructor
dim_constructor_performance = fact_race_results \
    .groupBy("constructor_id", "constructor_name", "constructor_nationality") \
    .agg(
        count("result_id").alias("total_race_entries"),
        sum(when(col("is_winner"), 1).otherwise(0)).alias("total_wins"),
        sum(when(col("is_podium"), 1).otherwise(0)).alias("total_podiums"),
        sum(when(col("grid_position") == 1, 1).otherwise(0)).alias("total_pole_positions"),
        sum("points_earned").alias("total_points"),
        spark_round(avg("finish_position"), 2).alias("avg_finish_position"),
        spark_round(avg("grid_position"), 2).alias("avg_grid_position"),
        countDistinct("driver_id").alias("total_drivers"),
        countDistinct("race_year").alias("seasons_competed"),
        min("race_year").alias("first_season"),
        max("race_year").alias("last_season")
    ) \
    .withColumn("win_rate", spark_round((col("total_wins") / col("total_race_entries")) * 100, 2)) \
    .withColumn("podium_rate", spark_round((col("total_podiums") / col("total_race_entries")) * 100, 2)) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
dim_constructor_performance.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{gold_db}.dim_constructor_performance")

print(f"[OK] dim_constructor_performance: {dim_constructor_performance.count():,} records")
display(dim_constructor_performance.orderBy(col("total_wins").desc()).limit(20))

[OK] dim_constructor_performance: 211 records


constructor_id,constructor_name,constructor_nationality,total_race_entries,total_wins,total_podiums,total_pole_positions,total_points,avg_finish_position,avg_grid_position,total_drivers,seasons_competed,first_season,last_season,win_rate,podium_rate,updated_at
6,Ferrari,Italian,2487,249,848,259,11451.27,4.74,6.26,99,76,1950,2025,10.01,34.1,2025-12-29T18:17:51.25173Z
1,McLaren,British,1971,199,542,176,7797.5,5.87,7.78,55,56,1968,2025,10.1,27.5,2025-12-29T18:17:51.25173Z
131,Mercedes,German,700,131,310,144,8154.639999999999,4.59,5.1,13,18,1954,2025,18.71,44.29,2025-12-29T18:17:51.25173Z
9,Red Bull,Austrian,836,130,297,111,8083.0,5.34,6.6,14,21,2005,2025,15.55,35.53,2025-12-29T18:17:51.25173Z
3,Williams,British,1724,114,315,128,3765.0,8.38,9.27,63,50,1975,2025,6.61,18.27,2025-12-29T18:17:51.25173Z
32,Team Lotus,British,871,45,114,61,995.0,6.52,11.04,61,29,1958,1994,5.17,13.09,2025-12-29T18:17:51.25173Z
4,Renault,French,787,35,103,51,1777.0,7.64,9.15,26,24,1977,2020,4.45,13.09,2025-12-29T18:17:51.25173Z
22,Benetton,Italian,520,27,102,15,861.5,5.86,8.64,17,16,1986,2001,5.19,19.62,2025-12-29T18:17:51.25173Z
25,Tyrrell,British,881,23,77,14,711.0,7.63,13.8,47,29,1970,1998,2.61,8.74,2025-12-29T18:17:51.25173Z
34,Brabham,British,662,23,78,26,631.0,6.77,9.96,49,22,1962,1992,3.47,11.78,2025-12-29T18:17:51.25173Z


## 4. Season Championship Aggregations

In [0]:
# Get final standings for each season (last race of each year)
# First, find the last race of each season
last_race_per_season = spark.table(f"{silver_db}.races") \
    .groupBy("race_year") \
    .agg(max("race_id").alias("final_race_id"))

# Driver Championships per Season
driver_standings = spark.table(f"{silver_db}.driver_standings")

driver_season_standings = driver_standings \
    .join(last_race_per_season, driver_standings.race_id == last_race_per_season.final_race_id) \
    .join(drivers, "driver_id") \
    .select(
        driver_standings.race_id,
        last_race_per_season.race_year.alias("season"),
        driver_standings.driver_id,
        drivers.full_name.alias("driver_name"),
        drivers.driver_nationality,
        driver_standings.championship_points,
        driver_standings.championship_position,
        driver_standings.total_wins
    )

# Add champion flag
window_spec = Window.partitionBy("season").orderBy(col("championship_position"))
agg_driver_championships = driver_season_standings \
    .withColumn("is_champion", when(col("championship_position") == 1, True).otherwise(False)) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
agg_driver_championships.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{gold_db}.agg_driver_championships")

print(f" Created agg_driver_championships: {agg_driver_championships.count():,} records")

# Show World Champions
print("\nF1 World Champions:")
display(agg_driver_championships.filter(col("is_champion")).orderBy(col("season").desc()).limit(20))

 Created agg_driver_championships: 3,213 records

F1 World Champions:


race_id,season,driver_id,driver_name,driver_nationality,championship_points,championship_position,total_wins,is_champion,updated_at
1168,2025,846,Lando Norris,British,423.0,1,7,True,2025-12-29T18:17:58.945214Z
1144,2024,830,Max Verstappen,Dutch,437.0,1,9,True,2025-12-29T18:17:58.945214Z
1120,2023,830,Max Verstappen,Dutch,575.0,1,19,True,2025-12-29T18:17:58.945214Z
1096,2022,830,Max Verstappen,Dutch,454.0,1,15,True,2025-12-29T18:17:58.945214Z
1073,2021,830,Max Verstappen,Dutch,395.5,1,10,True,2025-12-29T18:17:58.945214Z
1047,2020,1,Lewis Hamilton,British,347.0,1,11,True,2025-12-29T18:17:58.945214Z
1030,2019,1,Lewis Hamilton,British,413.0,1,11,True,2025-12-29T18:17:58.945214Z
1009,2018,1,Lewis Hamilton,British,408.0,1,11,True,2025-12-29T18:17:58.945214Z
988,2017,1,Lewis Hamilton,British,363.0,1,9,True,2025-12-29T18:17:58.945214Z
968,2016,3,Nico Rosberg,German,385.0,1,9,True,2025-12-29T18:17:58.945214Z


## 5. Constructor Championships per Season

In [0]:
constructor_standings = spark.table(f"{silver_db}.constructor_standings")

constructor_season_standings = constructor_standings \
    .join(last_race_per_season, constructor_standings.race_id == last_race_per_season.final_race_id) \
    .join(constructors, "constructor_id") \
    .select(
        constructor_standings.race_id,
        last_race_per_season.race_year.alias("season"),
        constructor_standings.constructor_id,
        constructors.constructor_name,
        constructors.constructor_nationality,
        constructor_standings.championship_points,
        constructor_standings.championship_position,
        constructor_standings.total_wins
    ) \
    .withColumn("is_champion", when(col("championship_position") == 1, True).otherwise(False)) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
constructor_season_standings.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{gold_db}.agg_constructor_championships")

print(f" Created agg_constructor_championships: {constructor_season_standings.count():,} records")

# Show Constructor Champions
print("\n Constructor Champions:")
display(constructor_season_standings.filter(col("is_champion")).orderBy(col("season").desc()).limit(20))

 Created agg_constructor_championships: 930 records

 Constructor Champions:


race_id,season,constructor_id,constructor_name,constructor_nationality,championship_points,championship_position,total_wins,is_champion,updated_at
1168,2025,1,McLaren,British,833.0,1,14,True,2025-12-29T18:18:03.576428Z
1144,2024,1,McLaren,British,666.0,1,6,True,2025-12-29T18:18:03.576428Z
1120,2023,9,Red Bull,Austrian,860.0,1,21,True,2025-12-29T18:18:03.576428Z
1096,2022,9,Red Bull,Austrian,759.0,1,17,True,2025-12-29T18:18:03.576428Z
1073,2021,131,Mercedes,German,613.5,1,9,True,2025-12-29T18:18:03.576428Z
1047,2020,131,Mercedes,German,573.0,1,13,True,2025-12-29T18:18:03.576428Z
1030,2019,131,Mercedes,German,739.0,1,15,True,2025-12-29T18:18:03.576428Z
1009,2018,131,Mercedes,German,655.0,1,11,True,2025-12-29T18:18:03.576428Z
988,2017,131,Mercedes,German,668.0,1,12,True,2025-12-29T18:18:03.576428Z
968,2016,131,Mercedes,German,765.0,1,19,True,2025-12-29T18:18:03.576428Z


## 6. Circuit Statistics

In [0]:
# Aggregate statistics per circuit
agg_circuit_statistics = fact_race_results \
    .groupBy("circuit_id", "circuit_name", "circuit_location", "circuit_country") \
    .agg(
        countDistinct("race_id").alias("total_races_hosted"),
        min("race_year").alias("first_race_year"),
        max("race_year").alias("last_race_year"),
        spark_round(avg("laps_completed"), 1).alias("avg_laps_per_race"),
        
        # Most wins at this circuit
        first(col("driver_name"), ignorenulls=True).alias("most_recent_winner"),
        first(col("constructor_name"), ignorenulls=True).alias("most_recent_winning_team"),
        
        # DNF rate
        count(when(col("status_category") != "Finished", 1)).alias("total_dnfs"),
        count("result_id").alias("total_results"),
        
        # Average positions gained
        spark_round(avg("positions_gained"), 2).alias("avg_positions_gained")
    ) \
    .withColumn("dnf_rate", spark_round((col("total_dnfs") / col("total_results")) * 100, 2)) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
agg_circuit_statistics.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable(f"{gold_db}.agg_circuit_statistics")

print(f" Created agg_circuit_statistics: {agg_circuit_statistics.count():,} records")
display(agg_circuit_statistics.orderBy(col("total_races_hosted").desc()).limit(15))

 Created agg_circuit_statistics: 77 records


circuit_id,circuit_name,circuit_location,circuit_country,total_races_hosted,first_race_year,last_race_year,avg_laps_per_race,most_recent_winner,most_recent_winning_team,total_dnfs,total_results,avg_positions_gained,dnf_rate,updated_at
14,Autodromo Nazionale di Monza,Monza,Italy,75,1950,2025,38.3,Rubens Barrichello,Brawn,1332,1856,3.37,71.77,2025-12-29T18:18:11.70672Z
6,Circuit de Monaco,Monte-Carlo,Monaco,71,1950,2025,49.0,Jenson Button,Brawn,1332,1684,3.24,79.1,2025-12-29T18:18:11.70672Z
9,Silverstone Circuit,Silverstone,UK,60,1950,2025,46.2,Sebastian Vettel,Red Bull,1039,1456,2.9,71.36,2025-12-29T18:18:11.70672Z
13,Circuit de Spa-Francorchamps,Spa,Belgium,58,1950,2025,29.0,Kimi Räikkönen,Ferrari,798,1278,2.82,62.44,2025-12-29T18:18:11.70672Z
7,Circuit Gilles Villeneuve,Montreal,Canada,44,1978,2025,49.1,Robert Kubica,BMW Sauber,779,1072,3.05,72.67,2025-12-29T18:18:11.70672Z
18,Autódromo José Carlos Pace,São Paulo,Brazil,42,1973,2025,49.4,Mark Webber,Red Bull,661,957,2.54,69.07,2025-12-29T18:18:11.70672Z
20,Nürburgring,Nürburg,Germany,41,1951,2020,26.9,Mark Webber,Red Bull,659,976,3.83,67.52,2025-12-29T18:18:11.70672Z
11,Hungaroring,Budapest,Hungary,40,1986,2025,56.7,Lewis Hamilton,McLaren,663,931,2.08,71.21,2025-12-29T18:18:11.70672Z
70,Red Bull Ring,Spielberg,Austria,39,1970,2025,45.6,Michael Schumacher,Ferrari,684,921,3.04,74.27,2025-12-29T18:18:11.70672Z
10,Hockenheimring,Hockenheim,Germany,37,1970,2019,35.6,Lewis Hamilton,McLaren,671,935,3.33,71.76,2025-12-29T18:18:11.70672Z


## 7. Driver Season Performance (Year-over-Year)

In [0]:
# Driver performance by season
agg_driver_season_performance = fact_race_results \
    .groupBy("race_year", "driver_id", "driver_name", "driver_nationality") \
    .agg(
        count("result_id").alias("races"),
        sum("points_earned").alias("total_points"),
        sum(when(col("is_winner"), 1).otherwise(0)).alias("wins"),
        sum(when(col("is_podium"), 1).otherwise(0)).alias("podiums"),
        sum(when(col("grid_position") == 1, 1).otherwise(0)).alias("poles"),
        spark_round(avg("finish_position"), 2).alias("avg_finish"),
        spark_round(avg("grid_position"), 2).alias("avg_grid"),
        first("constructor_name").alias("main_team")
    ) \
    .withColumn("points_per_race", spark_round(col("total_points") / col("races"), 2)) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
agg_driver_season_performance.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .partitionBy("race_year") \
    .saveAsTable(f"{gold_db}.agg_driver_season_performance")

print(f" Created agg_driver_season_performance: {agg_driver_season_performance.count():,} records")

 Created agg_driver_season_performance: 3,232 records


## 8. Constructor Season Performance

In [0]:
# Constructor performance by season
agg_constructor_season_performance = fact_race_results \
    .groupBy("race_year", "constructor_id", "constructor_name", "constructor_nationality") \
    .agg(
        count("result_id").alias("race_entries"),
        sum("points_earned").alias("total_points"),
        sum(when(col("is_winner"), 1).otherwise(0)).alias("wins"),
        sum(when(col("is_podium"), 1).otherwise(0)).alias("podiums"),
        sum(when(col("grid_position") == 1, 1).otherwise(0)).alias("poles"),
        spark_round(avg("finish_position"), 2).alias("avg_finish"),
        countDistinct("driver_id").alias("drivers_used")
    ) \
    .withColumn("points_per_entry", spark_round(col("total_points") / col("race_entries"), 2)) \
    .withColumn("updated_at", current_timestamp())

# Write to Gold
agg_constructor_season_performance.write.format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .partitionBy("race_year") \
    .saveAsTable(f"{gold_db}.agg_constructor_season_performance")

print(f" Created agg_constructor_season_performance: {agg_constructor_season_performance.count():,} records")

 Created agg_constructor_season_performance: 1,121 records


## Key Insights Preview

In [0]:
print(" TOP 10 DRIVERS BY WINS (All Time)")
print("-" * 50)
display(
    spark.table(f"{gold_db}.dim_driver_career")
    .select("driver_name", "total_wins", "total_podiums", "total_pole_positions", "total_career_points")
    .orderBy(col("total_wins").desc())
    .limit(10)
)

 TOP 10 DRIVERS BY WINS (All Time)
--------------------------------------------------


driver_name,total_wins,total_podiums,total_pole_positions,total_career_points
Lewis Hamilton,105,202,104,4955.5
Michael Schumacher,91,155,68,1566.0
Max Verstappen,71,127,48,3301.5
Sebastian Vettel,53,122,57,3098.0
Alain Prost,51,106,33,798.5
Ayrton Senna,41,80,65,614.0
Fernando Alonso,32,106,22,2380.0
Nigel Mansell,31,59,32,482.0
Jackie Stewart,27,43,17,360.0
Niki Lauda,25,54,24,420.5


In [0]:
print(" TOP 10 CONSTRUCTORS BY WINS (All Time)")
print("-" * 50)
display(
    spark.table(f"{gold_db}.dim_constructor_performance")
    .select("constructor_name", "total_wins", "total_podiums", "total_pole_positions", "total_points")
    .orderBy(col("total_wins").desc())
    .limit(10)
)

 TOP 10 CONSTRUCTORS BY WINS (All Time)
--------------------------------------------------


constructor_name,total_wins,total_podiums,total_pole_positions,total_points
Ferrari,249,848,259,11451.27
McLaren,199,542,176,7797.5
Mercedes,131,310,144,8154.639999999999
Red Bull,130,297,111,8083.0
Williams,114,315,128,3765.0
Team Lotus,45,114,61,995.0
Renault,35,103,51,1777.0
Benetton,27,102,15,861.5
Tyrrell,23,77,14,711.0
Brabham,23,78,26,631.0
