# Gold Layer – Lap Performance

## Objective
This notebook creates the `gold.lap_performance` table by combining lap times, telemetry, and weather data.  
It focuses on building a comprehensive lap-level dataset that enables performance analytics.

## Steps
1. Load Silver tables: `lap_times`, `telemetry_data`, `weather_data`.  
2. Join the per-lap telemetry aggregates on `driver`, `lap_number`, and `session_key`; join the weather aggregates on `session_key`.  
3. Compute key performance indicators (KPIs):
   - Average sector times  
   - Speed metrics (max, min, avg per lap)  
   - Tyre stint duration and compound  
   - Weather impact (track temperature, rainfall)  
4. Add derived flags:
   - `is_fastest_lap` per race  
   - `pit_lap` indicator  
5. Remove duplicates and handle nulls.  
6. Write the final enriched dataset into the Gold layer as `gold.lap_performance`.  
7. Optimize the table with ZORDER on `(session_key, driver, lap_number)`.  


In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# 1. Load Silver tables
lap_times = spark.table("silver.lap_times")
telemetry = spark.table("silver.telemetry_data")
weather = spark.table("silver.weather_data")

# 2. Raw telemetry is intentionally NOT joined to lap_times here.
# A direct join would yield one row per telemetry sample instead of one row
# per lap, and its result was never used: `lap_perf` is built further down
# from lap_times and the per-lap telemetry aggregates.

# 3. Summarise telemetry to one row per (driver, lap_number, session_key)
_lap_keys = ["driver", "lap_number", "session_key"]
_telemetry_metrics = [
    F.avg("Speed").alias("avg_speed"),
    F.max("Speed").alias("max_speed"),
    F.min("Speed").alias("min_speed"),
    F.avg("RPM").alias("avg_rpm"),
    F.avg("Throttle").alias("avg_throttle"),
    F.avg("Brake").alias("avg_brake"),
]
telemetry_agg = telemetry.groupBy(*_lap_keys).agg(*_telemetry_metrics)

# 4. Attach the per-lap telemetry summary to lap_times.
# A left join keeps every lap; laps with no matching telemetry get null metrics.
lap_perf = lap_times.join(telemetry_agg, on=_lap_keys, how="left")

# 5. Add weather conditions.
# Weather readings are averaged over the whole session (one row per
# session_key), so every lap of a session receives the same weather values.
_weather_averages = {
    "TrackTemp": "avg_track_temp",
    "AirTemp": "avg_air_temp",
    "Humidity": "avg_humidity",
    "Rainfall": "avg_rainfall",
}
weather_agg = weather.groupBy("session_key").agg(
    *[F.avg(source).alias(target) for source, target in _weather_averages.items()]
)

# Left join: laps from sessions without weather data keep null weather columns.
lap_perf = lap_perf.join(weather_agg, on="session_key", how="left")

# 6. Clean duplicates BEFORE deriving flags.
# row_number() gives exact duplicate rows different ranks, so flagging first
# would make the duplicates differ in `is_fastest_lap` and a later
# dropDuplicates() would no longer remove them.
lap_perf = lap_perf.dropDuplicates()

# 7. Derived flags
# Fastest lap per session_key. Ties on lap_time are broken by driver and
# lap_number so the flagged row is deterministic across runs.
w = Window.partitionBy("session_key").orderBy(
    F.col("lap_time").asc_nulls_last(),
    F.col("driver"),
    F.col("lap_number"),
)
lap_perf = lap_perf.withColumn("row_num", F.row_number().over(w))
# Require a non-null lap_time: with asc_nulls_last a session whose laps all
# lack a time would otherwise still get its first row flagged as fastest.
lap_perf = lap_perf.withColumn(
    "is_fastest_lap",
    F.col("lap_time").isNotNull() & (F.col("row_num") == 1),
)
lap_perf = lap_perf.drop("row_num")

# Pit stop flag: true when the lap has a pit-in or a pit-out time.
# isNotNull() never yields null, so the expression is always true/false.
lap_perf = lap_perf.withColumn(
    "pit_lap",
    F.col("pit_in_time").isNotNull() | F.col("pit_out_time").isNotNull(),
)

# 8. Persist to the Gold layer as a Delta table.
# The table is fully replaced on each run, and its schema is allowed to change.
_gold_writer = (
    lap_perf.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
_gold_writer.saveAsTable("gold.lap_performance")

# 9. Compact files and Z-order by the columns named in the statement below.
_optimize_stmt = "OPTIMIZE gold.lap_performance ZORDER BY (session_key, driver, lap_number)"
spark.sql(_optimize_stmt)
