# Databricks notebook source
# MAGIC %md
# ### 02. Transform and Load Bronze Table
# In this notebook:
# - Load raw FastF1 lap data (from Day 1 cache)
# - Convert Pandas DataFrame → Spark DataFrame
# - Handle unsupported dtypes (timedelta64)
# - Write to the Delta Bronze table `lap_times` in the f1_catalog.bronze schema


# #### 1. Import libraries and reload FastF1 session


In [0]:
%pip install fastf1 matplotlib pandas

In [0]:
%restart_python

In [0]:
import fastf1
import pandas as pd
from pyspark.sql.functions import col

# Enable the FastF1 cache at the same workspace location used on Day 1.
cache_dir = "/Workspace/Users/niranjan.482000@gmail.com/F1-Race-Analytics/cache"
fastf1.Cache.enable_cache(cache_dir)

# Example session: Monza GP 2023, race ("R").
session = fastf1.get_session(2023, "Monza", "R")
session.load()

# Lap data of the loaded session; print its shape and preview the first rows.
df = session.laps
print("Pandas DataFrame shape:", df.shape)
df.head(3)


In [0]:
# COMMAND ----------
# MAGIC %md
# #### 2. Convert Pandas → Spark


In [0]:
# Build a Spark DataFrame directly from the pandas laps frame and
# print the schema Spark derived for it.
spark_df = spark.createDataFrame(data=df)
spark_df.printSchema()

In [0]:
# MAGIC %md
# MAGIC ## Convert timedeltas in pandas then create Spark DF

In [0]:
# 1) Convert timedelta columns to milliseconds in pandas
# These are the FastF1 lap columns expected to hold timedelta values; any that
# are absent from df are skipped.
timedelta_cols = [
    "Time","LapTime","PitOutTime","PitInTime",
    "Sector1Time","Sector2Time","Sector3Time",
    "Sector1SessionTime","Sector2SessionTime","Sector3SessionTime",
    "LapStartTime"
]

# df is the pandas DataFrame from FastF1 (session.laps)
import pandas as pd

# Work on a copy: df is the very object returned by session.laps, so assigning
# columns in place would also rewrite the laps held by the session.
df = df.copy()

for c in timedelta_cols:
    # Convert only columns that are still timedelta64. This keeps the cell safe
    # to re-run: once a column holds float milliseconds the .dt accessor would
    # raise AttributeError.
    if c in df.columns and pd.api.types.is_timedelta64_dtype(df[c]):
        # total_seconds() yields NaN for NaT, so missing times stay missing in
        # the float millisecond result.
        df[c] = df[c].dt.total_seconds() * 1000.0

# ensure datetime columns are proper dtype
if "LapStartDate" in df.columns:
    df["LapStartDate"] = pd.to_datetime(df["LapStartDate"])


In [0]:
# 2) Rebuild the Spark DataFrame from the converted pandas frame
#    (timedelta columns are now plain numbers, so no INTERVAL types) and
#    print the resulting schema.
spark_df = spark.createDataFrame(data=df)
spark_df.printSchema()


In [0]:
# MAGIC %md
# MAGIC ## Rename to snake_case and cast ms -> long


In [0]:
import re
from pyspark.sql.functions import col

def to_snake(s: str) -> str:
    """Return the snake_case form of a CamelCase / PascalCase identifier.

    Two substitution passes insert underscores, then the result is lowercased:
    the first splits before a capitalised word (an uppercase letter followed by
    lowercase letters), the second splits between a lowercase letter or digit
    and a following uppercase letter.
    """
    result = s
    for pattern in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
        result = re.sub(pattern, r'\1_\2', result)
    return result.lower()

# Rename every column to snake_case in one projection instead of one
# withColumnRenamed call per column.
spark_df = spark_df.toDF(*[to_snake(name) for name in spark_df.columns])

# Cast the millisecond columns to long.
# - Round before casting: the values are floats computed as seconds * 1000.0,
#   so e.g. 1000.9999999999999 would be truncated to 1000 by a bare cast.
# - Turn NaN (missing time) into NULL explicitly rather than relying on how the
#   cast treats NaN. when() without otherwise() yields NULL for rows that fail
#   the condition; NULL inputs pass the condition and round to NULL.
from pyspark.sql.functions import isnan, round as spark_round, when

for c in timedelta_cols:
    newc = to_snake(c)
    if newc in spark_df.columns:
        ms = col(newc)
        spark_df = spark_df.withColumn(
            newc,
            when(~isnan(ms), spark_round(ms)).cast("long"),
        )

spark_df.printSchema()


In [0]:
# MAGIC %md
# MAGIC ## Write to bronze delta table and validate


In [0]:
# Overwrite the bronze Delta table with the prepared lap data.
delta_writer = spark_df.write.format("delta").mode("overwrite")
delta_writer.saveAsTable("f1_catalog.bronze.lap_times")

# quick validation: row count, then a small sample of key columns
row_count_df = spark.sql("SELECT COUNT(*) AS cnt FROM f1_catalog.bronze.lap_times")
display(row_count_df)

sample_df = spark.sql("SELECT driver, lap_number, lap_time, position, compound FROM f1_catalog.bronze.lap_times LIMIT 10")
display(sample_df)