In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# Load the raw CSV extracts from S3. Only `header=True` is supplied (no schema
# and no inferSchema), so Spark reads every column as a string; numeric columns
# need casting before they are used numerically.
def _read_raw_csv(path):
    """Read one raw CSV file, taking column names from its first line."""
    return spark.read.csv(path, header=True)


df_pit_stops = _read_raw_csv('s3://columbia-gr5069-main/raw/pit_stops.csv')
df_drivers = _read_raw_csv('s3://columbia-gr5069-main/raw/drivers.csv')
df_results = _read_raw_csv('s3://columbia-gr5069-main/raw/results.csv')
df_races = _read_raw_csv('s3://columbia-gr5069-main/raw/races.csv')
df_status = _read_raw_csv('s3://columbia-gr5069-main/raw/status.csv')
df_constructors = _read_raw_csv('s3://columbia-gr5069-main/raw/constructors.csv')

### 1. What was the average time each driver spent at the pit stop for each race?

In [0]:
# Peek at a few pit-stop rows to see the available columns before aggregating.
pit_stops_preview = df_pit_stops.limit(10)
display(pit_stops_preview)

In [0]:
# Average pit-stop time per driver per race.
#
# The CSVs were read with header=True only, so every column is a string.
# `milliseconds` is cast explicitly and used as the single source of truth:
# `avg_duration` (seconds) is derived from it rather than from the free-text
# `duration` column, whose implicit string->double cast would silently drop any
# value that is not a plain decimal number.
# NOTE(review): presumably very long stops are stored as "m:ss.fff" in
# `duration` -- verify against the data.
pit_ms = F.col("milliseconds").cast("double")

avg_pit_stop_times = (
    df_pit_stops
    .groupBy("raceId", "driverId")
    .agg(
        (F.avg(pit_ms) / 1000).alias("avg_duration"),   # seconds
        F.avg(pit_ms).alias("avg_milliseconds"),
        F.count("stop").alias("num_pit_stops"),
    )
    # raceId is a string; cast so races sort numerically rather than lexically.
    .orderBy(F.col("raceId").cast("int"), "avg_milliseconds")
)

display(avg_pit_stop_times.limit(10))

### 2. Rank the average time spent at the pit stop in order of who won each race

In [0]:
# Peek at a few result rows to see which finishing-position columns exist.
results_preview = df_results.limit(10)
display(results_preview)

In [0]:
# Per-driver, per-race pit-stop averages (left unordered here; the final
# ordering is applied after joining the race results).
# All CSV columns are strings, so `milliseconds` is cast explicitly and
# `avg_duration` (seconds) is derived from it instead of from the free-text
# `duration` column, whose implicit cast would drop non-decimal values.
pit_ms = F.col("milliseconds").cast("double")

avg_pit_stop_times = (
    df_pit_stops
    .groupBy("raceId", "driverId")
    .agg(
        (F.avg(pit_ms) / 1000).alias("avg_duration"),   # seconds
        F.avg(pit_ms).alias("avg_milliseconds"),
        F.count("stop").alias("num_pit_stops"),
    )
)

# Only the finishing-position columns are needed from the results table.
race_outcomes = df_results.select(
    "raceId", "driverId", "position", "positionText", "positionOrder"
)

# A driver counts as having finished only when `position` holds an actual
# number. Because the CSV is read without a schema or `nullValue`, a missing
# position may arrive as placeholder text rather than a real null
# (presumably "\N" in this export -- verify), which `isNotNull()` alone would
# accept. The digit check rejects both real nulls and such placeholders.
has_numeric_position = F.col("position").rlike(r"^[0-9]+$")

pit_stops_with_results = (
    avg_pit_stop_times
    .join(race_outcomes, on=["raceId", "driverId"], how="inner")
    .withColumn(
        "finished_race",
        F.when(
            (F.col("positionText") != "R") & has_numeric_position, True
        ).otherwise(False),
    )
)

# Order rows race by race, then by finishing order (winner first). The casts
# make the string columns sort numerically rather than lexically.
ranked_pit_stops = pit_stops_with_results.orderBy(
    F.col("raceId").cast("int"),
    F.col("positionOrder").cast("int"),
)

display(ranked_pit_stops.limit(10))

**My approach to handling drivers who did not finish the race:**
1. I created a boolean column `finished_race` that separates drivers who finished (`position` is not null and `positionText` is not `"R"`) from those who did not.
2. I kept both finishers and non-finishers in the main analysis, ordered by `positionOrder`, so that non-finishers are still ranked by how far they progressed in the race before retiring.

### 3. Insert the missing code (e.g., ALO for Alonso) for drivers based on the `drivers` dataset

In [0]:
# Peek at a few driver rows to see the available columns.
drivers_preview = df_drivers.limit(10)
display(drivers_preview)

### 4. Who is the youngest and oldest driver for each race? Create a new column called “Age”

### 5. For a given race, which driver has the most wins and losses?

### 6. Continue exploring the data by answering your own question.