In [0]:
from pyspark.sql.functions import avg
from pyspark.sql.functions import avg, rank
from pyspark.sql.window import Window
from pyspark.sql.functions import expr, upper, substring, when, col, count,asc
from pyspark.sql.functions import col, year, datediff, current_date
from pyspark.sql.functions import min, max
from pyspark.sql import functions as F
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler

# Question 1: What was the average time each driver spent at the pit stop for each race?

In [0]:
# Load the raw pit-stop records; the first row is the header and column
# types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://columbia-gr5069-main/raw/pit_stops.csv")
)
df.show(50)

In [0]:
# Mean of the "milliseconds" column for every (raceId, driverId) pair.
avg_pitstop_time = (
    df.groupBy("raceId", "driverId")
    .agg(F.avg("milliseconds").alias("avg_pitstop"))
)
avg_pitstop_time.show(50)

# Question 2: Rank the average time spent at the pit stop in order of who won each race

In [0]:
# Load the race results; multiLine lets quoted fields span several lines.
df1 = spark.read.csv(
    "s3://columbia-gr5069-main/raw/results.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
)

df1.show(30)

In [0]:
# Attach each driver's finishing position to their average pit-stop time.
result_positions = df1.select("raceId", "driverId", "positionOrder", "positionText")
joined = avg_pitstop_time.join(result_positions, on=["raceId", "driverId"], how="inner")

# Drop rows whose positionText is "R", "N" or "D".
finish = joined.where(~F.col("positionText").isin(["R", "N", "D"]))

In [0]:
# Within each race, list drivers from best to worst finishing position.
ranked = finish.sort(F.col("raceId"), F.col("positionOrder").asc())
ranked.show(50)

I excluded drivers who were not classified, were disqualified, or did not finish the race, so that the comparison among the remaining drivers in each race is clearer. Their pit stop times were not used in my analysis.

# Question 3: Insert the missing code (e.g: ALO for Alonso) for drivers based on the 'drivers' dataset

In [0]:
# Load the drivers table; the first row is the header and column types
# are inferred from the data.
df2 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("s3://columbia-gr5069-main/raw/drivers.csv")
)
df2.show(50)

In [0]:
# A code counts as missing when it is NULL or the literal string \N.
code_is_missing = F.col("code").isNull() | (F.col("code") == "\\N")

# Replacement value: the first three characters of the surname, upper-cased.
surname_prefix = F.upper(F.substring(F.col("surname"), 1, 3))

df2 = df2.withColumn(
    "code",
    F.when(code_is_missing, surname_prefix).otherwise(F.col("code")),
)
df2.show(50)

I noticed that the code is the capitalized first three letters of the surname, and that some rows have `\N` in the code column. So I replaced those with the corresponding first three letters of the surname and kept the values that were already correct codes.

# Question 4: Who is the youngest and oldest driver for each race? Create a new column called “Age”

In [0]:
# Cast dob to a date, then derive Age as (current calendar year) minus
# (birth year). Month and day are ignored, so Age is a whole-year difference.
df2 = (
    df2
    .withColumn("dob", F.col("dob").cast("date"))
    .withColumn("Age", F.year(F.current_date()) - F.year(F.col("dob")))
)
df2.show(50)

In [0]:
# Bring driver attributes (including Age) onto every result row.
df_age = df1.join(df2, on="driverId", how="left")

# One window per race, used to find the minimum and maximum Age in that race.
window_spec = Window.partitionBy("raceId")

# Label a row "youngest" or "oldest" when its Age equals the race extreme;
# rows matching neither are left NULL. "youngest" is checked first.
age_label = (
    F.when(F.col("Age") == F.col("youngest_age"), "youngest")
    .when(F.col("Age") == F.col("oldest_age"), "oldest")
)

df_labeled = (
    df_age
    .withColumn("youngest_age", F.min("Age").over(window_spec))
    .withColumn("oldest_age", F.max("Age").over(window_spec))
    .withColumn("age_group", age_label)
)

# Keep only the rows that received a label.
df_filtered = df_labeled.where(F.col("age_group").isNotNull())
df_filtered.select("raceId", "driverId", "Age", "age_group").show(10)

# Question 5: For a given race, which driver has the most wins and losses?

In [0]:
# Keep only rows whose positionText is not "R", "N" or "D".
completed = df1.filter(~F.col("positionText").isin("R", "N", "D"))

# Per-driver history window: all of the driver's rows that come before the
# current one when ordered by raceId (the current row itself is excluded).
# NOTE(review): this assumes raceId order matches chronological order — confirm.
window_spec1 = (
    Window.partitionBy("driverId")
    .orderBy("raceId")
    .rowsBetween(Window.unboundedPreceding, -1)
)

# A win is a first-place finish; any other classified finish counts as a loss.
with_flags = (
    completed
    .withColumn("is_win", F.when(F.col("positionOrder") == 1, 1).otherwise(0))
    .withColumn("is_loss", F.when(F.col("positionOrder") > 1, 1).otherwise(0))
)

# The running totals must be taken over window_spec1 (per-driver history),
# not the per-race window_spec. For a driver's first row the frame is empty
# and the sum is NULL, so it is coalesced to 0.
winslosses_history = (
    with_flags
    .withColumn("past_wins", F.coalesce(F.sum("is_win").over(window_spec1), F.lit(0)))
    .withColumn("past_losses", F.coalesce(F.sum("is_loss").over(window_spec1), F.lit(0)))
)
winslosses_history.select("raceId", "driverId", "past_wins", "past_losses").show()

# Question 6: Continue exploring the data by answering your own question. 

In [0]:
# Explore how the starting position (grid) relates to the finishing position
# (positionOrder) by fitting a one-feature linear regression.
assembler = VectorAssembler(inputCols=["grid"], outputCol="features")
grid_and_finish = completed.select("grid", "positionOrder")
assembled = assembler.transform(grid_and_finish)

lr = LinearRegression(featuresCol="features", labelCol="positionOrder")
model = lr.fit(assembled)
summary = model.summary

# Report the fitted parameters and the training-fit metrics.
for label, value in (
    ("Coefficients:", model.coefficients),
    ("Intercept:", model.intercept),
    ("R^2:", summary.r2),
    ("RMSE:", summary.rootMeanSquaredError),
):
    print(label, value)

summary.residuals.show()

The regression shows that grid has almost no explanatory power for the race result in this dataset: the R² is very low (0.00049) and the coefficient is small (0.023), meaning that starting one place farther back is associated with a only slightly worse (numerically higher) finishing position. The effect is negligible, and the model’s predictions are typically off by about 7.5 positions (RMSE), suggesting that race outcomes are influenced more by other factors, such as pit stops and DNFs, than by starting position.