In [0]:
# Author: Jay Jun
# Date: March 22, 2025
# Project: Applied Data Homework #3
# Purpose: Answer questions using the F1 data stored on AWS S3, working in Databricks with either Pandas, R, or PySpark

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, round, avg, concat_ws, rank, row_number, round, col, when, udf, year, datediff, count, lit, when
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from pyspark.sql.types import StringType

# Get (or create) the Spark session used throughout the notebook
spark = (
    SparkSession.builder
    .appName("F1 Data Analysis")
    .getOrCreate()
)


def _load_csv(path):
    """Read the CSV at `path` with a header row, letting Spark infer column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


# Raw tables, loaded once here and reused by every question below
pit_stops_df = _load_csv("s3://columbia-gr5069-main/raw/pit_stops.csv")    # pit stops
drivers_df = _load_csv("s3://columbia-gr5069-main/raw/drivers.csv")        # drivers
results_df = _load_csv("s3://columbia-gr5069-main/raw/results.csv")        # race results
races_df = _load_csv("s3://columbia-gr5069-main/raw/races.csv")            # races
status_df = _load_csv("s3://columbia-gr5069-main/raw/status.csv")          # result statuses
qualifying_df = _load_csv("s3://columbia-gr5069-main/raw/qualifying.csv")  # qualifying


In [0]:
# The first question asks "What was the average time each driver spent at the pit stop for each race?"

# Keep only the driver columns needed to build a display name.
# This is stored under a new name instead of overwriting drivers_df: later cells
# read the `code` and `dob` columns from drivers_df, and narrowing it here would
# make those cells fail on a fresh Restart & Run All.
driver_names_df = drivers_df.select("driverId", "forename", "surname")

# Calculate average pit stop duration for each driver in each race
# NOTE(review): assumes `duration` is read as a numeric column; values that do not
# parse as numbers would be skipped by avg -- TODO confirm against the raw data
avg_pit_stop_times = pit_stops_df.groupBy("raceId", "driverId") \
    .agg(round(avg("duration"), 3).alias("avg_duration"))

# Join with the driver names and build a single "forename surname" column
result = avg_pit_stop_times.join(driver_names_df, "driverId") \
    .withColumn("driver_name", concat_ws(" ", trim(col("forename")), trim(col("surname")))) \
    .select("raceId", "driver_name", "avg_duration") \
    .orderBy("raceId", "avg_duration")

# Show the results (first 100 rows, without truncating long values)
result.show(100, False)


In [0]:
# The second question asks us to "rank the average time spent at the pit stop in order of who won each race"

# Pair every pit stop with the race result of the same driver in the same race
joined_df = results_df.join(pit_stops_df, on=["raceId", "driverId"], how="inner")

# Mean pit stop time per (race, driver, finishing position).
# The column is referenced through pit_stops_df so that the pit stop
# `milliseconds` is the one being averaged.
avg_pit_df = (
    joined_df
    .groupBy("raceId", "driverId", "positionOrder")
    .agg(F.avg(pit_stops_df["milliseconds"]).alias("avg_pit_stop_time"))
)

# Within each race, order drivers by their finishing position
window_spec = Window.partitionBy("raceId").orderBy("positionOrder")

# Round the average to 2 decimals and number the drivers by finish within each race
ranked_df = (
    avg_pit_df
    .withColumn("avg_pit_stop_time", F.round(avg_pit_df["avg_pit_stop_time"], 2))
    .withColumn("finishing_rank", F.row_number().over(window_spec))
)

# Sort by race, then finishing rank, and display the first 40 rows
final_df = ranked_df.orderBy("raceId", "finishing_rank")
final_df.show(40)


In [0]:
# The third question asks us to insert the missing code (e.g. ALO for Alonso) for drivers in the 'drivers' dataset

# Define a user defined function (UDF) to generate the missing codes
def generate_code(surname):
    """Build a three-letter driver code from a surname (e.g. "Alonso" -> "ALO").

    Accents are stripped and any character that is not a letter (spaces,
    apostrophes, hyphens, ...) is ignored, so "de Vries" gives "DEV" rather
    than "DE ".

    Args:
        surname: The driver's surname, or None.

    Returns:
        The first three letters of the cleaned surname in upper case (fewer
        if the surname has fewer than three letters), or None when `surname`
        is None.
    """
    # Imported here so the function stays self-contained when shipped as a UDF
    import unicodedata

    # A missing surname cannot produce a code; return None instead of raising
    if surname is None:
        return None

    # Decompose accented characters, then drop everything that is not ASCII
    ascii_name = (
        unicodedata.normalize("NFKD", surname)
        .encode("ascii", "ignore")
        .decode("ascii")
    )
    letters = "".join(ch for ch in ascii_name if ch.isalpha())
    return letters[:3].upper()

# Wrap the Python helper so it can be applied to a Spark column
generate_code_udf = udf(generate_code, StringType())

# A code counts as missing when it is the literal "\N" placeholder or a real null;
# checking only the placeholder would leave null codes unfilled.
code_is_missing = col("code").isNull() | (col("code") == "\\N")

# Apply the (UDF) to fill in the missing codes, keeping existing codes untouched
drivers_df_filled = drivers_df.withColumn(
    "code",
    when(code_is_missing, generate_code_udf(col("surname"))).otherwise(col("code"))
)

# Show the results
drivers_df_filled.select("driverId", "forename", "surname", "code").show(10)

In [0]:
# The fourth question: "Who is the youngest and oldest driver for each race? Create a new column called “Age”"

# Combine each race entry with the driver's date of birth and the race date.
# Only the columns used below are taken from each table.
joined_df = (
    drivers_df.select("driverId", "forename", "surname", "dob")
    .join(results_df.select("raceId", "driverId"), "driverId")
    .join(races_df.select("raceId", "date"), "raceId")
)

race_day = F.col("date").cast("date")
birth_day = F.col("dob").cast("date")

# Age in completed years on race day: whole months between the two dates
# divided by 12, rounded down. (Rounding day-count / 365 to the nearest integer
# would report a driver as a year older once past their half-birthday.)
age_df = (
    joined_df
    .withColumn("driver_name", F.concat_ws(" ", F.trim(F.col("forename")), F.trim(F.col("surname"))))
    .withColumn("Age", F.floor(F.months_between(race_day, birth_day) / 12).cast("int"))
    .filter(F.col("Age").isNotNull())  # skip rows without a usable birth or race date
)

# Structs compare field by field, so min/max over (birth date, name, age) pick
# the driver with the earliest / latest birth date in each race.
driver_by_birth = F.struct(birth_day.alias("dob"), F.col("driver_name"), F.col("Age"))

# One row per race naming the youngest and oldest driver together with their ages
result_df = (
    age_df
    .groupBy("raceId", "date")
    .agg(
        F.max(driver_by_birth).alias("youngest"),  # latest birth date
        F.min(driver_by_birth).alias("oldest"),    # earliest birth date
    )
    .select(
        "raceId",
        "date",
        F.col("youngest.driver_name").alias("Youngest_Driver"),
        F.col("youngest.Age").alias("Youngest_Age"),
        F.col("oldest.driver_name").alias("Oldest_Driver"),
        F.col("oldest.Age").alias("Oldest_Age"),
    )
    .orderBy("raceId")
)

# Show the results
display(result_df)

In [0]:
# Question 5 asks "For a given race, which driver has the most wins and losses?"

# Race used as the cut-off: only results with a smaller raceId are summarised
target_race_id = 843

# Every statusId whose description is anything other than "Finished" is
# treated as a "Did Not Finish" (DNF)
dnf_rows = status_df.filter(col("status") != "Finished").select("statusId").collect()
dnf_status_ids = [row[0] for row in dnf_rows]

# Results of races with a raceId below the target race
previous_races_df = results_df.filter(col("raceId") < target_race_id)

# Row-level conditions behind the 0/1 flag columns
is_win = col("positionOrder") == 1
is_dnf = col("statusId").isin(dnf_status_ids)
is_finish_without_win = (col("positionOrder") > 1) & (~col("statusId").isin(dnf_status_ids))

# Label each result row with win / DNF / finished-but-not-won flags
labeled_df = (
    previous_races_df
    .withColumn("win", when(is_win, 1).otherwise(0))
    .withColumn("not_completed", when(is_dnf, 1).otherwise(0))
    .withColumn("completed_not_won", when(is_finish_without_win, 1).otherwise(0))
    .withColumn("total_participated", lit(1))
)

# Per-driver totals: count() only counts the rows where the flag condition holds
summary_df = labeled_df.groupBy("driverId").agg(
    count(when(col("win") == 1, True)).alias("wins"),
    count(when(col("completed_not_won") == 1, True)).alias("completed_not_won"),
    count(when(col("not_completed") == 1, True)).alias("not_completed"),
    count(col("total_participated")).alias("total_races"),
)

# Attach surnames; the left join keeps drivers even without a match in drivers_df
driver_surnames = drivers_df.select("driverId", "surname")
final_df = summary_df.join(driver_surnames, on="driverId", how="left")

# Display results, most wins first
final_df.select("surname", "wins", "completed_not_won", "not_completed", "total_races") \
    .orderBy("wins", ascending=False).show()

In [0]:
# The question I will be answering is "Which driver improves the most positions, on average, from their qualifying position to their race finishing position?"

# Match each qualifying entry with the finishing position of the same driver in the same race
finish_positions = results_df.select("raceId", "driverId", "positionOrder")
joined_df = qualifying_df.join(finish_positions, on=["raceId", "driverId"], how="inner")

# Qualifying position minus finishing position: a positive value means places gained
position_diff_df = joined_df.withColumn(
    "position_gain", F.col("position") - F.col("positionOrder")
)

# Mean gain per driver, rounded to the nearest whole number
avg_gain_df = (
    position_diff_df
    .groupBy("driverId")
    .agg(F.round(F.avg("position_gain")).alias("avg_position_gain"))
)

# Attach surnames; the left join keeps drivers even without a match in drivers_df
driver_surnames = drivers_df.select("driverId", "surname")
final_gain_df = avg_gain_df.join(driver_surnames, on="driverId", how="left")

# Show drivers from the largest average gain to the smallest
display(
    final_gain_df.select("surname", "avg_position_gain")
    .orderBy("avg_position_gain", ascending=False)
)