In [0]:
# Author: Aria Narang
# Date: March 26, 2025
# Project: Applied Data Science Take Home Excercise #2
# Datasets used: raw/pit_stops.csv, raw/drivers.csv, raw/results.csv, raw/races.csv, raw/status.csv, raw/qualifying.csv

In [0]:
# Import necessary PySpark SQL functions and Window functionality
# - col: For column references and operations
# - year, current_date: For date manipulations
# - avg, min, max, count: Aggregate functions for statistical analysis
# - when: For conditional logic (similar to IF-THEN-ELSE)
# - rank, dense_rank: Window functions for ranking results
# - Window: For defining partitioning and ordering in window operations
from pyspark.sql.functions import col, year, current_date, avg, min, max, count, when, rank, dense_rank, lit, concat, datediff, substring, upper, stddev, sum
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType, DateType

In [0]:
#Read in the two datasets that are necesary for this take home assignment
df_laptimes = spark.read.csv('s3://columbia-gr5069-main/raw/lap_times.csv', header=True)
df_drivers = spark.read.csv('s3://columbia-gr5069-main/raw/drivers.csv', header=True)
df_races = spark.read.csv('s3://columbia-gr5069-main/raw/races.csv', header=True)
df_results = spark.read.csv('s3://columbia-gr5069-main/raw/results.csv', header=True)
df_pit_stops = spark.read.csv('s3://columbia-gr5069-main/raw/pit_stops.csv', header=True)
df_status = spark.read.csv('s3://columbia-gr5069-main/raw/status.csv', header=True)
df_qualifying = spark.read.csv("s3://columbia-gr5069-main/raw/qualifying.csv", header=True, inferSchema=True)



In [0]:
#Question 1: What was the average time each driver spent at the pit stop for each race?

# Calculate the average pit stop time per driver per race
avg_pit_times = df_pit_stops.groupBy("raceId", "driverId") \
                           .agg(avg("milliseconds").alias("avg_pit_time_ms")) \
                           .withColumn("avg_pit_time_seconds", col("avg_pit_time_ms") / 1000)

# Join with driver names for better readability
pit_stops_with_names = avg_pit_times.join(
    df_drivers.select("driverId", "forename", "surname"),
    on="driverId"
).withColumn("driver_name", concat(col("forename"), lit(" "), col("surname")))

# Join with race names
final_result = pit_stops_with_names.join(
    df_races.select("raceId", "name", "year"),
    on="raceId"
)

# Select and order the columns for display
q1_result = final_result.select(
    "year", "name", "driver_name", "avg_pit_time_seconds"
).orderBy("year", "name", "avg_pit_time_seconds")

# Display sample of results
display(q1_result.orderBy("year", "name", "avg_pit_time_seconds").limit(5))
print("Showing 5 of", q1_result.count(), "total records")



year,name,driver_name,avg_pit_time_seconds
2011,Abu Dhabi Grand Prix,Pastor Maldonado,16.549
2011,Abu Dhabi Grand Prix,Bruno Senna,18.057
2011,Abu Dhabi Grand Prix,Lewis Hamilton,19.3945
2011,Abu Dhabi Grand Prix,Nico Rosberg,19.846
2011,Abu Dhabi Grand Prix,Michael Schumacher,20.149


Showing 5 of 4931 total records


In [0]:
# Question 2: Rank the average time spent at the pit stop in order of who won each race

from pyspark.sql.functions import col, avg, concat, lit, rank
from pyspark.sql.window import Window

# Identify race winners (position = 1)
race_winners = df_results.filter(col("position") == 1) \
                        .select("raceId", "driverId") \
                        .withColumnRenamed("driverId", "winner_id")

# Calculate average pit stop time for each driver in each race
avg_pit_times = df_pit_stops.groupBy("raceId", "driverId") \
                           .agg(avg("milliseconds").alias("avg_pit_time_ms")) \
                           .withColumn("avg_pit_time_seconds", col("avg_pit_time_ms") / 1000)

# Join with race winners data
pit_times_with_winners = avg_pit_times.join(race_winners, on="raceId")

# Join with driver information for names
pit_times_with_names = pit_times_with_winners.join(
    df_drivers.select("driverId", "forename", "surname"),
    on="driverId"
).withColumn("driver_name", concat(col("forename"), lit(" "), col("surname")))

# Create a window function to rank pit stop times within each race
window_spec = Window.partitionBy("raceId").orderBy("avg_pit_time_ms")
ranked_pit_times = pit_times_with_names.withColumn("pit_time_rank", rank().over(window_spec))

# Mark the race winner
ranked_pit_times = ranked_pit_times.withColumn(
    "is_winner", 
    when(col("driverId") == col("winner_id"), lit("Race Winner")).otherwise(lit(""))
)

# Join with race information
final_result = ranked_pit_times.join(
    df_races.select("raceId", "name", "year"),
    on="raceId"
)

# Select and order columns for display
q2_result = final_result.select(
    "year", 
    "name", 
    "driver_name", 
    "avg_pit_time_seconds", 
    "pit_time_rank", 
    "is_winner"
).orderBy("year", "name", "pit_time_rank")

# Display sample of results
display(q2_result.orderBy("name", "pit_time_rank").limit(5))
print("Showing 5 of", q2_result.count(), "total records")

# Add a note about drivers who didn't finish or make pit stops
print("Note: Drivers who didn't finish the race or didn't make pit stops are not included in this analysis.")

year,name,driver_name,avg_pit_time_seconds,pit_time_rank,is_winner
2020,70th Anniversary Grand Prix,Antonio Giovinazzi,27.7925,1,
2020,70th Anniversary Grand Prix,Esteban Ocon,28.093,2,
2020,70th Anniversary Grand Prix,Lance Stroll,28.1045,3,
2020,70th Anniversary Grand Prix,Alexander Albon,28.1375,4,
2020,70th Anniversary Grand Prix,Daniil Kvyat,28.2005,5,


Showing 5 of 4931 total records
Note: Drivers who didn't finish the race or didn't make pit stops are not included in this analysis.


In [0]:
# Question 3: Insert the missing code (e.g: ALO for Alonso) for drivers based on the 'drivers' dataset

from pyspark.sql.functions import col, upper, substring, when, lit

# Check for drivers with missing codes 
missing_codes = df_drivers.filter(
    (col("code").isNull()) | 
    (col("code") == "") | 
    (col("code") == "\\N") |  # This might be how NULL is represented in your data
    (col("code") == "N")
)

# Generate three-letter codes for ALL drivers
df_drivers_with_codes = df_drivers.withColumn(
    "generated_code",
    when((col("code").isNull()) | (col("code") == "") | (col("code") == "\\N") | (col("code") == "N"),
         upper(substring(col("surname"), 1, 3))
    ).otherwise(col("code"))
)

# Now let's specifically check which drivers got NEW codes
drivers_with_new_codes = df_drivers_with_codes.filter(
    col("code") != col("generated_code")
)

# Display sample of results
print("Drivers with newly generated codes (sample):")
display(drivers_with_new_codes.select(
    "driverId", 
    "forename", 
    "surname", 
    "code", 
    "generated_code"
).orderBy("driverId").limit(5))

Drivers with newly generated codes (sample):


driverId,forename,surname,code,generated_code
100,Érik,Comas,\N,COM
101,David,Brabham,\N,BRA
102,Ayrton,Senna,\N,SEN
103,Éric,Bernard,\N,BER
104,Christian,Fittipaldi,\N,FIT


In [0]:
# Question 4: Who is the youngest and oldest driver for each race? Create a new column called “Age”

from pyspark.sql.functions import col, datediff, year, lit, concat, rank
from pyspark.sql.window import Window
from pyspark.sql.types import IntegerType

# First, get all driver-race combinations from results
race_participants = df_results.select("raceId", "driverId").distinct()

# Get race dates from the races table
race_dates = df_races.select("raceId", "date", "name", "year")

# Join races and drivers to calculate age at race time
driver_ages = race_participants.join(race_dates, on="raceId") \
                              .join(df_drivers.select("driverId", "dob", "forename", "surname"), on="driverId") \
                              .withColumn("race_date", col("date").cast("date")) \
                              .withColumn("birth_date", col("dob").cast("date")) \
                              .withColumn("age_years", (datediff(col("race_date"), col("birth_date")) / 365.25).cast(IntegerType())) \
                              .withColumn("driver_name", concat(col("forename"), lit(" "), col("surname")))

# Define windows for finding youngest and oldest drivers per race
window_youngest = Window.partitionBy("raceId").orderBy("age_years")
window_oldest = Window.partitionBy("raceId").orderBy(col("age_years").desc())

# Find youngest driver for each race
youngest_drivers = driver_ages.withColumn("rank", rank().over(window_youngest)) \
                             .filter(col("rank") == 1) \
                             .select("raceId", "driver_name", "age_years") \
                             .withColumnRenamed("driver_name", "youngest_driver") \
                             .withColumnRenamed("age_years", "youngest_age")

# Find oldest driver for each race
oldest_drivers = driver_ages.withColumn("rank", rank().over(window_oldest)) \
                           .filter(col("rank") == 1) \
                           .select("raceId", "driver_name", "age_years") \
                           .withColumnRenamed("driver_name", "oldest_driver") \
                           .withColumnRenamed("age_years", "oldest_age")

# Combine youngest and oldest driver information
age_results = youngest_drivers.join(oldest_drivers, on="raceId") \
                             .join(race_dates, on="raceId") \
                             .select("year", "name", "youngest_driver", "youngest_age", "oldest_driver", "oldest_age")

# Display sample of results
print("Sample of races with youngest and oldest drivers:")
display(age_results.orderBy("year", "name").limit(5))

# Display sample of results with age column
print("Sample of all drivers with their 'Age' column:")
display(driver_ages.select("name", "year", "driver_name", "age_years")
       .withColumnRenamed("age_years", "Age")  # Rename to match the requirement
       .orderBy("year", "name", "driver_name")
       .limit(5))

# Add explanation of the age calculation approach
"""
Age Calculation Approach:
1. Created a new "age_years" column that calculates each driver's age at the time of the race
2. Age is calculated by finding the difference between race date and birth date in days
3. Divided this difference by 365.25 to convert to years (accounting for leap years)
4. For each race, identified both the youngest and oldest driver
5. This approach gives an accurate age count based on how many birthdays each driver had experienced by race day
"""

Sample of races with youngest and oldest drivers:


year,name,youngest_driver,youngest_age,oldest_driver,oldest_age
1950,Belgian Grand Prix,Geoff Crossley,29,Philippe Étancelin,53
1950,British Grand Prix,Geoff Crossley,29,Philippe Étancelin,53
1950,French Grand Prix,José Froilán González,27,Philippe Étancelin,53
1950,Indianapolis 500,Jimmy Davies,20,Mauri Rose,44
1950,Indianapolis 500,Troy Ruttman,20,Mauri Rose,44


Sample of all drivers with their 'Age' column:


name,year,driver_name,Age
Belgian Grand Prix,1950,Alberto Ascari,31
Belgian Grand Prix,1950,Eugène Chaboud,43
Belgian Grand Prix,1950,Geoff Crossley,29
Belgian Grand Prix,1950,Johnny Claes,33
Belgian Grand Prix,1950,Juan Fangio,38


'\nAge Calculation Approach:\n1. Created a new "age_years" column that calculates each driver\'s age at the time of the race\n2. Age is calculated by finding the difference between race date and birth date in days\n3. Divided this difference by 365.25 to convert to years (accounting for leap years)\n4. For each race, identified both the youngest and oldest driver\n5. This approach gives an accurate age count based on how many birthdays each driver had experienced by race day\n'

In [0]:
# Question 6 (my own) : Which nationality is most common for the drivers? 

from pyspark.sql.functions import count, desc

# Count the number of drivers by nationality
nationality_counts = df_drivers.groupBy("nationality") \
                             .agg(count("driverId").alias("driver_count"))

# Calculate percentage of total for each nationality
total_drivers = df_drivers.count()
nationality_percentages = nationality_counts.withColumn(
    "percentage", 
    (col("driver_count") / total_drivers * 100).cast("decimal(5,2)")
)

# Sort by count in descending order
sorted_nationalities = nationality_percentages.orderBy(desc("driver_count"))

# Display sample of results
print("Top 10 most common nationalities among F1 drivers:")
display(sorted_nationalities.select(
    "nationality", "driver_count", "percentage"
).limit(10))

# Add explanation
"""
Nationality Analysis:
I analyzed which countries have produced the most Formula 1 drivers throughout history.
This helps understand which nations have the strongest motorsport traditions and driver development programs.
"""

Top 10 most common nationalities among F1 drivers:


nationality,driver_count,percentage
British,165,19.25
American,158,18.44
Italian,99,11.55
French,73,8.52
German,50,5.83
Brazilian,32,3.73
Argentine,24,2.8
Swiss,23,2.68
South African,23,2.68
Belgian,23,2.68


'\nNationality Analysis:\nI analyzed which countries have produced the most Formula 1 drivers throughout history.\nThis helps understand which nations have the strongest motorsport traditions and driver development programs.\n'