In [0]:
from pyspark.sql.functions import datediff, current_date, avg
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, max, min, when, substring, upper, floor, months_between, to_date, first

In [0]:
# Load the raw race results. No schema inference is requested, so numeric-looking
# columns such as raceId and positionOrder are cast explicitly where later cells need them.
df_results = spark.read.option("header", True).csv("s3://columbia-gr5069-main/raw/results.csv")
display(df_results)

In [0]:
# Load the raw pit-stop records; the first row of the file supplies the column names.
df_pitstops = spark.read.option("header", True).csv("s3://columbia-gr5069-main/raw/pit_stops.csv")
display(df_pitstops)

In [0]:
# Load the raw lap-time records; the first row of the file supplies the column names.
df_laptimes = spark.read.option("header", True).csv("s3://columbia-gr5069-main/raw/lap_times.csv")
display(df_laptimes)

# 1. What was the average time each driver spent at the pit stop for each race?

In [0]:
# Average pit-stop duration (in milliseconds) per driver per race.
# The aggregate keeps Spark's default column name, "avg(milliseconds)",
# because later cells refer to it by that name.
df_avg_duration = (
    df_pitstops
    .groupBy("raceId", "driverId")
    .agg(avg("milliseconds"))
    .orderBy("raceId", "avg(milliseconds)")
)
display(df_avg_duration)

In [0]:
# The previous output began at raceId 1000 rather than 841.
# Sort on the integer value of raceId instead so the races appear in numeric
# order, then by average pit-stop time within each race.
df_avg_duration = df_avg_duration.sort(
    col("raceId").cast("int"),
    "avg(milliseconds)",
)
display(df_avg_duration)

# 2.  Rank the average time spent at the pit stop in order of who won each race

In [0]:
# Attach each driver's finishing position to their average pit-stop time,
# sort by race and then by finishing order (both compared as integers),
# and keep only the columns needed to read the ranking.
df_rank_avgpit = (
    df_avg_duration
    .join(df_results, on=["raceId", "driverId"], how="left")
    .orderBy(
        col("raceId").cast("int").asc(),
        col("positionOrder").cast("int").asc(),
    )
    .select("raceId", "driverId", "positionOrder", "avg(milliseconds)")
)

display(df_rank_avgpit)

# 3.  Insert the missing code (e.g: ALO for Alonso) for drivers based on the 'drivers' dataset

In [0]:
# Load the raw drivers table; the first row of the file supplies the column names.
df_driver = spark.read.option("header", True).csv("s3://columbia-gr5069-main/raw/drivers.csv")
display(df_driver)

In [0]:
# Fill in the driver code only where it is missing; codes already present in the
# data are kept as they are instead of being overwritten.
# A code counts as missing when it is null, empty, or the literal "\N"
# placeholder (assumed to be how this CSV marks missing values -- TODO confirm
# against the raw file).
# The generated code is the first three characters of the surname, upper-cased
# (e.g. "Alonso" -> "ALO").
df_driver = df_driver.withColumn(
    "code",
    when(
        col("code").isNull() | col("code").isin("", "\\N"),
        upper(substring(col("surname"), 1, 3)),
    ).otherwise(col("code")),
)
display(df_driver)


I take the first three characters of each driver's surname and capitalize them to create the code.

# 4. Who is the youngest and oldest driver for each race? Create a new column called “Age”

In [0]:
# Load the races table and keep only the columns used below:
# the race key, its season, its name, and the date it was held.
df_races = (
    spark.read.option("header", True)
    .csv("s3://columbia-gr5069-main/raw/races.csv")
    .select("raceId", "year", "name", "date")
)
display(df_races)

In [0]:
# One row per (race, driver) entry from the results, enriched with the driver's
# details and with the details of the race they took part in.
df_driver_age = (
    df_results
    .select("raceId", "driverId")
    .join(df_driver, on="driverId", how="left")
    .join(df_races, on="raceId", how="left")
)
display(df_driver_age)

In [0]:
# Parse the race date and the date of birth into real dates, then derive the
# driver's age on race day as the number of whole years between the two
# (months elapsed / 12, rounded down).
df_driver_age = (
    df_driver_age
    .withColumn("race_date", to_date(col("date")))
    .withColumn("dob", to_date(col("dob")))
    .withColumn("Age", floor(months_between(col("race_date"), col("dob")) / 12))
)
display(df_driver_age)

In [0]:
# Youngest and oldest age observed among the drivers of each race.
age_stats = df_driver_age.groupBy("raceId").agg(
    min("Age").alias("youngest_age"),
    max("Age").alias("oldest_age"),
)

# Bring the per-race extremes back onto the driver rows and keep the drivers
# whose age matches them. age_stats is derived from df_driver_age, so joining on
# the column *name* (rather than on df.raceId == other.raceId) avoids ambiguous
# self-join column references and leaves a single raceId column in the result.
# Drivers tied on age within a race are all returned.
youngest_drivers = (
    df_driver_age
    .join(age_stats, on="raceId", how="inner")
    .filter(col("Age") == col("youngest_age"))
)
oldest_drivers = (
    df_driver_age
    .join(age_stats, on="raceId", how="inner")
    .filter(col("Age") == col("oldest_age"))
)

display(youngest_drivers)
display(oldest_drivers)

In [0]:
# 5. For a given race, which driver has the most wins and losses?
