In [0]:
import mlflow
import boto3
import pandas as pd

In [0]:
# --- Load the raw CSV inputs from S3 into pandas -----------------------------
# An S3 client plus the bucket that holds the raw files.
s3 = boto3.client('s3')
bucket = "columbia-gr5069-main"

# Short dataset name -> object key inside the bucket.
keys = {
    "races": "raw/races.csv",
    "results": "raw/results.csv",
}

# Filled in below: short dataset name -> pandas DataFrame.
dataframes = {}

for name, key in keys.items():
    # Fetch the object and parse its streamed body as CSV.
    obj = s3.get_object(Bucket=bucket, Key=key)
    df = dataframes[name] = pd.read_csv(obj['Body'])
    # Report the shape of each frame as it is loaded.
    print(f"Loaded {name} ({df.shape[0]} rows, {df.shape[1]} columns)")

# Convenient direct handles for the two loaded frames.
races_df, results_df = dataframes['races'], dataframes['results']

# Render both frames (races first, then results) for a visual check.
display(races_df)
display(results_df)

In [0]:
from pyspark.sql.functions import col

# --- Build the Spark modelling table -----------------------------------------
# Turn both pandas frames into Spark DataFrames.
# NOTE(review): `spark` is not defined in the visible cells — presumably the
# session object supplied by the notebook runtime; confirm.
races_df_spark = spark.createDataFrame(races_df)
results_df_spark = spark.createDataFrame(results_df)

# Only the join key and "year" are taken from the races table.
race_years = races_df_spark.select("raceId", "year")

# A row is kept when "position" is neither null nor the literal string '\N'.
has_valid_position = (col("position").isNotNull()) & (col("position") != '\\N')

# One pipeline: attach "year" to each result, keep the needed columns,
# drop rows with an invalid position, then cast "position" to an integer.
data_df = (
    results_df_spark
    .join(race_years, on="raceId", how="inner")
    .select("grid", "constructorId", "driverId", "year", "position")
    .filter(has_valid_position)
    .withColumn("position", col("position").cast("int"))
)

# Print the first five rows as a quick check.
data_df.show(5)

Question 1: Create two (2) new tables in your own database where you'll store the predictions from each model for this exercise.