In [0]:
import mlflow
import boto3
import pandas as pd

In [0]:
# S3 client and the bucket that holds the raw CSV files
s3 = boto3.client('s3')
bucket = "columbia-gr5069-main"

# Logical table name -> object key inside the bucket
keys = dict(
    drivers="raw/drivers.csv",
    races="raw/races.csv",
    results="raw/results.csv",
    constructors="raw/constructors.csv",
)

# Loaded DataFrames, keyed by the same logical table names
dataframes = {}

# Fetch each object, parse its body as CSV, and report the resulting shape
for name, key in keys.items():
    response = s3.get_object(Bucket=bucket, Key=key)
    df = pd.read_csv(response['Body'])
    n_rows, n_cols = df.shape
    dataframes[name] = df
    print(f"Loaded {name} ({n_rows} rows, {n_cols} columns)")

# Bind each table to its own variable for the cells that follow
drivers_df, races_df, results_df, constructors_df = (
    dataframes[table] for table in ('drivers', 'races', 'results', 'constructors')
)

# Render each table in the notebook output
for frame in (drivers_df, races_df, results_df, constructors_df):
    display(frame)

In [0]:
# NOTE(review): OneHotEncoder is not used in this cell (encoding below uses pd.get_dummies).
# Kept in case later cells rely on it; consider moving it to the notebook's import cell.
from sklearn.preprocessing import OneHotEncoder

# Attach race, driver, and constructor attributes to each result row.
# Left joins never drop a result row. When both sides share a non-key column name,
# the left column keeps its name and the right-hand one receives the suffix.
merged_df = (
    results_df
    .merge(races_df, on='raceId', how='left', suffixes=('', '_race'))
    .merge(drivers_df, on='driverId', how='left', suffixes=('', '_driver'))
    .merge(constructors_df, on='constructorId', how='left', suffixes=('', '_constructor'))
)

# Keep only rows with a known finishing order. The explicit .copy() makes the filtered
# frame independent of the merge result, so the column assignments below do not
# trigger SettingWithCopyWarning.
merged_df = merged_df[merged_df['positionOrder'].notnull()].copy()

# Target: True when the finishing order is 10 or better
merged_df['top_10'] = merged_df['positionOrder'] <= 10

# Parse dates; values that cannot be parsed become NaT
merged_df['dob'] = pd.to_datetime(merged_df['dob'], errors='coerce')
merged_df['date'] = pd.to_datetime(merged_df['date'], errors='coerce')

# Calculate driver age at time of race, in completed calendar years.
# (Dividing the day difference by 365 ignores leap days and reports the next age
# several days before the actual birthday.)
# A missing date on either side yields NaN, which the dropna() below removes.
race_date = merged_df['date']
birth_date = merged_df['dob']
birthday_passed = (
    (race_date.dt.month > birth_date.dt.month)
    | (
        (race_date.dt.month == birth_date.dt.month)
        & (race_date.dt.day >= birth_date.dt.day)
    )
)
merged_df['driver_age'] = (
    race_date.dt.year - birth_date.dt.year - (~birthday_passed).astype(int)
)

# Select modeling features
# NOTE(review): 'nationality' is whichever un-suffixed column survives the merges above —
# presumably the driver's nationality; confirm against the input schemas.
features = [
    'grid',
    'constructorRef',
    'nationality',
    'driver_age',
    'year',
    'round'
]

# Drop rows with missing values in selected features or the target
model_df = merged_df[features + ['top_10']].dropna()

# One-hot encode categorical features; drop_first removes one level per feature
# to avoid perfectly collinear dummy columns
categorical = ['constructorRef', 'nationality']
model_df = pd.get_dummies(model_df, columns=categorical, drop_first=True)
model_df.head()