In [18]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [19]:
df = pd.read_csv("../data/f1_features_2025_enriched.csv")
print("Initial shape:", df.shape)
df.head()

Initial shape: (439, 12)


Unnamed: 0,date,season,round,circuit,driver,constructor,grid,position,qualifying_time,air_temp,track_temp,humidity
0,2024-03-02,2024,1,Bahrain Grand Prix,Max Verstappen,Red Bull Racing,1.0,1.0,89.179,18.2,22.2,48.0
1,2024-03-02,2024,1,Bahrain Grand Prix,Sergio Perez,Red Bull Racing,5.0,2.0,89.537,18.2,22.2,48.0
2,2024-03-02,2024,1,Bahrain Grand Prix,Carlos Sainz,Ferrari,4.0,3.0,89.507,18.2,22.2,48.0
3,2024-03-02,2024,1,Bahrain Grand Prix,Charles Leclerc,Ferrari,2.0,4.0,89.165,18.2,22.2,48.0
4,2024-03-02,2024,1,Bahrain Grand Prix,George Russell,Mercedes,3.0,5.0,89.485,18.2,22.2,48.0


In [20]:
# Check what's actually in qualifying_time
print("Unique values in qualifying_time column:")
print(df['qualifying_time'].unique()[:10])  # show first 10

# Force 'NaT' strings to NaN
df['qualifying_time'] = df['qualifying_time'].replace('NaT', pd.NA)

# Convert to timedelta
df['qualifying_time'] = pd.to_timedelta(df['qualifying_time'], errors='coerce')

# Convert to seconds
df['qualifying_time'] = df['qualifying_time'].apply(lambda x: x.total_seconds() if pd.notnull(x) else None)

# Preview the cleaned column
df[['driver', 'qualifying_time']].head()


Unique values in qualifying_time column:
[89.179 89.537 89.507 89.165 89.485 89.614 89.71  89.683 89.542 89.965]


Unnamed: 0,driver,qualifying_time
0,Max Verstappen,0.0
1,Sergio Perez,0.0
2,Carlos Sainz,0.0
3,Charles Leclerc,0.0
4,George Russell,0.0


In [21]:
required_cols = ['position', 'grid', 'qualifying_time', 'air_temp', 'track_temp', 'humidity']
df = df.dropna(subset=required_cols)
print("✅ Rows after dropping NaNs:", len(df))


✅ Rows after dropping NaNs: 425


In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
import xgboost as xgb
import joblib
import numpy as np

# Load enriched datasets
print("📥 Loading enriched datasets...")
df_2024 = pd.read_csv("../data/f1_features_2024_with_features.csv")
df_2025 = pd.read_csv("../data/f1_features_2025_enriched.csv")

# Tag the seasons
df_2024['season'] = 2024
df_2025['season'] = 2025

# Combine into one dataset
df = pd.concat([df_2024, df_2025], ignore_index=True)

# Convert qualifying_time to seconds (if needed)
if df['qualifying_time'].dtype == 'object':
    df['qualifying_time'] = pd.to_timedelta(df['qualifying_time'], errors='coerce')
    df['qualifying_time'] = df['qualifying_time'].dt.total_seconds()

# Drop rows with missing values in important columns
required_cols = ['position', 'grid', 'qualifying_time', 'driver_form', 'constructor_form', 'circuit_encoded']
df.dropna(subset=required_cols, inplace=True)
print(f"✅ Rows after dropping NaNs: {len(df)}")

# Assign sample weights (2025 = 1.0, 2024 weighted by round)
def assign_weight(row):
    if row['season'] == 2025:
        return 1.0
    elif row['season'] == 2024:
        rnd = row['round']
        if rnd <= 3:
            return 0.8
        elif rnd <= 6:
            return 0.5
        elif rnd <= 9:
            return 0.2
        else:
            return 0.0
    return 1.0

df['sample_weight'] = df.apply(assign_weight, axis=1)
df = df[df['sample_weight'] > 0.0]

# Features and target
features = [
    'grid', 'driver_form', 'constructor_form',
    'circuit_encoded', 'grid_advantage', 'qualifying_time',
    'air_temp', 'track_temp', 'humidity'
]
X = df[features]
y = df['position']
weights = df['sample_weight']

# Train/test split
X_train, X_test, y_train, y_test, weights_train, weights_test = train_test_split(
    X, y, weights, test_size=0.2, random_state=42
)

# Train model
model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=4,
    random_state=42
)
model.fit(X_train, y_train, sample_weight=weights_train)

# Evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("\n📊 Evaluation Metrics:")
print(f"MAE:  {mae:.2f}")
print(f"RMSE: {rmse:.2f}")

# Save model
joblib.dump(model, "../models/f1_position_model_2025.pkl")
print("💾 Model saved as 'f1_position_model_2025.pkl'")


📥 Loading enriched datasets...
✅ Rows after dropping NaNs: 402

📊 Evaluation Metrics:
MAE:  1.38
RMSE: 1.94
💾 Model saved as 'f1_position_model_2025.pkl'
