# F1 Race Time Prediction

In [2]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import make_scorer, mean_squared_error
import numpy as np
import joblib
import datetime

### Dataset

In [6]:
# 1. Load dataset
# The CSV location is relative to the notebook's working directory.
dataset_path = "../Dataset/winners_f1_cleaned.csv"
df = pd.read_csv(dataset_path)

# Preview the first rows as the cell output
df.head()

Unnamed: 0,Date,Continent,Grand-Prix,Circuit,Winner-Name,Team,Time,Laps,Year
0,1950-05-13,Europe,Great Britain,Silverstone Circuit,Nino Farina,Alfa Romeo,2025-09-11 02:13:23,70,1950
1,1950-05-21,Europe,Monaco,Circuit de Monaco,Juan Manuel Fangio,Alfa Romeo,2025-09-11 03:13:18,100,1950
2,1950-05-30,North America,United States,Indianapolis Motor Speedway,Johnnie Parsons,Kurtis Kraft Offenhauser,2025-09-11 02:46:55,138,1950
3,1950-06-04,Europe,Switzerland,Circuit Bremgarten,Nino Farina,Alfa Romeo,2025-09-11 02:02:53,42,1950
4,1950-06-18,Europe,Belgium,Circuit de Spa Francorchamps,Juan Manuel Fangio,Alfa Romeo,2025-09-11 02:47:26,35,1950


In [7]:
# Convert Time hh:mm:ss format into time in seconds.
# Only the hour/minute/second components are used; any date part of the
# parsed timestamp is ignored.
df["Time"] = pd.to_datetime(df["Time"])
time_parts = df["Time"].dt
df["Time-Seconds"] = (
    time_parts.hour.astype(int) * 3600
    + time_parts.minute.astype(int) * 60
    + time_parts.second.astype(int)
)

In [8]:
# 2. Define the features (X) and target (y)
feature_columns = ["Continent", "Team", "Laps", "Year"]
target_column = "Time-Seconds"

X = df[feature_columns]  # model inputs
y = df[target_column]    # race time in seconds, derived from the Time column

In [9]:
# 3. Preprocessing and encoding categorical data
categorical = ["Continent", "Team"]
numeric = ["Laps", "Year"]

# Categorical columns are one-hot encoded; categories unseen during fit are
# encoded as all zeros (handle_unknown="ignore") instead of raising.
# Numeric columns are forwarded unchanged by naming them explicitly, so the
# `numeric` list is actually used and any other column that ends up in the
# input frame is dropped rather than silently fed to the regressor.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numeric),
    ],
    remainder="drop",
)


In [10]:
# 4. Split dataset into train and test
# 30% of the rows are held out for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Model

In [11]:
# 5. Build model
# The pipeline applies the column preprocessing first, then fits a linear
# regression on the transformed features.
regression_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(regression_steps)

# Fit model
model.fit(X_train, y_train)

In [12]:
# 6. Evaluation
# Score the fitted pipeline on the held-out test split.
y_pred = model.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
# where it raises a TypeError. np.sqrt works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE={rmse:.2f}\nR²={r2:.3f}")

RMSE=1091.24
R²=0.608




In [13]:
# 7. Save model
# The fitted pipeline (preprocessing + regressor) is written to the current
# working directory; joblib.dump returns the list of files it wrote.
model_path = "f1_model.pkl"
joblib.dump(model, model_path)

['f1_model.pkl']

### Prediction

In [14]:
# Sample input for 2026: a single row with the same feature columns the
# model was trained on.
next_year_data = pd.DataFrame(
    {
        "Continent": ["Europe"],
        "Team": ["Ferrari"],
        "Laps": [70],
        "Year": [2026],
    }
)

# Predict (the model returns one value per input row; take the only one)
predicted = model.predict(next_year_data)
prediction = predicted[0]

# Convert seconds into hh:mm:ss format (fractional seconds are truncated)
pred_seconds = prediction
duration = datetime.timedelta(seconds=int(pred_seconds))
time_str = str(duration)

print("Predicted race time:", time_str)

Predicted race time: 1:15:28


In [17]:
# Average historical race time for the same scenario as the sample
# prediction (for comparison with the model's output).
mask = (
    (df["Continent"] == "Europe") &
    (df["Team"] == "Ferrari") &
    (df["Laps"] == 70) &
    (df["Year"] < 2026)
)

subset = df[mask]

# The mean of an empty selection is NaN, and int(NaN) raises ValueError,
# so report "n/a" instead of crashing when no historical race matches.
mean_seconds = subset["Time-Seconds"].mean()
if pd.isna(mean_seconds):
    mask_average = "n/a"
else:
    # Fractional seconds are truncated before formatting as h:mm:ss.
    mask_average = str(datetime.timedelta(seconds=int(mean_seconds)))
print(f"Average race time: {mask_average}")

Average race time: 1:38:10
