# Scraping

In [42]:
# Import libraries and declare API URL
import sys
import os
# Make the parent directory importable. Presumably that is where the project-local
# `lib` package lives (confirm against the repository layout). This line has to run
# before the `lib.scraper` import below.
sys.path.append(os.path.abspath(".."))

# get_json is a project-local helper. This notebook calls it with a URL and, in some
# places, a `params` dict; its exact behaviour (caching, retries, error handling) is
# not visible from here.
from lib.scraper import get_json
import pandas as pd
from tqdm import tqdm

# Base URL that every request in this notebook is built from (OpenF1 API, v1).
API_URL = "https://api.openf1.org/v1"

In [43]:
# Get race sessions from 2023 to 2025
sessions = get_json(f"{API_URL}/sessions?session_type=Race")

# Keep only entries whose "year" falls inside the inclusive 2023-2025 window.
# An entry without a "year" key is treated as year 0 and therefore filtered out.
race_sessions = list(
    filter(lambda entry: 2023 <= entry.get("year", 0) <= 2025, sessions)
)

In [44]:
def _find_stint_tyre(driver_stints, lap_number):
    """Return ``(compound, tyre_age)`` for the stint that covers ``lap_number``.

    A stint is only considered when both of its ``lap_start`` and ``lap_end``
    bounds are present; a stint missing either bound cannot be range-checked
    and is skipped. If several stints cover the lap, the last one in
    ``driver_stints`` wins. Returns ``(None, None)`` when nothing matches or
    when ``lap_number`` itself is ``None``.
    """
    compound, tyre_age = None, None
    if lap_number is None:
        return compound, tyre_age

    for stint in driver_stints:
        lap_start = stint["lap_start"]
        lap_end = stint["lap_end"]
        # Both bounds are required for the range check below; comparing None
        # with an int raises TypeError.
        if lap_start is None or lap_end is None:
            continue
        if lap_start <= lap_number <= lap_end:
            compound = stint.get("compound")
            # Laps completed in this stint, counting the stint's first lap as 1.
            # NOTE(review): any wear the tyre set had before the stint began is
            # not added here — confirm whether that is intended.
            tyre_age = lap_number - lap_start + 1
    return compound, tyre_age


# Function to collect lap data from session
def collect_lap_data(session):
    """Build one flat row per lap of a session, annotated with tyre information.

    Fetches the session's laps and tyre stints via ``get_json`` and, for every
    lap, looks up the stint of the same driver whose lap range contains it.

    Args:
        session: Mapping with at least ``"session_key"`` and ``"year"``. The
            track name is read from ``"circuit_short_name"``, falling back to
            ``"circuit_name"`` and then to an empty string.

    Returns:
        A list of dicts with the keys ``year``, ``track``, ``session_key``,
        ``driver_number``, ``lap_number``, ``is_pit_out_lap``,
        ``lap_duration``, ``tyre_compound`` and ``tyre_age_laps``. The two
        tyre fields are ``None`` when no stint with both bounds set covers
        the lap.

    Raises:
        KeyError: If ``session`` lacks ``"session_key"``/``"year"``, a lap
            lacks ``"driver_number"``/``"lap_number"``, or a stint lacks
            ``"driver_number"``/``"lap_start"``/``"lap_end"``.
    """
    session_key = session["session_key"]
    year = session["year"]
    track = session.get("circuit_short_name", session.get("circuit_name", ""))

    # Laps
    laps = get_json(f"{API_URL}/laps", params={"session_key": session_key})

    # Tyre stints (includes tyres used from earlier sessions)
    stints = get_json(f"{API_URL}/stints", params={"session_key": session_key})

    # Group stints by driver_number for direct lookup per lap
    stints_by_driver = {}
    for stint in stints:
        stints_by_driver.setdefault(stint["driver_number"], []).append(stint)

    rows = []
    for lap in laps:
        dn = lap["driver_number"]
        lap_number = lap["lap_number"]

        tyre_compound, tyre_age = _find_stint_tyre(
            stints_by_driver.get(dn, []), lap_number
        )

        rows.append({
            "year": year,
            "track": track,
            "session_key": session_key,
            "driver_number": dn,
            "lap_number": lap_number,
            "is_pit_out_lap": lap.get("is_pit_out_lap"),
            "lap_duration": lap.get("lap_duration"),
            "tyre_compound": tyre_compound,
            "tyre_age_laps": tyre_age
        })

    return rows

In [45]:
print(f"Found {len(race_sessions)} race sessions from {2023}-{2025}")

# Flatten the per-session lap rows into a single list (with a progress bar over
# sessions), then build the DataFrame once from the complete list.
all_rows = [
    lap_row
    for race in tqdm(race_sessions, desc="Processing sessions")
    for lap_row in collect_lap_data(race)
]

df = pd.DataFrame(all_rows)
df

Found 88 race sessions from 2023-2025


Processing sessions: 100%|██████████| 88/88 [00:00<00:00, 190.81it/s]


Unnamed: 0,year,track,session_key,driver_number,lap_number,is_pit_out_lap,lap_duration,tyre_compound,tyre_age_laps
0,2023,Sakhir,7953,1,1,False,,SOFT,1.0
1,2023,Sakhir,7953,11,1,False,,SOFT,1.0
2,2023,Sakhir,7953,16,1,False,,SOFT,1.0
3,2023,Sakhir,7953,31,1,False,,SOFT,1.0
4,2023,Sakhir,7953,27,1,False,,SOFT,1.0
...,...,...,...,...,...,...,...,...,...
84080,2025,Yas Marina Circuit,9839,27,58,False,88.107,MEDIUM,16.0
84081,2025,Yas Marina Circuit,9839,55,58,False,90.046,HARD,40.0
84082,2025,Yas Marina Circuit,9839,22,58,False,89.566,MEDIUM,26.0
84083,2025,Yas Marina Circuit,9839,12,58,False,89.630,MEDIUM,26.0


# Cleaning

In [None]:
# Backfilling lap_duration
# Only rows with lap_number <= 2 take part. Within each (session_key, driver_number)
# group, a missing duration takes the next non-null value from a later row of that group.
opening_laps = df["lap_number"] <= 2
backfilled = (
    df.loc[opening_laps]
    .groupby(["session_key", "driver_number"])["lap_duration"]
    .bfill()
)
df.loc[opening_laps, "lap_duration"] = backfilled
df

Unnamed: 0,year,track,session_key,driver_number,lap_number,is_pit_out_lap,lap_duration,tyre_compound,tyre_age_laps
0,2023,Sakhir,7953,1,1,False,97.974,SOFT,1.0
1,2023,Sakhir,7953,11,1,False,98.862,SOFT,1.0
2,2023,Sakhir,7953,16,1,False,98.750,SOFT,1.0
3,2023,Sakhir,7953,31,1,False,100.408,SOFT,1.0
4,2023,Sakhir,7953,27,1,False,100.720,SOFT,1.0
...,...,...,...,...,...,...,...,...,...
84080,2025,Yas Marina Circuit,9839,27,58,False,88.107,MEDIUM,16.0
84081,2025,Yas Marina Circuit,9839,55,58,False,90.046,HARD,40.0
84082,2025,Yas Marina Circuit,9839,22,58,False,89.566,MEDIUM,26.0
84083,2025,Yas Marina Circuit,9839,12,58,False,89.630,MEDIUM,26.0


In [47]:
# Remove 2025 Spa due to wet race

is_2025 = df["year"].eq(2025)
is_spa = df["track"].eq("Spa-Francorchamps")

# Drop the matching rows from df in place, addressed by index label
spa_2025_rows = df.loc[is_2025 & is_spa].index
df.drop(index=spa_2025_rows, inplace=True)

In [48]:
# Remove 2025 Melbourne due to wet race

in_2025 = df["year"].eq(2025)
at_melbourne = df["track"].eq("Melbourne")

# Drop the matching rows from df in place, addressed by index label
melbourne_2025_rows = df.loc[in_2025 & at_melbourne].index
df.drop(index=melbourne_2025_rows, inplace=True)

In [49]:
# Remove outliers from 2023 Melbourne, lap 8 red flag and lap 54 until end of race

in_2023 = df["year"].eq(2023)
at_melbourne = df["track"].eq("Melbourne")
# Lap 8 exactly, plus every lap from 54 onwards
affected_lap = df["lap_number"].eq(8) | df["lap_number"].ge(54)

# Drop the matching rows from df in place, addressed by index label
disrupted_rows = df.loc[in_2023 & affected_lap & at_melbourne].index
df.drop(index=disrupted_rows, inplace=True)

In [None]:
# Remove 2024 Suzuka lap outliers (lap_duration of 250 seconds or more)

in_2024 = df["year"].eq(2024)
at_suzuka = df["track"].eq("Suzuka")
# Rows with a missing lap_duration compare as False and are therefore kept
too_slow = df["lap_duration"].ge(250)

# Drop the matching rows from df in place, addressed by index label
slow_suzuka_rows = df.loc[in_2024 & too_slow & at_suzuka].index
df.drop(index=slow_suzuka_rows, inplace=True)

In [54]:
# Remove pit out laps
# Only rows whose flag equals True are removed; False or missing flags are kept.
pit_out_rows = df.loc[df["is_pit_out_lap"].eq(True)].index
df.drop(index=pit_out_rows, inplace=True)

In [55]:
# Save to CSV
# The row index is left out of the file (index=False).
output_path = "../data/f1_lap_data_2023_to_2025.csv"
df.to_csv(output_path, index=False)