# F1 Data Collection Notebook

This notebook allows you to interactively collect F1 race data using the enhanced FastF1 collector. You can specify the years, session types, and output directories, and inspect the collected data directly in the notebook.

In [1]:
import sys
import os
from pathlib import Path
import pandas as pd
import fastf1
from tqdm import tqdm

# Make the project importable from the notebook: the directory one level above
# the current working directory and its src/ sub-folder are appended to sys.path.
# NOTE(review): assumes the kernel's working directory sits directly below the
# repository root — confirm if the notebook is launched from elsewhere.
_repo_root = Path.cwd().parent
sys.path.extend([str(_repo_root / 'src'), str(_repo_root)])

from src.data_pipeline.fastf1_collector import F1DataCollector

## Configure Data Collection Parameters

In [2]:
# ---------------------------------------------------------------------------
# Collection settings — edit these values before running the cells below.
# ---------------------------------------------------------------------------
# Seasons to collect.
years = [2023, 2024]
# Session identifiers to collect (options: FP1, FP2, FP3, Q, R, S).
sessions = ['Q', 'R']
# Directory that receives the collected CSV files.
output_dir = 'data/raw'
# Directory used by FastF1 for its on-disk cache.
cache_dir = 'data/cache'
# When True, one CSV per event/session is written in addition to the season file.
save_individual = True

# Echo the active settings so the saved notebook records what was run.
print(
    f'Years: {years}',
    f'Sessions: {sessions}',
    f'Output directory: {output_dir}',
    f'Cache directory: {cache_dir}',
    sep='\n',
)

Years: [2023, 2024]
Sessions: ['Q', 'R']
Output directory: data/raw
Cache directory: data/cache


In [3]:
# Initialize the data collector and the FastF1 on-disk cache.
#
# FastF1 refuses to enable a cache in a directory that does not exist yet, so
# the directory is created up front; this keeps a fresh checkout
# (Restart Kernel -> Run All) from failing on this cell.
Path(cache_dir).mkdir(parents=True, exist_ok=True)

collector = F1DataCollector(cache_dir=cache_dir)
# NOTE(review): the collector may already enable the cache itself — confirm;
# the explicit call is kept from the original cell.
fastf1.Cache.enable_cache(cache_dir)

## Collect Data for Each Year and Session
The cell below collects data for each configured year and session type, then writes a CSV per season, optional CSVs per event/session, and one combined CSV to the output directory.

In [4]:
# Collect every configured season and persist the results as CSV files.
#
# Files written (all below `output_dir`):
#   season_<year>_data.csv                 one per season that returned data
#   events/<year>_<event>_<session>.csv    one per event/session (if save_individual)
#   seasons_<y1>_<y2>..._data.csv          all collected seasons concatenated
output_root = Path(output_dir)
events_root = output_root / 'events'
# Directory creation is loop-invariant, so do it once up front.
output_root.mkdir(parents=True, exist_ok=True)

all_data = []
for year in years:
    print(f'\nCollecting data for {year}...')
    season_data = collector.get_season_data(year, sessions)

    # A season with no rows would only produce an empty CSV and could break the
    # groupby below if the 'Event'/'Session' columns are absent, so skip it.
    # NOTE(review): assumes the collector signals "nothing collected" with None
    # or an empty DataFrame — confirm against F1DataCollector.get_season_data.
    if season_data is None or season_data.empty:
        print(f'No data returned for {year}; skipping.')
        continue
    all_data.append(season_data)

    # Save combined data for the season
    season_file = output_root / f'season_{year}_data.csv'
    season_data.to_csv(season_file, index=False)
    print(f'Saved season data to {season_file}')

    # Optionally save individual event/session files
    if save_individual:
        events_root.mkdir(parents=True, exist_ok=True)
        for (event, session), event_data in season_data.groupby(['Event', 'Session']):
            # Spaces are replaced so the event name forms a single filename token.
            event_clean = event.replace(' ', '_')
            event_file = events_root / f'{year}_{event_clean}_{session}.csv'
            event_data.to_csv(event_file, index=False)
            print(f'Saved {event} {session} data to {event_file}')

# Combine all data
if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)
    combined_file = output_root / f'seasons_{"_".join(map(str, years))}_data.csv'
    combined_data.to_csv(combined_file, index=False)
    print(f'Combined data saved to {combined_file}')
else:
    print('No data collected.')

INFO:src.data_pipeline.fastf1_collector:Fetching data for 2023 season



Collecting data for 2023...


INFO:src.data_pipeline.fastf1_collector:Fetching data for 2023 Pre-Season Testing Q
core           INFO 	Loading data for British Grand Prix - Qualifying [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for British Grand Prix - Qualifying [v3.5.3]
req            INFO 	Using cached data for session_info
INFO:fastf1.fastf1.req:Using cached data for session_info
req            INFO 	Using cached data for driver_info
INFO:fastf1.fastf1.req:Using cached data for driver_info
DEBUG:fastf1.ergast:Failed to parse timestamp '' in Ergastresponse.
req            INFO 	Using cached data for session_status_data
INFO:fastf1.fastf1.req:Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
INFO:fastf1.fastf1.req:Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
INFO:fastf1.fastf1.req:Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
INFO:fastf1.fastf1

Saved season data to data\raw\season_2023_data.csv
Saved Abu Dhabi Grand Prix Q data to data\raw\events\2023_Abu_Dhabi_Grand_Prix_Q.csv
Saved Abu Dhabi Grand Prix R data to data\raw\events\2023_Abu_Dhabi_Grand_Prix_R.csv
Saved Australian Grand Prix Q data to data\raw\events\2023_Australian_Grand_Prix_Q.csv
Saved Australian Grand Prix R data to data\raw\events\2023_Australian_Grand_Prix_R.csv
Saved Austrian Grand Prix Q data to data\raw\events\2023_Austrian_Grand_Prix_Q.csv
Saved Austrian Grand Prix R data to data\raw\events\2023_Austrian_Grand_Prix_R.csv
Saved Azerbaijan Grand Prix Q data to data\raw\events\2023_Azerbaijan_Grand_Prix_Q.csv
Saved Azerbaijan Grand Prix R data to data\raw\events\2023_Azerbaijan_Grand_Prix_R.csv
Saved Bahrain Grand Prix Q data to data\raw\events\2023_Bahrain_Grand_Prix_Q.csv
Saved Bahrain Grand Prix R data to data\raw\events\2023_Bahrain_Grand_Prix_R.csv
Saved Belgian Grand Prix Q data to data\raw\events\2023_Belgian_Grand_Prix_Q.csv
Saved Belgian Grand Pr

INFO:src.data_pipeline.fastf1_collector:Fetching data for 2024 season


Saved United States Grand Prix Q data to data\raw\events\2023_United_States_Grand_Prix_Q.csv
Saved United States Grand Prix R data to data\raw\events\2023_United_States_Grand_Prix_R.csv

Collecting data for 2024...


INFO:src.data_pipeline.fastf1_collector:Fetching data for 2024 Pre-Season Testing Q
core           INFO 	Loading data for Singapore Grand Prix - Qualifying [v3.5.3]
INFO:fastf1.fastf1.core:Loading data for Singapore Grand Prix - Qualifying [v3.5.3]
req            INFO 	No cached data found for session_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
INFO:fastf1.api:Fetching session info data...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
INFO:fastf1.fastf1.req:No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
INFO:fastf1.api:Fetching driver list...
req            INFO 	Data has been written to cache!
INFO:fastf1.fastf1.req:Data has been written to cache!
DEBUG:fastf1.ergast:Failed to parse timestamp '

Saved season data to data\raw\season_2024_data.csv
Saved Australian Grand Prix Q data to data\raw\events\2024_Australian_Grand_Prix_Q.csv
Saved Australian Grand Prix R data to data\raw\events\2024_Australian_Grand_Prix_R.csv
Saved Austrian Grand Prix Q data to data\raw\events\2024_Austrian_Grand_Prix_Q.csv
Saved Austrian Grand Prix R data to data\raw\events\2024_Austrian_Grand_Prix_R.csv
Saved Bahrain Grand Prix Q data to data\raw\events\2024_Bahrain_Grand_Prix_Q.csv
Saved Bahrain Grand Prix R data to data\raw\events\2024_Bahrain_Grand_Prix_R.csv
Saved British Grand Prix Q data to data\raw\events\2024_British_Grand_Prix_Q.csv
Saved British Grand Prix R data to data\raw\events\2024_British_Grand_Prix_R.csv
Saved Canadian Grand Prix Q data to data\raw\events\2024_Canadian_Grand_Prix_Q.csv
Saved Canadian Grand Prix R data to data\raw\events\2024_Canadian_Grand_Prix_R.csv
Saved Chinese Grand Prix Q data to data\raw\events\2024_Chinese_Grand_Prix_Q.csv
Saved Chinese Grand Prix R data to dat

## Inspect Collected Data
You can now load and inspect the collected data.

In [5]:
# Load the combined data for inspection.
# The path is rebuilt from the configuration so this cell does not depend on
# the collection cell having been run in the current kernel session.
combined_file = Path(output_dir) / f'seasons_{"_".join(map(str, years))}_data.csv'
if combined_file.exists():
    # low_memory=False makes pandas infer each column's dtype from the whole
    # file in a single pass instead of chunk by chunk. The saved run emitted a
    # warning on this read (most likely pandas' mixed-dtype DtypeWarning), which
    # this option avoids.
    df = pd.read_csv(combined_file, low_memory=False)
    display(df.head())
    print(f'Total records: {len(df)}')
else:
    print('Combined data file not found.')

  df = pd.read_csv(combined_file)


Unnamed: 0,Time,Driver,DriverNumber,LapTime,LapNumber,Stint,PitOutTime,PitInTime,Sector1Time,Sector2Time,...,Circuit_Lat,Circuit_Long,IsSoft,IsMedium,IsHard,IsIntermediate,IsWet,CompoundHardness,HasPitIn,HasPitOut
0,0 days 00:18:01.997000,VER,1,,1.0,1.0,0 days 00:15:56.201000,,,0 days 00:00:44.778000,...,1192.508301,4503.826172,True,False,False,False,False,1.0,False,True
1,0 days 00:19:35.532000,VER,1,0 days 00:01:33.535000,2.0,1.0,,,0 days 00:00:28.534000,0 days 00:00:36.657000,...,1192.508301,4503.826172,True,False,False,False,False,1.0,False,False
2,0 days 00:21:07.677000,VER,1,0 days 00:01:32.145000,3.0,1.0,,,0 days 00:00:28.441000,0 days 00:00:36.572000,...,1192.508301,4503.826172,True,False,False,False,False,1.0,False,False
3,0 days 00:23:45.559000,VER,1,,4.0,1.0,,,0 days 00:00:48.387000,0 days 00:01:09.690000,...,1192.508301,4503.826172,True,False,False,False,False,1.0,False,False
4,0 days 00:25:16.278000,VER,1,0 days 00:01:30.719000,5.0,1.0,,,0 days 00:00:28.109000,0 days 00:00:36.109000,...,1192.508301,4503.826172,True,False,False,False,False,1.0,False,False


Total records: 55072
