In [2]:
import fastf1
import pandas as pd
import os
import time
import logging
from datetime import datetime

In [3]:
CACHE_DIR = 'fastf1_cache'
SAVE_DIR = 'f1_data_csvs'
START_YEAR = 2018          # F1 timing/telemetry data generally available from 2018
END_YEAR = datetime.now().year
SESSIONS_TO_GET = ['FP1', 'FP2', 'FP3', 'Q', 'S', 'SQ', 'R'] # Common sessions (Sprint='S', Sprint Quali='SQ')
DELAY_SECONDS = 1         # Delay between processing sessions to be polite to API

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

try:
    fastf1.Cache.enable_cache(CACHE_DIR)
    logging.info(f"FastF1 cache enabled at: {CACHE_DIR}")
except Exception as e:
    logging.error(f"Error enabling FastF1 cache: {e}")


os.makedirs(SAVE_DIR, exist_ok=True)
logging.info(f"CSV data will be saved in: {SAVE_DIR}")

2025-05-17 12:41:06,240 - ERROR - Error enabling FastF1 cache: Cache directory does not exist! Please check for typos or create it first.
2025-05-17 12:41:06,241 - INFO - CSV data will be saved in: f1_data_csvs


In [None]:
import glob
# SESSIONS_TO_GET = ['R']
SESSIONS_TO_GET = ['FP1', 'FP2', 'FP3', 'Q', 'S', 'SQ', 'R']
TMP_PATH = 'airflow/tmp/fastf1-data'
year = 2020
round = 2
event = fastf1.get_event(year, round)
print(event["EventName"])
event_name = event['EventName']
event_round = event['RoundNumber']
logging.info(f"Processing Event: {year} - Round {event_round} - {event_name}")

lap_dfs = []
for session_name in SESSIONS_TO_GET:
    logging.debug(f"Attempting Session: {session_name}")
    session_identifier = f"{year}_{event_round:02d}_{event_name}_{session_name}"

    try:
        session = fastf1.get_session(year, event_name, session_name)

        session.load(laps=True, weather=True, messages=True, telemetry=True)
        logging.info(f"Loaded basic data for {session_identifier}")

        # precise telemetry is in car_data, pos_data, etc.
        # Check https://docs.fastf1.dev/core.html#fastf1.core.Session for more info
        os.makedirs(TMP_PATH, exist_ok=True)

        if hasattr(session, 'laps') and not session.laps.empty:
            lap_dfs.append(session.laps)
            logging.debug(f"Saved laps for {session_identifier}")

            sess_tel_dfs = []
            telemetry_saved = False
            for drv_id in session.drivers:
                try:
                    drv_laps = session.laps.pick_drivers(drv_id)
                    if not drv_laps.empty:
                        drv_abbr = drv_laps['Driver'].iloc[0]
                        drv_tel = drv_laps.get_telemetry()

                        if not drv_tel.empty:
                            drv_tel = drv_tel.merge(drv_laps[['LapNumber', 'Time']], on='Time', how='left')
                            drv_tel['DriverId'] = drv_id
                            drv_tel['Year'] = year
                            drv_tel['Round'] = event_round
                            drv_tel['Session'] = session

                            sess_tel_dfs.append(drv_tel)
                            logging.debug(f"Saved telemetry for driver {drv_abbr} in {session_identifier} with shape {drv_tel.shape}")
                            telemetry_saved = True
                        else:
                            logging.debug(f"No telemetry data returned for driver {drv_abbr} in {session_identifier}")
                except Exception as tel_ex:
                    logging.warning(f"Could not get/save telemetry for driver {drv_id} in {session_identifier}: {tel_ex}")
            if telemetry_saved:
                logging.info(f"Finished processing telemetry for {session_identifier}")
            else:
                logging.info(f"No telemetry saved for any driver in {session_identifier}")

            sess_tel_dfs_concat = pd.concat(sess_tel_dfs, ignore_index=True)
            sess_tel_dfs_concat.to_csv(os.path.join(TMP_PATH, f'telemetry_{session_identifier}.csv'))
    except Exception as e:
        pass

sess_lap_dfs_concat = pd.concat(lap_dfs, ignore_index=True)
sess_lap_dfs_concat.to_csv(os.path.join(TMP_PATH, f"laps_{year}_{round}.csv"))

telemetry_files = glob.glob(os.path.join(TMP_PATH, 'telemetry_*.csv'))
tel_dfs = []

for tel_file in telemetry_files:
    try:
        tel_file_split = os.path.splitext(os.path.basename(tel_file))[0].split('_')
        if int(tel_file_split[1]) == year and int(tel_file_split[2]) == round:
            df_tel = pd.read_csv(tel_file)
            tel_dfs.append(df_tel)

    except Exception as e:
        logging.warning(f"Could not normalize telemetry file {tel_file}: {e}")

df_concat = pd.concat(tel_dfs, ignore_index=True)
df_concat.drop(df_concat.columns[[0]], axis=1, inplace=True)
df_concat.to_csv(os.path.join(TMP_PATH, f'telemetry_{year}_{round}.csv'))

logging.info(f"Normalized telemetry saved for {event_name} with shape {df_concat.shape}")


2025-05-17 12:47:23,219 - INFO - Processing Event: 2020 - Round 2 - Styrian Grand Prix


Styrian Grand Prix


2025-05-17 12:48:02,530 - INFO - Normalized telemetry saved for Styrian Grand Prix with shape (2971902, 23)


In [6]:

# --- Main Loop ---
for year in range(START_YEAR, END_YEAR + 1):
    logging.debug(f"--- Processing Year: {year} ---", )
    try:
        schedule = fastf1.get_event_schedule(year)
        # Convert EventDate to datetime objects to filter past events if needed
        # schedule['EventDate'] = pd.to_datetime(schedule['EventDate']).dt.date
        # schedule = schedule[schedule['EventDate'] < datetime.now().date()] # Optional: Only process past events

    except Exception as e:
        logging.error(f"Could not get event schedule for {year}: {e}")
        continue

    for index, event in schedule.iterrows():
        event_name = event['EventName']
        event_round = event['RoundNumber']
        logging.info(f"Processing Event: {year} - Round {event_round} - {event_name}")

        for session_name in SESSIONS_TO_GET:
            logging.debug(f"Attempting Session: {session_name}")
            session_identifier = f"{year}_{event_round:02d}_{event_name}_{session_name}" # Unique ID for logging/paths
            session_save_path = os.path.join(SAVE_DIR, str(year), f"{event_round:02d}_{event_name}", session_name)

            # # Check if all expected CSVs exist for this session, skip if so
            # expected_files = ['laps.csv', 'results.csv', 'weather.csv', 'messages.csv']
            # csvs_exist = all(os.path.exists(os.path.join(session_save_path, fname)) for fname in expected_files)

            # # Check for at least one telemetry file (since driver list may change)
            # telemetry_files_exist = any(
            #     fname.startswith('telemetry_') and fname.endswith('.csv')
            #     for fname in os.listdir(session_save_path) if os.path.isdir(session_save_path)
            # ) if os.path.isdir(session_save_path) else False

            # if csvs_exist and telemetry_files_exist:
            #     logging.info(f"All CSVs already exist for {session_identifier}, skipping session.")
            #     continue
            try:
                session = fastf1.get_session(year, event_name, session_name)

                session.load(laps=True, weather=True, messages=True, telemetry=True)
                logging.info(f"Loaded basic data for {session_identifier}")

                os.makedirs(session_save_path, exist_ok=True)

                if hasattr(session, 'laps') and not session.laps.empty:
                    session.laps.to_csv(os.path.join(session_save_path, 'laps.csv'), index=False)
                    logging.debug(f"Saved laps for {session_identifier}")

                    logging.info(f"Loading telemetry for {session_identifier}...")
                    telemetry_saved = False
                    for drv_id in session.drivers: # Iterate through driver numbers
                        try:
                            drv_laps = session.laps.pick_drivers(drv_id)
                            if not drv_laps.empty:
                                drv_abbr = drv_laps['Driver'].iloc[0]
                                drv_tel = drv_laps.get_telemetry()

                                if not drv_tel.empty:
                                    drv_tel = drv_tel.merge(drv_laps[['LapNumber', 'SessionTime']], on='SessionTime', how='left')

                                    drv_tel.to_csv(os.path.join(session_save_path, f'telemetry_{drv_abbr}.csv'), index=False)
                                    logging.debug(f"Saved telemetry for driver {drv_abbr} in {session_identifier}")
                                    telemetry_saved = True
                                else:
                                     logging.debug(f"No telemetry data returned for driver {drv_abbr} in {session_identifier}")

                        except Exception as tel_ex:
                            logging.warning(f"Could not get/save telemetry for driver {drv_id} in {session_identifier}: {tel_ex}")
                    if telemetry_saved:
                         logging.info(f"Finished processing telemetry for {session_identifier}")
                    else:
                         logging.info(f"No telemetry saved for any driver in {session_identifier}")


                if hasattr(session, 'results') and not session.results.empty:
                    session.results.to_csv(os.path.join(session_save_path, 'results.csv'), index=False)
                    logging.debug(f"Saved results for {session_identifier}")

                if hasattr(session, 'weather_data') and not session.weather_data.empty:
                    session.weather_data.to_csv(os.path.join(session_save_path, 'weather.csv'), index=False)
                    logging.debug(f"Saved weather for {session_identifier}")

                if hasattr(session, 'messages') and not session.messages.empty:
                    session.messages.to_csv(os.path.join(session_save_path, 'messages.csv'), index=False)
                    logging.debug(f"Saved messages for {session_identifier}")

                logging.info(f"Successfully processed and saved data for {session_identifier}")

            # except fastf1.core.SessionNotAvailableError:
            #      logging.warning(f"Session {session_name} not available or does not exist for {year} {event_name}. Skipping.")
            except fastf1.core.DataNotLoadedError as e:
                logging.error(f"Data not loaded for {session_identifier}. Might be too recent or unavailable. Error: {e}")
            except ConnectionError as e:
                 logging.error(f"Connection error during {session_identifier}: {e}. Check network.")
                 time.sleep(10) # Longer sleep on connection error
            except Exception as e:
                # Catch other potential errors (API issues, unexpected data format, etc.)
                logging.error(f"An unexpected error occurred for {session_identifier}: {e.__class__.__name__} - {e}")

            finally:
                # Add a delay after processing each session regardless of success/failure
                logging.debug(f"Waiting {DELAY_SECONDS} seconds before next session...")
                time.sleep(DELAY_SECONDS)

    logging.info(f"--- Finished Processing Year: {year} ---")

logging.info("--- All Years Processed ---")

2025-05-05 22:45:06,566 - INFO - Processing Event: 2018 - Round 1 - Australian Grand Prix
core           INFO 	Loading data for Australian Grand Prix - Practice 1 [v3.5.3]
2025-05-05 22:45:06,576 - INFO - Loading data for Australian Grand Prix - Practice 1 [v3.5.3]
req            INFO 	Using cached data for session_info
2025-05-05 22:45:06,578 - INFO - Using cached data for session_info
req            INFO 	Using cached data for driver_info
2025-05-05 22:45:06,579 - INFO - Using cached data for driver_info
req            INFO 	Using cached data for session_status_data
2025-05-05 22:45:07,119 - INFO - Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
2025-05-05 22:45:07,120 - INFO - Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
2025-05-05 22:45:07,122 - INFO - Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
2025-05-05 22:4