In [1]:
from __future__ import annotations
from pathlib    import Path

import numpy  as np
import pandas as pd

# Seed NumPy's legacy global RNG once so any random step below is repeatable.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Notebook display limits, applied in a single call as (option, value) pairs.
pd.set_option(
    'display.max_rows', 50,
    'display.max_columns', 120,
    'display.width', 140,
)

In [2]:
# Years to keep when building the session reference table; None means every
# year found in the sessions table is used.
TARGET_YEARS: tuple[int, ...] | None = None
# NOTE(review): not referenced by the cells visible here — presumably selects
# which kinds of session are in scope; confirm against the rest of the notebook.
SESSION_SCOPE = 'ALL'
# When True, laps_all.csv is loaded and type-coerced as well; otherwise laps_df is None.
USE_LAPS = False
# Directory holding the "*_all.csv" exports read by the loading cell.
DATA_DIR = Path('../data/openf1_full')

In [3]:
from pandas.api.types import is_datetime64_any_dtype as is_dt

# Names of pandas' nullable integer dtypes (capital "I"), which can hold missing values.
INT64 = 'Int64'
INT32 = 'Int32'

def as_int(df: pd.DataFrame, cols, kind = INT64):
    """Cast the listed columns of ``df`` to the integer dtype ``kind``, in place.

    Names in ``cols`` that are not columns of ``df`` are skipped. Values that
    cannot be parsed as numbers are coerced to missing before the cast.
    Returns ``None``; ``df`` itself is modified.
    """
    present = [name for name in cols if name in df.columns]
    for name in present:
        numeric = pd.to_numeric(df[name], errors= 'coerce')
        df[name] = numeric.astype(kind)

def as_float(df: pd.DataFrame, cols):
    """Coerce the listed columns of ``df`` to a numeric dtype, in place.

    Names in ``cols`` that are not columns of ``df`` are skipped. Unparseable
    values become missing. The resulting dtype is whatever ``pd.to_numeric``
    picks for the data. Returns ``None``.
    """
    for name in (c for c in cols if c in df.columns):
        df[name] = pd.to_numeric(df[name], errors= 'coerce')

def as_dt_utc(df: pd.DataFrame, cols):
    """Convert the listed columns of ``df`` to timezone-aware UTC datetimes, in place.

    Names in ``cols`` that are not columns of ``df`` are skipped. Values that
    cannot be parsed become ``NaT``. Returns ``None``; ``df`` is modified.

    Columns that already hold datetimes are converted too: tz-naive values
    are interpreted as UTC and tz-aware values are converted to UTC. Skipping
    them would leave naive or non-UTC columns as they were, so the result
    would not always be UTC. Re-running on a column that is already UTC
    leaves its values unchanged.
    """
    for c in cols:
        if c in df.columns:
            # NOTE(review): on pandas >= 2.0 the string format is inferred from
            # the first value; rows in a different format may be coerced to NaT.
            # Consider format='ISO8601' if the installed pandas supports it.
            df[c] = pd.to_datetime(df[c], errors= 'coerce', utc = True)

def upper_str(df: pd.DataFrame, cols):
    """Normalise the listed columns of ``df`` to trimmed, upper-case strings, in place.

    Each present column is cast to the pandas ``'string'`` dtype, stripped of
    surrounding whitespace and upper-cased. Names in ``cols`` that are not
    columns of ``df`` are skipped. Returns ``None``.
    """
    for name in cols:
        if name not in df.columns:
            continue
        as_text = df[name].astype('string')
        df[name] = as_text.str.strip().str.upper()

In [5]:
# Read each raw table from its CSV export under DATA_DIR.
sessions_df       = pd.read_csv(DATA_DIR.joinpath("sessions_all.csv"))
stints_df         = pd.read_csv(DATA_DIR.joinpath("stints_all.csv"))
pit_df            = pd.read_csv(DATA_DIR.joinpath("pit_all.csv"))
weather_df        = pd.read_csv(DATA_DIR.joinpath("weather_all.csv"))
starting_grid_df  = pd.read_csv(DATA_DIR.joinpath("starting_grid_all.csv"))
session_result_df = pd.read_csv(DATA_DIR.joinpath("session_result_all.csv"))
race_control_df   = pd.read_csv(DATA_DIR.joinpath("race_control_all.csv"))

# Lap data is optional: laps_df stays None unless USE_LAPS is switched on.
if USE_LAPS:
    laps_df = pd.read_csv(DATA_DIR.joinpath("laps_all.csv"))
else:
    laps_df = None

In [7]:
# Normalise column dtypes table by table. Each helper edits its frame in
# place and skips any listed column the frame does not have.

# sessions: session_key is the join key shared with every other table, so it
# gets the same Int64 dtype those tables use for it; the other ids stay Int32.
as_int(sessions_df, ["session_key"], kind = INT64)
as_int(sessions_df, ["meeting_key", "year"], kind = INT32)
as_dt_utc(sessions_df, ["date_start", "date_end", "session_start_utc", "session_end_utc"])
upper_str(sessions_df, ["session_name", "session_type", "country_name"])

# stints
as_int(stints_df, ["session_key", "driver_number"], kind = INT64)
as_int(stints_df, ["stint_number", "lap_start", "lap_end"], kind = INT32)
upper_str(stints_df, ["compound"])

# pit stops
as_int(pit_df, ["session_key", "driver_number"], kind = INT64)
as_int(pit_df, ["lap_number"], kind = INT32)
as_dt_utc(pit_df, ["date"])

# weather samples
as_int(weather_df, ["session_key"], kind = INT64)
as_dt_utc(weather_df, ["date"])
as_float(weather_df, ["rainfall", "wind_direction"])

# starting grid
as_int(starting_grid_df, ["session_key", "driver_number"], kind = INT64)
as_int(starting_grid_df, ["position"], kind = INT32)

# session results
as_int(session_result_df, ["session_key", "driver_number"], kind = INT64)
as_int(session_result_df, ["position", "number_of_laps"], kind = INT32)

# race control messages
as_int(race_control_df, ["session_key", "driver_number"], kind = INT64)
as_int(race_control_df, ["lap_number", "sector"], kind = INT32)
as_dt_utc(race_control_df, ["date"])
upper_str(race_control_df, ["category", "flag", "message", "scope"])

# laps (optional): key columns are Int64 and lap_number is Int32, matching the
# dtype used for lap_number in the pit and race-control tables.
if USE_LAPS and laps_df is not None:
    as_int(laps_df, ["session_key", "driver_number"], kind = INT64)
    as_int(laps_df, ["lap_number"], kind = INT32)
    as_dt_utc(laps_df, ["date"])


In [8]:
# Registry of the always-loaded tables, keyed by a short name.
tables = dict(
    sessions       = sessions_df,
    stints         = stints_df,
    pit            = pit_df,
    weather        = weather_df,
    starting_grid  = starting_grid_df,
    session_result = session_result_df,
    race_control   = race_control_df,
)

# One row per table with its row and column counts, ordered by table name.
table_counts = (
    pd.DataFrame(
        [(name, frame.shape[0], frame.shape[1]) for name, frame in tables.items()],
        columns = ['table', 'rows', 'cols'],
    )
    .sort_values(by = 'table')
    .reset_index(drop = True)
)

table_counts

Unnamed: 0,table,rows,cols
0,pit,3033,6
1,race_control,6901,10
2,session_result,1836,11
3,sessions,323,14
4,starting_grid,298,5
5,stints,4881,8
6,weather,12179,10


In [9]:
# For each table, count the rows that repeat an earlier row on the columns
# expected to identify a row uniquely.
uniques = pd.DataFrame([
    {
        'table': name,
        'key_cols': ','.join(key),
        'duplicates': int(frame.duplicated(key).sum()),
    }
    for name, frame, key in (
        ('stints',        stints_df,        ['session_key', 'driver_number', 'stint_number']),
        ('starting_grid', starting_grid_df, ['session_key', 'driver_number']),
        ('pit',           pit_df,           ['session_key', 'driver_number', 'lap_number']),
    )
])

uniques

Unnamed: 0,table,key_cols,duplicates
0,stints,"session_key,driver_number,stint_number",0
1,starting_grid,"session_key,driver_number",0
2,pit,"session_key,driver_number,lap_number",0


In [10]:
# Every session_key present in the sessions table, as a set of plain integers.
sessions_keys = set(
    pd.to_numeric(sessions_df['session_key'], errors = 'coerce')
    .dropna()
    .astype(int)
)

def missing_keys(df: pd.DataFrame) -> int:
    """Count the distinct ``session_key`` values in ``df`` that the sessions table lacks.

    Non-numeric and missing keys are ignored. Returns ``pd.NA`` when ``df``
    has no ``session_key`` column.
    """
    if 'session_key' in df.columns:
        numeric = pd.to_numeric(df['session_key'], errors = 'coerce')
        found = set(numeric.dropna().astype(int))
        return len(found.difference(sessions_keys))
    return pd.NA

# One row per child table with its number of unmatched session keys.
key_consistency = pd.DataFrame([
    {'table': name, 'missing_session_keys': missing_keys(frame)}
    for name, frame in (
        ('stints',         stints_df),
        ('pit',            pit_df),
        ('weather',        weather_df),
        ('starting_grid',  starting_grid_df),
        ('session_result', session_result_df),
        ('race_control',   race_control_df),
    )
])

key_consistency

Unnamed: 0,table,missing_session_keys
0,stints,0
1,pit,0
2,weather,0
3,starting_grid,0
4,session_result,0
5,race_control,0


In [11]:
# Years in scope: every year found in the data unless TARGET_YEARS restricts it.
if TARGET_YEARS is None:
    target_years = sorted(sessions_df['year'].dropna().unique().tolist())
else:
    target_years = list(TARGET_YEARS)

# Slim session reference table for those years, ordered by year then session key.
sessions_ref = (
    sessions_df
    .loc[
        sessions_df['year'].isin(target_years),
        ['session_key', 'meeting_key', 'year', 'session_name', 'date_start'],
    ]
    .sort_values(by = ['year', 'session_key'])
    .reset_index(drop = True)
)
session_keys = sessions_ref['session_key'].to_list()
sessions_per_year = sessions_ref.groupby('year').size().reset_index(name = 'n')
len(sessions_ref), len(session_keys), sessions_per_year.head()

(323,
 323,
    year    n
 0  2023  113
 1  2024  123
 2  2025   87)

In [12]:
# Statistics to compute per session for each weather column.
wx_metrics = dict(
    air_temperature   = ['mean', 'median'],
    track_temperature = ['mean', 'median'],
    wind_speed        = ['mean'],
    humidity          = ['mean'],
    pressure          = ['mean'],
    rainfall          = ['sum', 'mean'],
)

# One "<column>_<stat>" output column per (column, statistic) pair.
wx_agg = (
    weather_df
    .groupby('session_key')
    .agg(**{
        f"{col}_{stat}": pd.NamedAgg(column = col, aggfunc = stat)
        for col, stats in wx_metrics.items()
        for stat in stats
    })
    .reset_index()
)

# First and last sample timestamp and the number of dated samples per session.
wx_span = (
    weather_df
    .groupby('session_key')
    .agg(
        wx_start = ('date', 'min'),
        wx_end   = ('date', 'max'),
        wx_rows  = ('date', 'count'),
    )
    .reset_index()
)

weather_agg = pd.merge(wx_agg, wx_span, how = 'left', on = 'session_key')
weather_agg.head()

Unnamed: 0,session_key,air_temperature_mean,air_temperature_median,track_temperature_mean,track_temperature_median,wind_speed_mean,humidity_mean,pressure_mean,rainfall_sum,rainfall_mean,wx_start,wx_end,wx_rows
0,7779,26.091892,26.0,31.792568,31.6,1.772297,57.790541,1010.92973,0,0.0,2023-03-19 16:01:59.956000+00:00,2023-03-19 18:29:00.468000+00:00,148
1,7787,17.44955,17.5,30.13964,30.95,1.127027,54.157658,1018.366667,0,0.0,2023-04-02 04:01:37.633000+00:00,2023-04-02 07:42:38.058000+00:00,221
2,7953,27.431677,27.1,31.011801,30.6,0.68323,21.496894,1016.863975,0,0.0,2023-03-05 14:01:47.286000+00:00,2023-03-05 16:41:47.909000+00:00,161
3,9069,22.20122,22.2,36.831707,35.9,0.730488,58.280488,1008.90122,2,0.02439,2023-04-29 12:49:07.212000+00:00,2023-04-29 14:10:07.436000+00:00,82
4,9070,24.860625,24.9,41.21,41.75,1.083125,49.225,1008.64625,0,0.0,2023-04-30 10:01:11.577000+00:00,2023-04-30 12:40:12.061000+00:00,160


In [13]:
# Searchable text per message: category, flag and message joined by single
# spaces, with missing parts replaced by empty strings.
txt = (
    race_control_df["category"].fillna("")
    .add(" ")
    .add(race_control_df["flag"].fillna(""))
    .add(" ")
    .add(race_control_df["message"].fillna(""))
)

# Per-message flags. A message that matches the virtual-safety-car pattern is
# never also counted as a full safety car.
vsc = txt.str.contains(pat = r"\bVIRTUAL SAFETY CAR\b|\bVSC\b", regex=True)
sc  = txt.str.contains(pat = r"\bSAFETY CAR\b", regex=True) & ~vsc
yel = txt.str.contains(pat = r"\bYELLOW\b", regex=True)

# Number of matching messages per session.
race_control_agg = (
    pd.concat([sc, vsc, yel], axis = 1, keys = ["sc_events", "vsc_events", "yellow_events"])
    .groupby(race_control_df["session_key"], sort=True)
    .sum()
    .reset_index()
)
race_control_agg.head()

Unnamed: 0,session_key,sc_events,vsc_events,yellow_events
0,7779,2,0,5
1,7787,5,2,12
2,7953,0,2,4
3,9069,2,1,12
4,9070,2,0,12


In [15]:
# Session attributes indexed by session_key, keeping the first row per key.
sessions_cut = (
    sessions_ref[['session_key', 'meeting_key', 'year', 'session_name', 'date_start']]
    .drop_duplicates(subset = 'session_key')
    .set_index('session_key')
)
sessions_cut.head()

Unnamed: 0_level_0,meeting_key,year,session_name,date_start
session_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7763,1140,2023,PRACTICE 2,2023-02-24 07:00:00+00:00
7764,1140,2023,PRACTICE 3,2023-02-25 07:00:00+00:00
7765,1141,2023,PRACTICE 1,2023-03-03 11:30:00+00:00
7766,1141,2023,PRACTICE 2,2023-03-03 15:00:00+00:00
7767,1141,2023,PRACTICE 3,2023-03-04 11:30:00+00:00


In [22]:
# Session keys whose session_type is RACE or SPRINT.
# NOTE(review): this selects from sessions_df rather than the year-filtered
# sessions_ref, so it is not restricted to target_years — confirm that is intended.
race_keys = sessions_df['session_key'][
    sessions_df['session_type'].str.upper().isin(['RACE', 'SPRINT'])
]

# Stint and pit rows belonging to those sessions (independent copies).
st = stints_df.loc[stints_df['session_key'].isin(race_keys)].copy()
pit_r = pit_df.loc[pit_df['session_key'].isin(race_keys)].copy()

# Result rows for those sessions where the driver completed at least one lap.
res_r = session_result_df.loc[
    session_result_df['session_key'].isin(race_keys)
    & session_result_df['number_of_laps'].fillna(0).gt(0),
    ['session_key', 'driver_number', 'number_of_laps'],
].copy()

In [23]:
# Columns identifying one driver in one session; used for grouping and joins below.
keys = ['session_key', 'driver_number']

# Per (session, driver) summary derived from the stint rows.
st_drv = (
    # Stint length in laps, counting both the first and the last lap of the stint.
    st.assign(stint_len = st['lap_end'] - st['lap_start'] + 1)
    # Order by stint number so the positional/ordered aggregations below see
    # each driver's stints in sequence.
    .sort_values(keys + ['stint_number'])
    .groupby(keys, as_index = False)
    .agg(
        # Highest stint number seen; equals the stint count only if numbering
        # starts at 1 without gaps — TODO confirm.
        n_stints = ('stint_number', 'max'),
        stint_laps_total = ('stint_len', 'sum'),
        # Lap following the last lap of the first stint in sorted order.
        stints_first_pit_lap = ('lap_end', lambda s: s.iloc[0] + 1),
        # Compounds joined with '-' in stint order.
        # NOTE(review): the join would raise if any compound is missing — confirm none are.
        compound_seq = ('compound', lambda s: '-'.join(s.astype('string')))
    )
)

# Per (session, driver): number of distinct pit laps and the earliest pit lap.
pits = (
    pit_r.groupby(keys)['lap_number']
    .agg(pit_stop_count = 'nunique', first_pit_lap = 'min')
    .reset_index()
)

# Compare the stint-derived figures with the pit and result tables. Both joins
# are left joins, so every stint summary row is kept even without pit or result data.
strategy_check = (
    st_drv.merge(pits, on = keys, how = 'left')
    .merge(res_r.rename(columns = {'number_of_laps': 'laps_completed'}), on = keys, how = 'left')
    .assign(
        # One stop fewer than the number of stints, never below zero.
        expected_stops = lambda d: d['n_stints'].sub(1).clip(lower = 0),
        # Drivers with no pit rows get a count of 0; stops_diff below uses this filled value.
        pit_stop_count = lambda d: d['pit_stop_count'].fillna(0).astype(int),
        stops_diff = lambda d: (d['pit_stop_count'] - d['expected_stops']).abs(),
        laps_diff = lambda d: (d['laps_completed'] - d['stint_laps_total']).abs(),
        # Signed difference: pit-table first stop minus the stint-derived one;
        # missing when the driver has no pit rows.
        first_pit_offset = lambda d: d['first_pit_lap'] - d['stints_first_pit_lap'],
    )
)

strategy_check.head()

Unnamed: 0,session_key,driver_number,n_stints,stint_laps_total,stints_first_pit_lap,compound_seq,pit_stop_count,first_pit_lap,laps_completed,expected_stops,stops_diff,laps_diff,first_pit_offset
0,7779,1,2,50,19,MEDIUM-HARD,0,,50,1,1,0,
1,7779,2,2,50,19,HARD-MEDIUM,0,,50,1,1,0,
2,7779,4,3,50,3,SOFT-HARD-MEDIUM,0,,50,2,2,0,
3,7779,10,2,50,15,MEDIUM-HARD,0,,50,1,1,0,
4,7779,11,2,50,19,MEDIUM-HARD,0,,50,1,1,0,


In [26]:
# Rows where the stint data disagrees with the pit or result data: any
# stop-count mismatch, or a lap-total or first-pit-lap gap of more than one lap.
issues_df = (
    strategy_check
    .loc[
        strategy_check['stops_diff'].ne(0)
        | strategy_check['laps_diff'].gt(1)
        | strategy_check['first_pit_offset'].abs().gt(1),
        [
            'session_key', 'driver_number', 'compound_seq',
            'n_stints', 'expected_stops', 'pit_stop_count',
            'stint_laps_total', 'laps_completed',
            'stints_first_pit_lap', 'first_pit_lap', 'first_pit_offset',
            'stops_diff', 'laps_diff',
        ],
    ]
    .reset_index(drop = True)
)

issues_df.head(2000)

Unnamed: 0,session_key,driver_number,compound_seq,n_stints,expected_stops,pit_stop_count,stint_laps_total,laps_completed,stints_first_pit_lap,first_pit_lap,first_pit_offset,stops_diff,laps_diff
0,7779,1,MEDIUM-HARD,2,1,0,50,50,19,,,1,0
1,7779,2,HARD-MEDIUM,2,1,0,50,50,19,,,1,0
2,7779,4,SOFT-HARD-MEDIUM,3,2,0,50,50,3,,,2,0
3,7779,10,MEDIUM-HARD,2,1,0,50,50,15,,,1,0
4,7779,11,MEDIUM-HARD,2,1,0,50,50,19,,,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
256,10033,30,HARD-MEDIUM,2,1,1,36,36,32,28,-4,0,0
257,10033,44,HARD-MEDIUM,2,1,1,57,57,33,28,-5,0,0
258,10033,55,MEDIUM-HARD,2,1,1,57,57,33,25,-8,0,0
259,10033,63,HARD-MEDIUM,2,1,1,57,57,33,29,-4,0,0
