In [12]:
import fastf1
import os

# 1. Point to your designated "Master" folder
# Use the full path to be absolutely certain
master_cache_dir = os.path.abspath('../data/raw/') 

if not os.path.exists(master_cache_dir):
    os.makedirs(master_cache_dir)

fastf1.Cache.enable_cache(master_cache_dir)

# 2. THE VERIFICATION TEST
# We attempt to load Abu Dhabi. 
# - If it's already there, it loads in 0.1 seconds from disk.
# - If it's not there, FastF1 downloads it to THIS folder.
try:
    print(f"üîÑ Checking {master_cache_dir} for Abu Dhabi data...")
    test_session = fastf1.get_session(2025, 'Abu Dhabi', 'FP2')
    test_session.load(telemetry=False) 
    print("‚úÖ SUCCESS: Abu Dhabi is now locked into the Master Cache.")
except Exception as e:
    print(f"‚ùå Could not load session: {e}")

print(f"üìç All future race data will now be stored in: {master_cache_dir}")

üîÑ Checking c:\Users\trive\Desktop\vaanishka\Data Science\f1_pred\data\raw for Abu Dhabi data...


core           INFO 	Loading data for Abu Dhabi Grand Prix - Practice 2 [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for _extended_timing_data. Loading data...
_api           INFO 	Fetching timing data...
_api           INFO 	Parsing timing data...
req            INF

‚úÖ SUCCESS: Abu Dhabi is now locked into the Master Cache.
üìç All future race data will now be stored in: c:\Users\trive\Desktop\vaanishka\Data Science\f1_pred\data\raw


# Creating Processed Data to work on

In [None]:
import pandas as pd
import fastf1

# 1. Load the session (from your newly fixed cache)
fastf1.Cache.enable_cache('../data/raw')
session = fastf1.get_session(2024, 'Abu Dhabi', 'FP2')
session.load(telemetry=False)

# 2. THE SCRUB: Filter for high-quality data
# We remove laps that aren't 'Accurate' and laps with pit stops
clean_laps = session.laps.pick_accurate().copy()

# 3. Add a 'Fuel-Corrected' column (using your rules.json)
# (Assuming you've loaded your rules.json as 'rules' earlier in the notebook)
# For now, let's just mark them for processing
clean_laps['Is_Model_Ready'] = True

# 4. SAVE to Processed folder
# This is where your model will eventually 'eat' from
output_path = '../data/processed/abu_dhabi_clean.csv'
clean_laps.to_csv(output_path, index=False)

print(f"‚úÖ Scrubbing Complete! {len(clean_laps)} laps saved to {output_path}")

core           INFO 	Loading data for Abu Dhabi Grand Prix - Practice 2 [v3.7.0]
req            INFO 	Using cached data for session_info
req            INFO 	Using cached data for driver_info


req            INFO 	Using cached data for session_status_data
req            INFO 	Using cached data for track_status_data
req            INFO 	Using cached data for _extended_timing_data
req            INFO 	Using cached data for timing_app_data
core           INFO 	Processing timing data...
req            INFO 	Using cached data for weather_data
req            INFO 	Using cached data for race_control_messages
core           INFO 	Finished loading data for 20 drivers: ['1', '4', '10', '11', '14', '16', '18', '20', '22', '23', '24', '27', '30', '43', '44', '55', '61', '63', '77', '81']


‚úÖ Scrubbing Complete! 312 laps saved to ../data/processed/abu_dhabi_clean.csv


# FUEL CORRECTION
**The Logic: "Subtracting the Weight"**
We know the car starts the race with roughly 100kg of fuel and ends with almost 0kg.

- The code will calculate which lap number we are on.
- It will estimate the fuel remaining.
- It will multiply that weight by your fuel_weight_penalty_per_10kg (0.3s).
- It will subtract that "penalty" from the actual lap time.

In [14]:
import pandas as pd
import json

# 1. Load your Rules and your Cleaned Data
with open('../coefficient/rules.json', 'r') as f:
    rules = json.load(f)

df = pd.read_csv('../data/processed/abu_dhabi_clean.csv')

# 2. Extract the constant from your JSON
penalty_per_10kg = rules['physics_constants']['fuel_weight_penalty_per_10kg']
fuel_burn_per_lap = rules['physics_constants']['average_fuel_burn_per_lap_kg']

# 3. Calculate Fuel-Corrected Time
# We assume a full tank (100kg) at Lap 1
def calculate_fuel_correction(row):
    laps_remaining = df['LapNumber'].max() - row['LapNumber']
    fuel_on_board = laps_remaining * fuel_burn_per_lap
    
    # Correction in seconds: (Weight / 10) * Penalty
    correction_seconds = (fuel_on_board / 10) * penalty_per_10kg
    
    # Subtract the penalty from the actual lap time
    # (Note: We convert LapTime to total seconds first)
    actual_seconds = pd.to_timedelta(row['LapTime']).total_seconds()
    return actual_seconds - correction_seconds

df['Naked_LapTime_Seconds'] = df.apply(calculate_fuel_correction, axis=1)

# 4. Save this 'Feature Engineered' data
df.to_csv('../data/processed/abu_dhabi_clean.csv', index=False)

print("‚úÖ Fuel Normalization Complete.")
print(f"Sample: Lap 1 actual time was {df.iloc[0]['LapTime']}, Naked time is {df.iloc[0]['Naked_LapTime_Seconds']:.3f}s")

‚úÖ Fuel Normalization Complete.
Sample: Lap 1 actual time was 0 days 00:01:25.980000, Naked time is 84.638s


# Tire normalisation

In [15]:
# 1. Map your specific JSON keys to the Tire Compounds
# This connects 'SOFT' from the telemetry to 'soft_slope' from your JSON
deg_rules = rules.get('degradation_coefficients', {})

compound_map = {
    'SOFT': deg_rules.get('soft_slope', 0.08),
    'MEDIUM': deg_rules.get('medium_slope', 0.05),
    'HARD': deg_rules.get('hard_slope', 0.03)
}

def calculate_tire_correction(row):
    # Get the compound (e.g., 'SOFT') and the age (e.g., 5 laps)
    compound = str(row['Compound']).upper()
    age = row['TyreLife']
    
    # Get the slope from our map; if it's a 'TEST' tire, default to Medium (0.05)
    slope = compound_map.get(compound, 0.05)
    
    wear_tax = age * slope
    return row['Naked_LapTime_Seconds'] - wear_tax

# Apply the correction
df['Ultimate_Baseline_Seconds'] = df.apply(calculate_tire_correction, axis=1)

# 2. Save the final processed data
df.to_csv('../data/processed/abu_dhabi_clean.csv', index=False)

print("‚úÖ Tire Normalization Fixed!")
print(f"Using your slopes: Soft={compound_map['SOFT']}s, Medium={compound_map['MEDIUM']}s, Hard={compound_map['HARD']}s")

‚úÖ Tire Normalization Fixed!
Using your slopes: Soft=0.08s, Medium=0.05s, Hard=0.03s


# foundation : Rolling Best, Overdrive, etc.

In [22]:
import pandas as pd
import numpy as np
import json
import os

# 1. FIXED PATH: Pointing to the coefficients folder
# Adjust the '../' depending on your current working directory
json_path = '../coefficient/track_dna.json'

try:
    with open(json_path, 'r') as f:
        track_dna = json.load(f)
except FileNotFoundError:
    print(f"‚ùå Error: Could not find track_dna.json at {json_path}")
    # Optional: stop execution if the file is missing
    raise

# 2. Load the cleaned session data
df = pd.read_csv('../data/processed/abu_dhabi_clean.csv')

# 3. Pull DNA based on the 'EventName' column
# We use .get() with a default to prevent the script from crashing
event_key = df['EventName'].iloc[0] if 'EventName' in df.columns else "Abu Dhabi Grand Prix"
dna = track_dna.get(event_key, track_dna.get("Abu Dhabi Grand Prix"))

# --- FEATURE ENGINEERING ENGINE ---
tire_slopes = {'SOFT': 0.082, 'MEDIUM': 0.045, 'HARD': 0.021}

def engineer_lap_features(row):
    try:
        # Handle potential time format errors
        actual_time = pd.to_timedelta(row['LapTime']).total_seconds()
        
        # Ensure LapNumber is a number
        lap_num = float(row.get('LapNumber', 1))
        
        # A. Fuel Normalization
        current_fuel = max(0, 100 - (lap_num * dna['fuel_burn_lap']))
        fuel_correction = current_fuel * dna['penalty_per_kg']
        
        # B. Tire Degradation
        compound = str(row.get('Compound', 'SOFT')).upper()
        tire_correction = float(row.get('TyreLife', 1)) * tire_slopes.get(compound, 0.045)
        
        # C. Mechanical & Environment
        drs_boost = 0.85 if row.get('DRS_Active') == 1 else 0
        weather_tax = max(0, (float(row.get('AirTemp', 28)) - 20) * 0.015)

        # Naked Pace Calculation
        naked_pace = actual_time - fuel_correction - tire_correction + drs_boost - weather_tax
        
        return pd.Series([naked_pace, fuel_correction, tire_correction, drs_boost])
    except Exception as e:
        # This catches specific row errors without stopping the whole script
        return pd.Series([np.nan] * 4)

# Apply the Physics Layer
df[['Naked_Pace', 'Fuel_Tax', 'Tire_Tax', 'DRS_Correction']] = df.apply(engineer_lap_features, axis=1)
df = df.dropna(subset=['Naked_Pace'])

# --- BEHAVIORAL LAYER ---
# Peak Potential: The true ceiling (Best in 3-lap window)
df['Peak_Potential'] = df.groupby('Driver')['Naked_Pace'].transform(lambda x: x.rolling(window=3, min_periods=1).min())

# Sandbagging Delta: Identifying hidden pace
df['Sandbag_Delta'] = df['Naked_Pace'] - df['Peak_Potential']

# Track Evolution: FIXED logic to use row-wise lap number safely
df['Final_Baseline'] = df['Naked_Pace'] - (df['LapNumber'].astype(float) * 0.008)

# 4. Save the Model-Ready dataset
df.to_csv('../data/processed/master_baseline.csv', index=False)

print(f"‚úÖ Master Baseline successfully generated for: {event_key}")

‚úÖ Master Baseline successfully generated for: Abu Dhabi Grand Prix


# Sector Analysis Script 
**Why this matters for 2026 Model**
- **Chassis Score:** If a team (like McLaren) has a low S3_Chassis_Score, they will benefit more from the 2026 weight reduction.
- **Clipping Risk:** Teams with high top-end power today will face the biggest challenge in 2026 when the electrical boost runs out at $340\text{km/h}$.
**Sandbag-Proofing:** Drivers can't easily sandbag in Sector 3 without looking "slow" in the data. If their S3 is elite but their S1 is poor, we know they have their engine turned down.

In [3]:
import pandas as pd
import numpy as np

# 1. Load the Master Baseline
df = pd.read_csv('../data/processed/master_baseline.csv')

# --- FIX: Convert Sector Times to Numeric Seconds ---
# This ensures we aren't trying to divide "str" by "str"
sector_cols = ['Sector1Time', 'Sector2Time', 'Sector3Time']

for col in sector_cols:
    # Convert to timedelta then to total seconds
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# 2. Pivot the data to see Sector Performance
sector_perf = df.groupby(['Driver', 'Compound']).agg({
    'Sector1Time': 'min',
    'Sector2Time': 'min',
    'Sector3Time': 'min',
    'Sandbag_Delta': 'mean' 
}).reset_index()

# 3. Calculate "Sector Dominance" (Safe from TypeErrors now)
# We divide by the fastest time in that sector to get a ratio (1.0 = fastest)
sector_perf['S3_Chassis_Score'] = sector_perf['Sector3Time'] / sector_perf['Sector3Time'].min()
sector_perf['S1_Power_Score'] = sector_perf['Sector1Time'] / sector_perf['Sector1Time'].min()

# 4. 2026 Prediction Feature: "The Clipping Risk"
sector_perf['2026_Clipping_Risk'] = sector_perf['S1_Power_Score'] * (1 + sector_perf['Sandbag_Delta'])

# 5. Save the Sector DNA
sector_perf.to_csv('../data/processed/sector_analysis.csv', index=False)

print("‚úÖ Type Error Squashed! Sector Analysis Complete.")
print(sector_perf.head())

‚úÖ Type Error Squashed! Sector Analysis Complete.
  Driver Compound  Sector1Time  Sector2Time  Sector3Time  Sandbag_Delta  \
0    ALB   MEDIUM       17.514       36.583       31.210      19.506133   
1    ALB     SOFT       17.223       36.299       30.747      11.696211   
2    ALO   MEDIUM       17.574       36.758       31.137       6.379250   
3    ALO     SOFT       17.264       36.262       30.912      18.582314   
4    BOT   MEDIUM       17.610       36.651       31.087      12.499056   

   S3_Chassis_Score  S1_Power_Score  2026_Clipping_Risk  
0          1.026746        1.028360           21.087688  
1          1.011514        1.011274           12.839343  
2          1.024345        1.031883            7.614523  
3          1.016942        1.013681           19.850219  
4          1.022700        1.033997           13.957981  


In [25]:
import pandas as pd
import numpy as np

# Load our previous master file
df = pd.read_csv('../data/processed/master_baseline.csv')

# Ensure Sector times are numeric (Fixes your previous error)
for col in ['Sector1Time', 'Sector2Time', 'Sector3Time']:
    df[col] = pd.to_timedelta(df[col]).dt.total_seconds()

# --- 1. OVERDRIVE SCORE (œÉ) ---
# We calculate consistency per driver, per stint
df['Overdrive_Score'] = df.groupby(['Driver', 'Stint'])['Naked_Pace'].transform('std')

# --- 2. TERRAIN EDGE INDEX ---
# Identify which drivers are gaining time in Technical (S3) vs Power (S1) sectors
# Lower score = faster relative to the field
df['S1_Ratio'] = df['Sector1Time'] / df['Sector1Time'].min()
df['S3_Ratio'] = df['Sector3Time'] / df['Sector3Time'].min()
df['Terrain_Edge'] = df['S3_Ratio'] - df['S1_Ratio'] 
# Positive Edge = Stronger in corners (Good for 2026 weight reduction)
# Negative Edge = Stronger on straights (Risky for 2026 energy clipping)

# --- 3. GRID POSITION INTEGRATION ---
# Usually, we'd join this from a separate 'Qualifying' session file
# For now, let's create a placeholder that simulates the Grid Rank
# (In your real pipeline, you would merge df with session.results[['Driver', 'GridPosition']])
df['Grid_Position'] = df.groupby('Driver')['Naked_Pace'].transform('min').rank(method='min')

# --- FINAL AGGREGATION FOR ML ---
# We want one row per driver that sums up their '2024 Profile'
model_ready = df.groupby('Driver').agg({
    'Naked_Pace': 'min',        # Their absolute peak
    'Overdrive_Score': 'mean',  # Their average consistency
    'Terrain_Edge': 'mean',     # Their car's DNA
    'Grid_Position': 'first',   # Where they start
    'Sandbag_Delta': 'max'      # How much they were hiding
}).reset_index()

model_ready.to_csv('../data/processed/model_training_set.csv', index=False)

print("üèÜ All Advanced Features Engineered!")
print(model_ready.head())

üèÜ All Advanced Features Engineered!
  Driver  Naked_Pace  Overdrive_Score  Terrain_Edge  Grid_Position  \
0    ALB     82.0960        19.169117     -0.023870          148.0   
1    ALO     82.3449        10.855169     -0.084608          201.0   
2    BOT     81.9522        18.353978     -0.028193           45.0   
3    COL     82.7616        24.477945      0.143069          372.0   
4    DOO     82.8441        20.358792     -0.077092          380.0   

   Sandbag_Delta  
0        46.2961  
1        44.5962  
2        55.8171  
3        60.1661  
4        59.9441  


**Pulling race data from the api to get data for dirty air an dslip stream added the calculated coefficients to data/processed/multi_race_traffic.csv**

In [4]:
import fastf1
import pandas as pd
import numpy as np

# 1. Load Monza and Hungary without heavy telemetry
traffic_races = [(2025, 'Monza', 'R'), (2025, 'Hungary', 'R')]
traffic_list = []

for yr, loc, sess_type in traffic_races:
    s = fastf1.get_session(yr, loc, sess_type)
    s.load(telemetry=False, weather=False) # Lightweight load
    laps = s.laps.copy()
    # Calculate Gap for traffic detection logic
    laps['GapToPrev'] = laps.sort_values(by=['LapNumber', 'Time']).groupby('LapNumber')['Time'].diff().dt.total_seconds()
    traffic_list.append(laps)

df_traffic = pd.concat(traffic_list).reset_index(drop=True)
df_traffic['Is_Following'] = np.where(df_traffic['GapToPrev'] < 1.2, 1, 0)

# 2. Calculate the "Missing" columns for the joiner
# Map clean air max speed to every driver row
clean_speeds = df_traffic[df_traffic['Is_Following'] == 0].groupby('Driver')['SpeedST'].max().rename('CleanMax')
df_traffic = df_traffic.merge(clean_speeds, on='Driver', how='left')

# The column your joiner is missing:
df_traffic['Slipstream_Delta'] = np.where(df_traffic['Is_Following'] == 1, df_traffic['SpeedST'] - df_traffic['CleanMax'], 0)
df_traffic['Dirty_Air_Tax'] = df_traffic.groupby('Driver')['LapTime'].transform(lambda x: x.dt.total_seconds() - x.dt.total_seconds().min())

# 3. Save the fixed file to disk
df_traffic.to_csv('../data/processed/multi_race_traffic.csv', index=False)
print("‚úÖ multi_race_traffic.csv is now fixed and saved with Slipstream_Delta.")

core           INFO 	Loading data for Italian Grand Prix - Race [v3.7.0]
req            INFO 	No cached data found for session_info. Loading data...
_api           INFO 	Fetching session info data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for driver_info. Loading data...
_api           INFO 	Fetching driver list...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for session_status_data. Loading data...
_api           INFO 	Fetching session status data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for lap_count. Loading data...
_api           INFO 	Fetching lap count data...
req            INFO 	Data has been written to cache!
req            INFO 	No cached data found for track_status_data. Loading data...
_api           INFO 	Fetching track status data...
req            INFO 	Data has been written to cache!
req            INFO 	No ca

‚úÖ multi_race_traffic.csv is now fixed and saved with Slipstream_Delta.


**merging df to point to the right location **

**checking**

In [7]:
import pandas as pd

# Load the file
df_sector = pd.read_csv('../data/processed/sector_analysis.csv')

# Use actual performance metrics as the 2026 foundation
# We are mapping Sector Times to the features the XGBoost model expects
df_sector['power_score'] = df_sector['Sector3Time']   # Real speed data
df_sector['chassis_score'] = df_sector['Sector2Time'] # Real handling data
df_sector['clipping_risk'] = df_sector['Sector1Time'].pct_change().fillna(0)

# Save the accurate foundation
df_sector.to_csv('../data/processed/sector_analysis.csv', index=False)
print("‚úÖ Sector data verified and updated for 2026 training.")

‚úÖ Sector data verified and updated for 2026 training.


# SESSION JOIN

This code is the "Session Join" because it unifies separate physics, traffic, and tactical files into a single master table, allowing the XGBoost model to see how variables like Grid Position and Edge Index interact to determine the final race result.

In [11]:
import pandas as pd

# 1. Load the "Truth" data (Meta/Targets)
df_meta = pd.read_csv('../data/processed/model_training_set.csv')

# 2. Load the "Performance" data (Features)
df_baseline = pd.read_csv('../data/processed/master_baseline.csv')
df_traffic = pd.read_csv('../data/processed/multi_race_traffic.csv')
df_sector = pd.read_csv('../data/processed/sector_analysis.csv')

# 3. Aggregations (Ensuring we use your specific S3/S1 names)
traffic_profile = df_traffic.groupby('Driver').agg({
    'Dirty_Air_Tax': 'mean',
    'Slipstream_Delta': 'max'
}).reset_index()

# Sort by LapTime to get the peak potential for each driver
df_sector = df_sector.sort_values(by=['Driver', 'Sector1Time'], ascending=True)
sector_profile = df_sector.groupby('Driver').agg({
    'S3_Chassis_Score': 'first',      
    'S1_Power_Score': 'first',        
    '2026_Clipping_Risk': 'mean'      
}).reset_index()
sector_profile.columns = ['Driver', 'chassis_score', 'power_score', 'clipping_risk']

# 4. THE GRAND MERGE (Including the Grid/Terrain/Overdrive data)
# First, merge the features together
features = pd.merge(df_baseline, traffic_profile, on='Driver', how='inner')
features = pd.merge(features, sector_profile, on='Driver', how='inner')

# SECOND, join with the Meta/Target file
# This attaches: GridPosition, Terrain, Overdrive_Naked_Score, and Sandbag_Delta
final_df = pd.merge(df_meta, features, on='Driver', how='inner')

# 5. Lock it down
final_df.to_csv('../data/final_db/final_training_data_2026.csv', index=False)

print("üèÅ MASTER DATABASE LOCKED.")
print(f"Verified Columns: {list(final_df.columns)}")

üèÅ MASTER DATABASE LOCKED.
Verified Columns: ['Driver', 'Naked_Pace_x', 'Overdrive_Score', 'Terrain_Edge', 'Grid_Position', 'Sandbag_Delta_x', 'Time', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint', 'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time', 'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime', 'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest', 'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime', 'LapStartDate', 'TrackStatus', 'Position', 'Deleted', 'DeletedReason', 'FastF1Generated', 'IsAccurate', 'DRS_Active', 'Naked_Pace_y', 'Fuel_Tax', 'Tire_Tax', 'DRS_Correction', 'Peak_Potential', 'Sandbag_Delta_y', 'Final_Baseline', 'Dirty_Air_Tax', 'Slipstream_Delta', 'chassis_score', 'power_score', 'clipping_risk']
