In [6]:
# Importando bibliotecas
import fastf1
fastf1.ergast.interface.BASE_URL = "https://api.jolpi.ca/ergast/f1" # type: ignore
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
from typing import Tuple, List, Optional
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.multioutput import MultiOutputClassifier, MultiOutputRegressor
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, mean_absolute_error, mean_squared_error,
    r2_score, explained_variance_score
)

In [7]:
stints = pd.read_csv("..\\data\\stints_2019-2024.csv", index_col=False)
stints

Unnamed: 0,Driver,Year,Circuit,nLaps,NumberOfStints,CompoundStrategy,StintLengthStrategy,Team,StartingPosition,AirTemp,...,TrackTemp,WindDirection,WindSpeed,mean_brake_time,std_brake_time,AvgSpeed,StdSpeed,AvgSpeedDelta,StdSpeedDelta,AvgGearChanges
0,ALB,2019,Yas Marina Circuit,58,2,"['MEDIUM', 'HARD']","[13, 42]",Red Bull Racing,5,26.018045,...,29.566165,221.097744,1.264662,20.155587,1.190986,190.146518,4.104201,246.565018,15.577114,48.759158
1,ALB,2019,Albert Park Circuit,58,2,"['SOFT', 'MEDIUM']","[14, 43]",Toro Rosso,13,23.477869,...,41.313115,155.327869,1.166393,16.107622,2.045714,207.657799,6.325757,225.931096,19.118794,45.691860
2,ALB,2019,Red Bull Ring,71,2,"['MEDIUM', 'HARD']","[35, 35]",Toro Rosso,18,34.444068,...,50.744068,188.855932,1.225424,13.635129,2.089451,219.187816,6.962654,240.657143,15.165511,38.714286
3,ALB,2019,Baku City Circuit,51,2,"['SOFT', 'MEDIUM']","[12, 38]",Toro Rosso,11,19.763566,...,39.654264,167.480620,1.249612,26.073193,2.424288,193.901751,7.891566,269.989031,16.208025,65.037281
4,ALB,2019,Bahrain International Circuit,57,3,"['SOFT', 'SOFT', 'MEDIUM']","[9, 16, 32]",Toro Rosso,12,26.210853,...,28.610853,60.720930,2.710853,21.511148,2.543971,193.331782,9.221221,258.327546,16.347423,55.267361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1574,ZHO,2024,Autodromo Nazionale Monza,53,2,"['MEDIUM', 'HARD']","[15, 37]",Kick Sauber,20,33.101504,...,49.311278,200.624060,1.518797,14.722250,2.033569,237.439706,8.250280,271.856757,13.451532,38.919820
1575,ZHO,2024,Autódromo Hermanos Rodríguez,71,2,"['HARD', 'MEDIUM']","[43, 27]",Kick Sauber,19,19.888679,...,35.602516,291.352201,2.150943,19.938504,2.499049,179.173394,14.764579,262.885874,14.995403,35.329457
1576,ZHO,2024,Marina Bay Street Circuit,61,2,"['HARD', 'MEDIUM']","[34, 27]",Kick Sauber,20,30.786250,...,36.445625,192.493750,0.955000,24.642124,1.980963,176.032457,5.536413,233.619826,12.504193,50.650871
1577,ZHO,2024,Circuit de Barcelona-Catalunya,66,3,"['SOFT', 'MEDIUM', 'HARD']","[9, 32, 24]",Kick Sauber,15,24.132468,...,41.096104,207.370130,2.123377,17.048671,2.256758,202.696363,7.376482,225.560185,18.007868,36.703704


In [13]:
stints['CompoundStrategy'].value_counts()

CompoundStrategy
['MEDIUM', 'HARD']              445
['MEDIUM', 'HARD', 'HARD']      150
['MEDIUM', 'HARD', 'MEDIUM']    112
['SOFT', 'MEDIUM']              108
['SOFT', 'MEDIUM', 'SOFT']       91
['SOFT', 'HARD']                 90
['HARD', 'MEDIUM']               88
['MEDIUM', 'HARD', 'SOFT']       73
['MEDIUM', 'SOFT']               59
['MEDIUM', 'MEDIUM', 'HARD']     53
['SOFT', 'MEDIUM', 'HARD']       48
['SOFT', 'MEDIUM', 'MEDIUM']     47
['SOFT', 'HARD', 'MEDIUM']       41
['SOFT', 'HARD', 'HARD']         25
['SOFT', 'HARD', 'SOFT']         17
['HARD', 'MEDIUM', 'MEDIUM']     15
['MEDIUM', 'MEDIUM', 'SOFT']     14
['HARD', 'SOFT']                 12
['MEDIUM', 'HARD', nan]          12
['SOFT', 'SOFT', 'MEDIUM']       12
['MEDIUM', 'SOFT', 'SOFT']       10
['HARD', 'HARD', 'MEDIUM']       10
['HARD', 'MEDIUM', 'HARD']       10
['HARD', 'MEDIUM', 'SOFT']        9
['SOFT', 'SOFT', 'HARD']          6
['MEDIUM', 'SOFT', 'MEDIUM']      5
['MEDIUM', 'MEDIUM']              5
['SOFT', 'S

In [10]:
# show rows where the 'CompoundStrategy' column contains 'nan' in the list
stints[stints['CompoundStrategy'].apply(lambda compounds: any(pd.isna(compound) for compound in compounds))]

Unnamed: 0,Driver,Year,Circuit,nLaps,NumberOfStints,CompoundStrategy,StintLengthStrategy,Team,StartingPosition,AirTemp,...,TrackTemp,WindDirection,WindSpeed,mean_brake_time,std_brake_time,AvgSpeed,StdSpeed,AvgSpeedDelta,StdSpeedDelta,AvgGearChanges


In [None]:
def encode_categorical_features(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    """
    Encodes categorical features using Label Encoding.
    """
    for col in columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    return df

In [5]:
def prepare_df(df: pd.DataFrame, max_stints: int = 3) -> pd.DataFrame:
    """
    Prepares the DataFrame for modeling:
    1. Filters out entries with more than max_stints.
    2. Pads the 'Compounds' and 'StintLengths' lists.
    3. Explodes these lists into separate columns for each stint.
    """
    df_copy = df[df['NumberOfStints'] > 1 & df['NumberOfStints'] <= max_stints].copy()

    # 2. Pad lists to ensure uniform length
    df_copy['CompoundStrategy_padded'] = df_copy['CompoundStrategy'].map(
        lambda lst: lst + ['NONE'] * (max_stints - len(lst))
    )
    df_copy['StintLengths_padded'] = df_copy['StintLengthStrategy'].map(
        lambda lst: lst + [0] * (max_stints - len(lst)) # Using 0 as padding for lengths
    )

    # 3. Explode lists into separate columns for each stint
    for i in range(max_stints):
        df_copy[f'Compound_{i+1}'] = df_copy['Compounds_padded'].str[i]
        df_copy[f'Length_{i+1}']   = df_copy['Lengths_padded'].str[i]
    return df_copy

In [14]:
def build_feature_matrix(df: pd.DataFrame, max_stints: int) -> Tuple[
    pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, 
    pd.DataFrame, pd.DataFrame, List[LabelEncoder], StandardScaler
]:
    """
    Builds feature and target matrices, splits data, and preprocesses:
    1. Defines feature and target column names based on max_stints.
    2. Splits data into training and testing sets *before* any fitting.
    3. Scales numeric features (StandardScaler fit on train, transform train/test).
    4. Encodes categorical compound targets (LabelEncoder fit on train, transform train/test).
    Returns:
        X_train_scaled, X_test_scaled,
        yc_train_encoded, yc_test_encoded,
        yl_train, yl_test,
        encoders (for compounds), scaler (for features)
    """
    # Define feature columns (ensure these columns exist in df)
    # Your original feature set:
    feature_cols = ['Year','StartingPosition','AirTemp','Humidity',
                    'Pressure','Rainfall','TrackTemp','WindDirection','WindSpeed']
    # Consider adding 'Driver', 'Circuit', 'Team' after appropriate encoding (e.g., OneHotEncoder)
    
    # Define target column names dynamically using max_stints
    compound_target_cols = [f'Compound_{i+1}' for i in range(max_stints)]
    length_target_cols = [f'Length_{i+1}' for i in range(max_stints)]

    # Ensure all specified columns are present in the DataFrame
    missing_cols = [col for col in feature_cols + compound_target_cols + length_target_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in DataFrame: {missing_cols}")

    X = df[feature_cols]
    y_comp_raw = df[compound_target_cols].copy() # Raw compound strings
    y_len = df[length_target_cols].copy()        # Lengths are already numeric

    # Split data *before* any fitting to prevent data leakage
    X_train, X_test, \
    yc_train_raw, yc_test_raw, \
    yl_train, yl_test = train_test_split(
        X, y_comp_raw, y_len,
        test_size=0.20,
        random_state=72,
        shuffle=True
    )

    # --- Preprocess Features (Scaling) ---
    scaler = StandardScaler()
    # Fit scaler ONLY on training data's numeric features
    X_train_scaled = pd.DataFrame(
        scaler.fit_transform(X_train), 
        columns=X_train.columns, 
        index=X_train.index
    )
    # Transform test data using the SAME fitted scaler
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), 
        columns=X_test.columns, 
        index=X_test.index
    )

    # --- Preprocess Compound Targets (Label Encoding) ---
    encoders: List[LabelEncoder] = []
    yc_train_encoded = yc_train_raw.copy()
    yc_test_encoded = yc_test_raw.copy()

    all_compounds = ['SOFT', 'MEDIUM', 'HARD', 'NONE', 'WET', 'INTERMEDIATE']
    for col in compound_target_cols:
        le = LabelEncoder()
        # Fit LabelEncoder ONLY on the training data for this target column
        le.fit(all_compounds)
        
        # Transform both training and testing data for this column
        yc_train_encoded[col] = le.transform(yc_train_raw[col])
        yc_test_encoded[col] = le.transform(yc_test_raw[col])
        
        encoders.append(le)
    
    return X_train_scaled, X_test_scaled, yc_train_encoded, yc_test_encoded, yl_train, yl_test, encoders, scaler

In [15]:
def train_and_eval(X_train: pd.DataFrame, X_test: pd.DataFrame,
                   yc_train: pd.DataFrame, yc_test: pd.DataFrame,  # Encoded compound targets
                   yl_train: pd.DataFrame, yl_test: pd.DataFrame,  # Stint length targets
                   encoders: List[LabelEncoder]  # <-- New argument: list of fitted LabelEncoders for compounds
                   ) -> tuple: # Adjusted return type hint if you want to be specific
    """
    Trains compound and stint length models, evaluates them, 
    and creates DataFrames for side-by-side comparison of predictions and actuals.

    Args:
        X_train: Training features.
        X_test: Testing features.
        yc_train: Encoded training compound labels.
        yc_test: Encoded testing compound labels.
        yl_train: Training stint lengths.
        yl_test: Testing stint lengths.
        encoders: List of fitted LabelEncoder objects for each compound target column.

    Returns:
        A tuple containing:
            - clf_model: Trained multi-output classifier for compounds.
            - reg_model: Trained multi-output regressor for stint lengths.
            - compounds_comparison_df: DataFrame comparing actual and predicted compounds (decoded).
            - lengths_comparison_df: DataFrame comparing actual and predicted stint lengths.
    """
    # 1) Compounds model (RandomForestClassifier)
    clf_model = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=100, random_state=72, n_jobs=-1
    ))
    clf_model.fit(X_train, yc_train)
    # yc_pred will contain encoded predictions
    yc_pred = pd.DataFrame(clf_model.predict(X_test), columns=yc_test.columns, index=X_test.index)

    # 2) Lengths model (RandomForestRegressor)
    reg_model = MultiOutputRegressor(RandomForestRegressor(
        n_estimators=100, random_state=72, n_jobs=-1
    ))
    reg_model.fit(X_train, yl_train)
    # yl_pred will contain float predictions
    yl_pred = pd.DataFrame(reg_model.predict(X_test), columns=yl_test.columns, index=X_test.index)

    # 3) Metrics
    # Compounds: overall accuracy on encoded labels
    comp_acc = np.mean([
        accuracy_score(yc_test[col], yc_pred[col])
        for col in yc_test.columns
    ])
    # Lengths: mean absolute error
    length_mae = mean_absolute_error(yl_test, yl_pred)

    print(f"Compound accuracy (avg over slots, encoded): {comp_acc:.3f}")
    print(f"Length MAE (all slots): {length_mae:.3f}")

    # 4) Create Comparison DataFrames

    # --- Compounds Comparison (Decoded) ---
    # `encoders` list should correspond to the order of columns in yc_test/yc_pred
    compound_comparison_cols = {}
    for i, col_name in enumerate(yc_test.columns):
        encoder = encoders[i] # Get the specific encoder for this compound stint
        compound_comparison_cols[f'{col_name}_Actual'] = encoder.inverse_transform(yc_test[col_name])
        compound_comparison_cols[f'{col_name}_Predicted'] = encoder.inverse_transform(yc_pred[col_name])
    
    compounds_comparison_df = pd.DataFrame(compound_comparison_cols, index=X_test.index)
    # Reorder columns for better side-by-side view if many stints
    ordered_compound_cols = []
    for col_name in yc_test.columns: # e.g., Compound_1, Compound_2
        ordered_compound_cols.append(f'{col_name}_Actual')
        ordered_compound_cols.append(f'{col_name}_Predicted')
    compounds_comparison_df = compounds_comparison_df[ordered_compound_cols]


    # --- Stint Lengths Comparison ---
    # Round predicted lengths to the nearest integer for better comparison
    yl_pred_rounded = yl_pred.round().astype(int)

    length_comparison_cols = {}
    for col_name in yl_test.columns: # e.g., Length_1, Length_2
        length_comparison_cols[f'{col_name}_Actual'] = yl_test[col_name]
        length_comparison_cols[f'{col_name}_Predicted'] = yl_pred_rounded[col_name]
        
    lengths_comparison_df = pd.DataFrame(length_comparison_cols, index=X_test.index)
    # Reorder columns
    ordered_length_cols = []
    for col_name in yl_test.columns:
        ordered_length_cols.append(f'{col_name}_Actual')
        ordered_length_cols.append(f'{col_name}_Predicted')
    lengths_comparison_df = lengths_comparison_df[ordered_length_cols]

    return clf_model, reg_model, compounds_comparison_df, lengths_comparison_df

In [16]:
def train_and_eval(X_train: pd.DataFrame, X_test: pd.DataFrame,
                   yc_train: pd.DataFrame, yc_test: pd.DataFrame,  # Encoded compound targets
                   yl_train: pd.DataFrame, yl_test: pd.DataFrame,  # Stint length targets
                   encoders: List[LabelEncoder]  # <-- New argument: list of fitted LabelEncoders for compounds
                   ) -> tuple: # Adjusted return type hint if you want to be specific
    """
    Trains compound and stint length models, evaluates them,
    and creates DataFrames for side-by-side comparison of predictions and actuals.

    Args:
        X_train: Training features.
        X_test: Testing features.
        yc_train: Encoded training compound labels.
        yc_test: Encoded testing compound labels.
        yl_train: Training stint lengths.
        yl_test: Testing stint lengths.
        encoders: List of fitted LabelEncoder objects for each compound target column.

    Returns:
        A tuple containing:
            - clf_model: Trained multi-output classifier for compounds.
            - reg_model: Trained multi-output regressor for stint lengths.
            - compounds_comparison_df: DataFrame comparing actual and predicted compounds (decoded).
            - lengths_comparison_df: DataFrame comparing actual and predicted stint lengths.
    """
    # 1) Compounds model (RandomForestClassifier)
    clf_model = MultiOutputClassifier(RandomForestClassifier(
        n_estimators=100, random_state=72, n_jobs=-1
    ))
    clf_model.fit(X_train, yc_train)
    # yc_pred will contain encoded predictions
    yc_pred = pd.DataFrame(clf_model.predict(X_test), columns=yc_test.columns, index=X_test.index)

    # 2) Lengths model (RandomForestRegressor)
    reg_model = MultiOutputRegressor(RandomForestRegressor(
        n_estimators=100, random_state=72, n_jobs=-1
    ))
    reg_model.fit(X_train, yl_train)
    # yl_pred will contain float predictions
    yl_pred = pd.DataFrame(reg_model.predict(X_test), columns=yl_test.columns, index=X_test.index)

    # 3) Metrics
    # === COMPOUND CLASSIFICATION METRICS ===
    # Accuracy (overall)
    comp_acc = np.mean([
        accuracy_score(yc_test[col], yc_pred[col])
        for col in yc_test.columns
    ])

    # Precision (macro average)
    comp_precision = np.mean([
        precision_score(yc_test[col], yc_pred[col], average='macro', zero_division=0)
        for col in yc_test.columns
    ])

    # Recall (macro average)
    comp_recall = np.mean([
        recall_score(yc_test[col], yc_pred[col], average='macro', zero_division=0)
        for col in yc_test.columns
    ])

    # F1-score (macro average)
    comp_f1 = np.mean([
        f1_score(yc_test[col], yc_pred[col], average='macro', zero_division=0)
        for col in yc_test.columns
    ])

    # Classification Report for first compound (as example)
    for col_name in yc_test.columns:
        print(f"\nClassification Report for {col_name}:")
        print(classification_report(yc_test[col_name], yc_pred[col_name]))

    print("\n" + "="*50)
    print("COMPOUND LABEL MAPPINGS:")
    for i, col_name in enumerate(yc_test.columns):
        print(f"\n{col_name} mapping:")
        encoder = encoders[i]
        unique_encoded = np.unique(yc_test[col_name])
        for encoded_val in unique_encoded:
            original_name = encoder.inverse_transform([encoded_val])[0]
            print(f"  {encoded_val} -> {original_name}")

    # === REGRESSION METRICS ===
    # Mean Absolute Error
    length_mae = mean_absolute_error(yl_test, yl_pred)

    # Mean Squared Error
    length_mse = mean_squared_error(yl_test, yl_pred)

    # Root Mean Squared Error
    length_rmse = np.sqrt(length_mse)

    # R² Score
    length_r2 = r2_score(yl_test, yl_pred)

    # Mean Absolute Percentage Error
    # Avoid division by zero by adding small epsilon
    epsilon = 1e-8
    # length_mape = np.mean(np.abs((yl_test - yl_pred) / (yl_test + epsilon))) * 100

    # Explained Variance Score
    length_evs = explained_variance_score(yl_test, yl_pred)

    # Print all metrics
    print("="*50)
    print("COMPOUND CLASSIFICATION METRICS:")
    print(f"Accuracy (avg over slots): {comp_acc:.3f}")
    print(f"Precision (macro avg): {comp_precision:.3f}")
    print(f"Recall (macro avg): {comp_recall:.3f}")
    print(f"F1-Score (macro avg): {comp_f1:.3f}")

    print("\n" + "="*50)
    print("STINT LENGTH REGRESSION METRICS:")
    print(f"Mean Absolute Error (MAE): {length_mae:.3f}")
    print(f"Mean Squared Error (MSE): {length_mse:.3f}")
    print(f"Root Mean Squared Error (RMSE): {length_rmse:.3f}")
    print(f"R² Score: {length_r2:.3f}")
    # print(f"Mean Absolute Percentage Error (MAPE): {length_mape:.2f}%")
    print(f"Explained Variance Score: {length_evs:.3f}")
    print("="*50)

    # 4) Create Comparison DataFrames

    # --- Compounds Comparison (Decoded) ---
    # `encoders` list should correspond to the order of columns in yc_test/yc_pred
    compound_comparison_cols = {}
    for i, col_name in enumerate(yc_test.columns):
        encoder = encoders[i] # Get the specific encoder for this compound stint
        compound_comparison_cols[f'{col_name}_Actual'] = encoder.inverse_transform(yc_test[col_name])
        compound_comparison_cols[f'{col_name}_Predicted'] = encoder.inverse_transform(yc_pred[col_name])

    compounds_comparison_df = pd.DataFrame(compound_comparison_cols, index=X_test.index)
    # Reorder columns for better side-by-side view if many stints
    ordered_compound_cols = []
    for col_name in yc_test.columns: # e.g., Compound_1, Compound_2
        ordered_compound_cols.append(f'{col_name}_Actual')
        ordered_compound_cols.append(f'{col_name}_Predicted')
    compounds_comparison_df = compounds_comparison_df[ordered_compound_cols]


    # --- Stint Lengths Comparison ---
    # Round predicted lengths to the nearest integer for better comparison
    yl_pred_rounded = yl_pred.round().astype(int)

    length_comparison_cols = {}
    for col_name in yl_test.columns: # e.g., Length_1, Length_2
        length_comparison_cols[f'{col_name}_Actual'] = yl_test[col_name]
        length_comparison_cols[f'{col_name}_Predicted'] = yl_pred_rounded[col_name]

    lengths_comparison_df = pd.DataFrame(length_comparison_cols, index=X_test.index)
    # Reorder columns
    ordered_length_cols = []
    for col_name in yl_test.columns:
        ordered_length_cols.append(f'{col_name}_Actual')
        ordered_length_cols.append(f'{col_name}_Predicted')
    lengths_comparison_df = lengths_comparison_df[ordered_length_cols]

    return clf_model, reg_model, compounds_comparison_df, lengths_comparison_df

In [17]:
def train_and_eval(X_train: pd.DataFrame, X_test: pd.DataFrame,
                   yc_train: pd.DataFrame, yc_test: pd.DataFrame,
                   yl_train: pd.DataFrame, yl_test: pd.DataFrame,
                   encoders: List[LabelEncoder]
                   ) -> tuple:
    """
    Trains compound and stint length models, evaluates them with detailed metrics,
    and creates DataFrames for side-by-side comparison.
    """
    # 1) & 2) Model Training (No changes here)
    # Compounds model
    clf_model = RandomForestClassifier(n_estimators=100, random_state=72, n_jobs=-1)
    clf_model.fit(X_train, yc_train)
    yc_pred = pd.DataFrame(clf_model.predict(X_test), columns=yc_test.columns, index=X_test.index)

    # Lengths model
    reg_model = RandomForestRegressor(n_estimators=100, random_state=72, n_jobs=-1)
    reg_model.fit(X_train, yl_train)
    yl_pred = pd.DataFrame(reg_model.predict(X_test), columns=yl_test.columns, index=X_test.index)

    # -----------------------------------------------------------
    # --- 3) NEW: Expanded Metrics Section ---
    # -----------------------------------------------------------
    print("--- Model Performance Metrics ---")

    # === Compound Classifier Metrics ===
    # Exact Match Ratio: How often was the ENTIRE compound strategy correct?
    exact_match_ratio = accuracy_score(yc_test, yc_pred)
    print(f"\nCompounds - Exact Match Ratio: {exact_match_ratio:.3f}")

    # Per-Stint Accuracy (your original metric, but now shown in detail)
    print("Compounds - Per-Stint Accuracy:")
    for col in yc_test.columns:
        acc = accuracy_score(yc_test[col], yc_pred[col])
        print(f"  - {col}: {acc:.3f}")

    # === Stint Length Regressor Metrics ===
    # Overall MAE (your original metric)
    overall_mae = mean_absolute_error(yl_test, yl_pred)
    print(f"\nLengths - Overall MAE: {overall_mae:.3f} laps")

    # Overall R-squared
    overall_r2 = r2_score(yl_test, yl_pred)
    print(f"Lengths - Overall R-squared (R²): {overall_r2:.3f}")

    # Per-Stint MAE
    print("Lengths - Per-Stint MAE:")
    for col in yl_test.columns:
        mae = mean_absolute_error(yl_test[col], yl_pred[col])
        print(f"  - {col}: {mae:.3f} laps")
    print("-" * 35)

    # 4) Create Comparison DataFrames (No changes needed here)
    # ... (rest of the function is identical) ...
    # --- Compounds Comparison (Decoded) ---
    compound_comparison_cols = {}
    for i, col_name in enumerate(yc_test.columns):
        encoder = encoders[i]
        compound_comparison_cols[f'{col_name}_Actual'] = encoder.inverse_transform(yc_test[col_name])
        compound_comparison_cols[f'{col_name}_Predicted'] = encoder.inverse_transform(yc_pred[col_name])

    compounds_comparison_df = pd.DataFrame(compound_comparison_cols, index=X_test.index)
    ordered_compound_cols = [col for pair in zip([f'{c}_Actual' for c in yc_test.columns], [f'{c}_Predicted' for c in yc_test.columns]) for col in pair]
    compounds_comparison_df = compounds_comparison_df[ordered_compound_cols]

    # --- Stint Lengths Comparison ---
    yl_pred_rounded = yl_pred.round().astype(int)
    length_comparison_cols = {}
    for col_name in yl_test.columns:
        length_comparison_cols[f'{col_name}_Actual'] = yl_test[col_name]
        length_comparison_cols[f'{col_name}_Predicted'] = yl_pred_rounded[col_name]

    lengths_comparison_df = pd.DataFrame(length_comparison_cols, index=X_test.index)
    ordered_length_cols = [col for pair in zip([f'{c}_Actual' for c in yl_test.columns], [f'{c}_Predicted' for c in yl_test.columns]) for col in pair]
    lengths_comparison_df = lengths_comparison_df[ordered_length_cols]

    return clf_model, reg_model, compounds_comparison_df, lengths_comparison_df


In [18]:
MAX_STINTS = 5 # Define this once

df_prepared = prepare_df(transformed_stints, max_stints=MAX_STINTS)

# Now, build_feature_matrix also takes max_stints
X_train, X_test, \
yc_train, yc_test, \
yl_train, yl_test, \
compound_encoders, feature_scaler = build_feature_matrix(df_prepared, max_stints=MAX_STINTS)

# Train and evaluate
clf_model, reg_model, compounds_comp_df, lengths_comp_df = train_and_eval(
    X_train, X_test, 
    yc_train, yc_test, 
    yl_train, yl_test,
    compound_encoders  # <-- Pass the encoders here
)

--- Model Performance Metrics ---


ValueError: multiclass-multioutput is not supported

In [None]:
def simplify_compound_columns(df: pd.DataFrame) -> pd.DataFrame:
    actual_cols = [col for col in df.columns if "Actual" in col]
    predicted_cols = [col for col in df.columns if "Predicted" in col]

    df["Compound_Actual"] = df[actual_cols].values.tolist()
    df["Compound_Predicted"] = df[predicted_cols].values.tolist()

    return df[["Compound_Actual", "Compound_Predicted"]]

simplified_df = simplify_compound_columns(compounds_comp_df)
simplified_df


Unnamed: 0,Compound_Actual,Compound_Predicted
383,"[SOFT, MEDIUM, MEDIUM, NONE, NONE]","[SOFT, SOFT, MEDIUM, NONE, NONE]"
738,"[MEDIUM, HARD, HARD, NONE, NONE]","[MEDIUM, HARD, NONE, NONE, NONE]"
1831,"[MEDIUM, HARD, HARD, NONE, NONE]","[MEDIUM, HARD, HARD, MEDIUM, NONE]"
1651,"[INTERMEDIATE, MEDIUM, MEDIUM, MEDIUM, NONE]","[INTERMEDIATE, MEDIUM, MEDIUM, MEDIUM, MEDIUM]"
2050,"[MEDIUM, MEDIUM, HARD, NONE, NONE]","[MEDIUM, HARD, NONE, NONE, NONE]"
...,...,...
945,"[SOFT, HARD, HARD, SOFT, NONE]","[SOFT, HARD, HARD, SOFT, NONE]"
2111,"[HARD, SOFT, NONE, NONE, NONE]","[MEDIUM, HARD, NONE, NONE, NONE]"
213,"[MEDIUM, HARD, NONE, NONE, NONE]","[MEDIUM, MEDIUM, HARD, NONE, NONE]"
845,"[SOFT, MEDIUM, NONE, NONE, NONE]","[MEDIUM, HARD, MEDIUM, SOFT, NONE]"


In [None]:
lengths_comp_df

Unnamed: 0,Length_1_Actual,Length_1_Predicted,Length_2_Actual,Length_2_Predicted,Length_3_Actual,Length_3_Predicted,Length_4_Actual,Length_4_Predicted,Length_5_Actual,Length_5_Predicted
383,21,25,21,28,23,10,0,0,0,0
738,12,13,14,27,20,9,0,0,0,0
1831,9,11,22,22,24,11,0,8,0,0
1651,26,24,6,6,1,2,30,11,0,13
2050,8,14,16,19,20,3,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
945,9,12,38,32,1,1,3,2,0,0
2111,58,38,19,35,0,0,0,0,0,0
213,36,13,35,40,0,16,0,0,0,0
845,27,28,26,25,0,15,0,18,0,0


In [None]:
def simplify_lengths_columns(df: pd.DataFrame) -> pd.DataFrame:
    actual_cols = [col for col in df.columns if "Actual" in col]
    predicted_cols = [col for col in df.columns if "Predicted" in col]
    
    # Sum the actual and predicted columns
    df["Length_Actual"] = df[actual_cols].sum(axis=1)
    df["Length_Predicted"] = df[predicted_cols].sum(axis=1)
    # Calculate the difference directly from the summed columns
    df["Sum_difference"] = df["Length_Actual"] - df["Length_Predicted"]
    
    return df[["Length_Actual", "Length_Predicted", "Sum_difference"]]

# simplified_lengths_df = simplify_lengths_columns(lengths_comp_df)
# simplified_lengths_df
lengths_comp_df.dtypes


Length_1_Actual       int64
Length_1_Predicted    int64
Length_2_Actual       int64
Length_2_Predicted    int64
Length_3_Actual       int64
Length_3_Predicted    int64
Length_4_Actual       int64
Length_4_Predicted    int64
Length_5_Actual       int64
Length_5_Predicted    int64
dtype: object