In [1]:
import ast
import datetime
import os
import random
import re
import warnings

import autogluon.tabular
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

import dynmodel

pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 200

warnings.filterwarnings("ignore", message="Can't initialize NVML")

In [2]:
DIRECTORY_DATA = 'data/model-autogluon'
os.makedirs(DIRECTORY_DATA, exist_ok=True)

# Preparing data

In [12]:
# FILENAME_DF_ALL_CLEAN = None
#FILENAME_DF_ALL_CLEAN = 'data/20241230_173555/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250128_094430/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250220_094622_halfway/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250220_094622/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250223_150717_halfway/df_all_clean.csv'
FILENAME_DF_ALL_CLEAN = 'data/20250223_150717/df_all_clean.csv'

# FILENAME_DF_ALL_CLEAN_VEHICLES = None
FILENAME_DF_ALL_CLEAN_VEHICLES = 'data/20250223_150717/df_all_clean_vehicles.csv'

In [18]:
SEPARATOR_COL = ': '


def col2parts(col: str) -> tuple[str, str]:
    parts = col.split(SEPARATOR_COL)
    if len(parts) == 1:
        return '', parts[0]
    assert len(parts) == 2
    return parts[0], parts[1]


def load_df_all():
    if FILENAME_DF_ALL_CLEAN_VEHICLES is not None:
        print(FILENAME_DF_ALL_CLEAN_VEHICLES)
        df_all = pd.read_csv(FILENAME_DF_ALL_CLEAN_VEHICLES, header=list(range(2)), index_col=[0, 1])
        df_all.index.set_names(['Scenario ID', 'Vehicle'], inplace=True)
        df_all.columns = [SEPARATOR_COL.join(col) for col in df_all.columns]
        
        df_all = df_all[df_all.index.get_level_values('Scenario ID').str.contains('slowness no, forcing no')]  # baseline
        df_all = df_all[df_all.index.get_level_values('Vehicle') != 'V0']
    
    elif FILENAME_DF_ALL_CLEAN is not None:
        print(FILENAME_DF_ALL_CLEAN)
        df_all = pd.read_csv(FILENAME_DF_ALL_CLEAN, header=list(range(2)), index_col=0)
        df_all.index.set_names(['Scenario ID'], inplace=True)
        df_all.columns = [SEPARATOR_COL.join(col) for col in df_all.columns]
        
        df_all = df_all[df_all.index.get_level_values('Scenario ID').str.contains('slowness no, forcing no')]  # baseline
        
    else:
        runnames = [
            '20241203_170129_all600',
            '20241213_104400_racing',
            '20241214_122216_racing_passhum',
        ]
        df_all = pd.concat(
            [pd.read_csv(f'data/{runname}/df_all.csv') 
             for runname in runnames],
            keys=runnames,
            names=['runname', 'row']
        )
        assert all(SEPARATOR_COL not in col for col in df_all.columns), df_all.columns
            
    return df_all

df_all = load_df_all()
df_all.info()

data/20250223_150717/df_all_clean_vehicles.csv
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 600 entries, ('map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no', 'V1') to ('map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness no, forcing no', 'V3')
Columns: 123 entries, Violation type: Priority violation to Output of simulation (execution): Near-miss rate
dtypes: bool(4), float64(117), object(2)
memory usage: 649.3+ KB


In [39]:
list(df_all.columns)

['Violation type: Priority violation',
 'Violation type: Speed violation',
 'Violation type: Priority violation and Speed violation',
 'Coordination strategy: Probability of forcing',
 'Coordination strategy: Probability of stops on forcing',
 'Coordination strategy: Rerouting',
 'Static map features: ~ Map ID',
 'Static map features: No. of OPs',
 'Static map features: Connectivity',
 'Static map features: ~ Position',
 'Output of simulation (planning): POD score',
 'Output of simulation (planning): Mean POD score for AVs (V1-V3)',
 'Output of simulation (planning): POD C 0',
 'Output of simulation (planning): POD C 1',
 'Output of simulation (planning): POD C 2',
 'Output of simulation (planning): POD C 3',
 'Output of simulation (planning): POD C 4',
 'Output of simulation (planning): POD C 5',
 'Output of simulation (planning): POD C 6',
 'Output of simulation (planning): POD C 7',
 'Output of simulation (planning): POD C 8',
 'Output of simulation (planning): POD C 9',
 'Output of

In [43]:
ID_EXPERIMENT_TEXT = '100'

In [79]:
ID_EXPERIMENT_TEXT = f'{ID_EXPERIMENT_TEXT:03}'
ID_EXPERIMENT = tuple(int(c) for c in ID_EXPERIMENT_TEXT) 


def series2values(series):
    dtype = series.dtype
    if dtype == 'bool':
        return series.astype('int').values
    if dtype in ('int64', 'float64'):
        return series.values
    if dtype == 'object':
        return series.astype('category').cat.codes
    raise TypeError(f'{dtype} is not supported')
    

if FILENAME_DF_ALL_CLEAN_VEHICLES is not None or FILENAME_DF_ALL_CLEAN is not None:
    COLUMNS_NONSPLIT = {
        col: series2values
        for col in [
            'Static map features: ~ Position',  # includes i_map, i_position
        ]
    }
    COLUMNS_INPUT = {col: series2values 
                     # for col in (
                     #    'Static map features: Connectivity',
                     # )}
                     for col in df_all.columns 
                     if col2parts(col)[0] != 'Output of simulation (execution)'
                     and not col2parts(col)[1].startswith('~ ')}
    # COLUMNS_INPUT.pop('Static map features: No. of OPs')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for MV (V0)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V1)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V2)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V3)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Total Mission length for AVs (V1-V3)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Total Mission length for AVs (V1-V3)')  # TODO
    # COLUMNS_INPUT = {key: value for key, value in COLUMNS_INPUT.items()
    #                  if 'No. of OPs' not in key and 'POD' not in key}   
    # COLUMNS_INPUT = {key: value for key, value in COLUMNS_INPUT.items()
    #                  if 'POD C' not in key}      
    COLUMNS_INPUT = {
        col: value for col, value in COLUMNS_INPUT.items()
        if ID_EXPERIMENT[0] and re.search(r': POD score$' if FILENAME_DF_ALL_CLEAN_VEHICLES is not None else 'POD score for AV', col)
        or ID_EXPERIMENT[1] and re.search(r': POD C \d+$' if FILENAME_DF_ALL_CLEAN_VEHICLES is not None else ': POD C for AV', col)
        or ID_EXPERIMENT[2] and re.search(r': Mission length$' if FILENAME_DF_ALL_CLEAN_VEHICLES is not None else 'Mission length for AV', col)
    }
            
    COLUMNS_OUTPUT = {col: series2values
                      for col in [
                          (
                              'Output of simulation (execution): No. of completed missions'
                              if FILENAME_DF_ALL_CLEAN_VEHICLES is not None else
                              'Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)'
                          ),
                        # 'Output of simulation (execution): Collision rate',
                      ]}
                      # for col in df_all.columns 
                      # if col2parts(col)[0] == 'Output of simulation (execution)'
                      # and not col2parts(col)[1].startswith('~ ')}
else:
    COLUMNS_NONSPLIT = {
        'Scenario ID': lambda series: series.values,
    }
    COLUMNS_INPUT = {
        'Coordination strategy': lambda series: series.astype('category').cat.codes.values,
        'isCanPassFirstHum': lambda series: series.astype('int').values,
        'isRacingThroughCrossroadAllowed': lambda series: series.astype('int').values,
        
        'Vehicle ID': lambda series: series.values,
        'Linearization C': lambda series: [
            series.apply(lambda x: -1 if x is None else x[i])
            for i in range(len(series.dropna().iloc[0]))
        ],   
    }
    COLUMNS_OUTPUT = {
        'traveled total, m': lambda series: series.values,
        'No. of completed missions': lambda series: series.values,
        'No. of collisions': lambda series: series.values,
        'No. of near-misses': lambda series: series.values,
    }

COLUMNS_ALL = {**COLUMNS_NONSPLIT, **COLUMNS_INPUT, **COLUMNS_OUTPUT}
COLUMNS_ALL

{'Static map features: ~ Position': <function __main__.series2values(series)>,
 'Output of simulation (planning): POD score': <function __main__.series2values(series)>,
 'Output of simulation (execution): No. of completed missions': <function __main__.series2values(series)>}

In [80]:
df_inout = df_all[list(COLUMNS_ALL)]
df_inout

Unnamed: 0_level_0,Unnamed: 1_level_0,Static map features: ~ Position,Output of simulation (planning): POD score,Output of simulation (execution): No. of completed missions
Scenario ID,Vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V1,1-1,0.104,22.0
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V2,1-1,0.058,8.0
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V3,1-1,0.149,5.0
"map-generator/generated-maps/3_with_bridges/scenario1-10.json, passhum 0, slowness no, forcing no",V1,1-10,0.078,23.0
"map-generator/generated-maps/3_with_bridges/scenario1-10.json, passhum 0, slowness no, forcing no",V2,1-10,0.044,10.0
...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario9-8.json, passhum 0, slowness no, forcing no",V2,9-8,0.256,7.0
"map-generator/generated-maps/3_without_bridges/scenario9-8.json, passhum 0, slowness no, forcing no",V3,9-8,0.129,9.0
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness no, forcing no",V1,9-9,0.652,7.0
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness no, forcing no",V2,9-9,0.348,6.0


In [81]:
def parse_tuple_string(tuple_string):
    if pd.isna(tuple_string):
        return None
    return ast.literal_eval(tuple_string)


def preprocess_inout(df_inout):
    df_inout = df_inout.copy()
    
    linearization_columns = [col for col in df_inout.columns if isinstance(col, str) and col.startswith('Linearization')]
    for col in linearization_columns:
        df_inout[col] = df_inout[col].apply(parse_tuple_string)
        
    dict_preprocessed = {}
    for col, series2data in COLUMNS_ALL.items():
        data = series2data(df_inout[col])
        if not isinstance(data, list):
            dict_preprocessed[col] = data
        else:
            for i, series in enumerate(data):
                dict_preprocessed[f'{col}#{i}'] = series
    
    return pd.DataFrame(dict_preprocessed)
    

df_preprocessed = preprocess_inout(df_inout)
df_preprocessed

Unnamed: 0_level_0,Unnamed: 1_level_0,Static map features: ~ Position,Output of simulation (planning): POD score,Output of simulation (execution): No. of completed missions
Scenario ID,Vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V1,0,0.104,22.0
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V2,0,0.058,8.0
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V3,0,0.149,5.0
"map-generator/generated-maps/3_with_bridges/scenario1-10.json, passhum 0, slowness no, forcing no",V1,1,0.078,23.0
"map-generator/generated-maps/3_with_bridges/scenario1-10.json, passhum 0, slowness no, forcing no",V2,1,0.044,10.0
...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario9-8.json, passhum 0, slowness no, forcing no",V2,98,0.256,7.0
"map-generator/generated-maps/3_without_bridges/scenario9-8.json, passhum 0, slowness no, forcing no",V3,98,0.129,9.0
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness no, forcing no",V1,99,0.652,7.0
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness no, forcing no",V2,99,0.348,6.0


In [82]:
def show(obj, title=None):
    if title is not None:
        display(HTML(f"<h3>{title}</h3>"))
    display(obj)


def shuffle_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    # Shuffle by Scenario ID
    unique_scenarios = df['Scenario ID'].unique()  # Get unique Scenario IDs
    shuffled_scenarios = pd.Series(unique_scenarios).sample(frac=1, random_state=1).tolist()  # Shuffle Scenario IDs
    
    # Reorder the dataframe by the shuffled Scenario IDs
    df_shuffled = pd.concat([df[df['Scenario ID'] == scenario] for scenario in shuffled_scenarios])
    
    # Reset index (optional)
    df_shuffled = df_shuffled.reset_index(drop=True)
    
    return df_shuffled
    """
    return df.sample(frac=1, random_state=1)


def split_train_test(df):
    test_size = 0.2
    
    if not COLUMNS_NONSPLIT:
        return sklearn.model_selection.train_test_split(df, test_size=test_size, random_state=1)
    
    gss = sklearn.model_selection.GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=1)
    column, = list(COLUMNS_NONSPLIT)
    groups = df[column]
    
    # Split the data
    for train_idx, test_idx in gss.split(df, groups=groups):
        return shuffle_df(df.iloc[train_idx]), shuffle_df(df.iloc[test_idx])


df_train, df_test = split_train_test(df_preprocessed)
show(df_train, 'df_train')
show(df_test, 'df_test')
if 'Static map features: ~ Position' in df_train.columns:
    assert not set(df_train['Static map features: ~ Position']) & set(df_test['Static map features: ~ Position']) 

Unnamed: 0_level_0,Unnamed: 1_level_0,Static map features: ~ Position,Output of simulation (planning): POD score,Output of simulation (execution): No. of completed missions
Scenario ID,Vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"map-generator/generated-maps/3_without_bridges/scenario10-9.json, passhum 0, slowness no, forcing no",V3,19,0.143,6.0
"map-generator/generated-maps/3_without_bridges/scenario10-5.json, passhum 0, slowness no, forcing no",V2,15,0.160,14.0
"map-generator/generated-maps/3_with_bridges/scenario10-2.json, passhum 0, slowness no, forcing no",V2,12,0.195,8.0
"map-generator/generated-maps/3_without_bridges/scenario5-7.json, passhum 0, slowness no, forcing no",V3,57,0.748,1.0
"map-generator/generated-maps/3_without_bridges/scenario7-6.json, passhum 0, slowness no, forcing no",V2,76,0.911,2.0
...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario1-5.json, passhum 0, slowness no, forcing no",V1,5,0.207,18.0
"map-generator/generated-maps/3_with_bridges/scenario2-6.json, passhum 0, slowness no, forcing no",V1,26,0.062,10.0
"map-generator/generated-maps/3_without_bridges/scenario6-3.json, passhum 0, slowness no, forcing no",V1,63,0.175,11.0
"map-generator/generated-maps/3_with_bridges/scenario9-8.json, passhum 0, slowness no, forcing no",V2,98,0.260,7.0


Unnamed: 0_level_0,Unnamed: 1_level_0,Static map features: ~ Position,Output of simulation (planning): POD score,Output of simulation (execution): No. of completed missions
Scenario ID,Vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"map-generator/generated-maps/3_without_bridges/scenario6-5.json, passhum 0, slowness no, forcing no",V3,65,0.042,8.0
"map-generator/generated-maps/3_with_bridges/scenario9-2.json, passhum 0, slowness no, forcing no",V1,92,0.652,0.0
"map-generator/generated-maps/3_with_bridges/scenario9-3.json, passhum 0, slowness no, forcing no",V3,93,0.038,8.0
"map-generator/generated-maps/3_without_bridges/scenario9-3.json, passhum 0, slowness no, forcing no",V1,93,0.653,5.0
"map-generator/generated-maps/3_without_bridges/scenario3-6.json, passhum 0, slowness no, forcing no",V3,36,0.037,8.0
...,...,...,...,...
"map-generator/generated-maps/3_with_bridges/scenario3-2.json, passhum 0, slowness no, forcing no",V1,32,0.705,7.0
"map-generator/generated-maps/3_without_bridges/scenario3-3.json, passhum 0, slowness no, forcing no",V1,33,0.790,4.0
"map-generator/generated-maps/3_with_bridges/scenario3-3.json, passhum 0, slowness no, forcing no",V1,33,0.505,5.0
"map-generator/generated-maps/3_without_bridges/scenario8-10.json, passhum 0, slowness no, forcing no",V3,81,0.209,5.0


In [88]:
df_train.sort_values(by='Scenario ID')

Unnamed: 0_level_0,Unnamed: 1_level_0,Static map features: ~ Position,Output of simulation (planning): POD score,Output of simulation (execution): No. of completed missions
Scenario ID,Vehicle,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V1,0,0.104,22.0
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V2,0,0.058,8.0
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",V3,0,0.149,5.0
"map-generator/generated-maps/3_with_bridges/scenario1-10.json, passhum 0, slowness no, forcing no",V1,1,0.078,23.0
"map-generator/generated-maps/3_with_bridges/scenario1-10.json, passhum 0, slowness no, forcing no",V3,1,0.025,8.0
...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario9-8.json, passhum 0, slowness no, forcing no",V1,98,0.756,5.0
"map-generator/generated-maps/3_without_bridges/scenario9-8.json, passhum 0, slowness no, forcing no",V3,98,0.129,9.0
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness no, forcing no",V3,99,0.033,8.0
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness no, forcing no",V1,99,0.652,7.0


# Logistic regression (as a baseline)

In [83]:
def split_df_to_X_y(df):
    columns_input_df = [col for col in df.columns
                        if (col if not isinstance(col, str) else col.split('#')[0]) in COLUMNS_INPUT]
    columns_output_df = list(COLUMNS_OUTPUT)
    assert set(COLUMNS_NONSPLIT) | set(columns_input_df) | set(columns_output_df) == set(df.columns)
    
    X = df[columns_input_df]
    y = df[columns_output_df]    
    return X, y


def run_regression(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    model = sklearn.linear_model.LinearRegression()
    model.fit(X_train, y_train)
    ndarray_predictions = model.predict(X_test)
    df_predictions = pd.DataFrame(ndarray_predictions, columns=y_test.columns)
    return df_predictions


df_predictions_regression = run_regression(df_train, df_test)
df_predictions_regression

Unnamed: 0,Output of simulation (execution): No. of completed missions
0,8.564000
1,5.356301
2,8.585034
3,5.351042
4,8.590292
...,...
115,5.077599
116,4.630625
117,6.129304
118,7.685826


In [84]:
def save_and_show(fig, basename):  # to avoid inlining large image data into the notebook file
    filename = f'{DIRECTORY_DATA}/{basename}-{random.random()}.png'
    fig.savefig(filename)
    
    # The `random` is because of https://stackoverflow.com/a/43640705.
    display(HTML(f'<img src="{filename}?{random.random()}" alt="{basename}" title="{datetime.datetime.now()}" />'))
    
    plt.close(fig)
    
    return filename


def evaluate_and_plot_column(df_test, df_predictions, column, *, is_plot=True):
    y_test_column = df_test[column]
    predictions_column = df_predictions[column]
    
    r2 = sklearn.metrics.r2_score(y_test_column, predictions_column)
    name = col2parts(column)[1]
    print(f"{name}:")
    print(f"- R^2 Score: {r2}")
    
    if not is_plot:
        return

    # Plot results for each output column
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(y_test_column, predictions_column, color='blue', alpha=0.5)
    plt.plot([y_test_column.min(), y_test_column.max()], [y_test_column.min(), y_test_column.max()], 'k--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted Values for {name}')
    plt.grid(True)
    save_and_show(fig, f'Actual_vs_Predicted_Values_{name}')


def evaluate_and_plot_all_columns(df_test, df_predictions):
    for column in COLUMNS_OUTPUT:
        evaluate_and_plot_column(df_test, df_predictions, column, is_plot=False)
        
        
evaluate_and_plot_all_columns(df_test, df_predictions_regression)

No. of completed missions:
- R^2 Score: 0.15051795646163457


# AutoGluon

In [85]:
def run_autogluon(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    # Train AutoGluon models
    predictors = []
    df_predictions = pd.DataFrame()
    for column in COLUMNS_OUTPUT:
        print(f'{column=}:')
        df_train_predictor = pd.concat([X_train, y_train[[column]]], axis=1)
        preset = 'medium'  # medium (~1 min.), good (~15 min.), high (~2 h)
        
        predictor = autogluon.tabular.TabularPredictor(
            label=column, 
            eval_metric='r2', 
            problem_type='regression',
        ).fit(
            df_train_predictor,
            presets=preset,
            hyperparameters={
                'GBM': {},       # LightGBM (TODO: something like `GBMLarge`)
                'XGB': {},       # XGBoost
                'RF': {},        # Random Forest
                'XT': {},        # Extra Trees
                # 'CAT': {},      # CatBoost, omitted if slow
                # 'NN': {},       # Neural net, if you want it
                # 'LR': {},       # Linear model
                # 'KNN': {},      # K-Nearest Neighbors
            },
        )
        predictors.append(predictor)
        
        df_predictions[column] = predictor.predict(X_test)

        # Leaderboard - Display a table of different models and their performance
        df_test_predictor = pd.concat([X_test, y_test[[column]]], axis=1)
        leaderboard = predictor.leaderboard(df_test_predictor, silent=True)
        dynmodel.process_leaderboard(leaderboard, os.path.dirname(FILENAME_DF_ALL_CLEAN), column, f'{preset} {ID_EXPERIMENT_TEXT}')
        
        # Feature importance on training data
        # show(
        #     predictor.feature_importance(df_train_predictor),
        #     'feature_importance(df_train_predictor)'
        # )
        # 
        # # Feature importance on test data
        # show(
        #     predictor.feature_importance(df_test_predictor),
        #     'feature_importance(df_test_predictor)'
        # )
        # Example: SHAP values for a specific model
        # shap_values = predictor.get_model_shap_values(df_test_predictor, model='LightGBM')
        # show(shap_values, 'shap_values')  # SHAP values for each feature and each prediction
        
    return predictors, df_predictions


predictors, df_predictions_autogluon = run_autogluon(df_train, df_test)
df_predictions_autogluon

No path specified. Models will be saved in: "AutogluonModels/ag-20250226_121246"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #141~20.04.1-Ubuntu SMP Thu Jan 16 18:38:51 UTC 2025
CPU Count:          16
Memory Avail:       5.19 GB / 31.09 GB (16.7%)
Disk Space Avail:   209.27 GB / 693.60 GB (30.2%)
Presets specified: ['medium']
Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250226_121246"
Train Data Rows:    480
Train Data Columns: 1
Label Column:       Output of simulation (execution): No. of completed missions
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5314.00 MB
	Train Data (Original)  Memory U

column='Output of simulation (execution): No. of completed missions':


	0.3116	 = Validation score   (r2)
	2.82s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForest ...
	0.2349	 = Validation score   (r2)
	0.96s	 = Training   runtime
	0.1s	 = Validation runtime
Fitting model: ExtraTrees ...
	0.3024	 = Validation score   (r2)
	1.11s	 = Training   runtime
	0.11s	 = Validation runtime
Fitting model: XGBoost ...
	0.3555	 = Validation score   (r2)
	1.59s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'XGBoost': 0.609, 'ExtraTrees': 0.217, 'LightGBM': 0.174}
	0.3615	 = Validation score   (r2)
	0.11s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 7.04s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 834.9 rows/s (96 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250226_121246")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
1,WeightedEnsemble_L2,-0.216,0.361,r2,0.169,0.115,5.629,0.006,0.002,0.106,2,True,5
2,XGBoost,-0.256,0.355,r2,0.024,0.005,1.588,0.024,0.005,1.588,1,True,4
0,LightGBM,0.03,0.312,r2,0.002,0.001,2.824,0.002,0.001,2.824,1,True,1
3,ExtraTrees,-0.45,0.302,r2,0.136,0.106,1.11,0.136,0.106,1.11,1,True,3
4,RandomForest,-0.534,0.235,r2,0.149,0.098,0.955,0.149,0.098,0.955,1,True,2


Unnamed: 0_level_0,Unnamed: 1_level_0,Output of simulation (execution): No. of completed missions
Scenario ID,Vehicle,Unnamed: 2_level_1
"map-generator/generated-maps/3_without_bridges/scenario6-5.json, passhum 0, slowness no, forcing no",V3,8.383085
"map-generator/generated-maps/3_with_bridges/scenario9-2.json, passhum 0, slowness no, forcing no",V1,2.717607
"map-generator/generated-maps/3_with_bridges/scenario9-3.json, passhum 0, slowness no, forcing no",V3,8.455364
"map-generator/generated-maps/3_without_bridges/scenario9-3.json, passhum 0, slowness no, forcing no",V1,2.729346
"map-generator/generated-maps/3_without_bridges/scenario3-6.json, passhum 0, slowness no, forcing no",V3,8.444225
...,...,...
"map-generator/generated-maps/3_with_bridges/scenario3-2.json, passhum 0, slowness no, forcing no",V1,6.368776
"map-generator/generated-maps/3_without_bridges/scenario3-3.json, passhum 0, slowness no, forcing no",V1,4.368273
"map-generator/generated-maps/3_with_bridges/scenario3-3.json, passhum 0, slowness no, forcing no",V1,5.189765
"map-generator/generated-maps/3_without_bridges/scenario8-10.json, passhum 0, slowness no, forcing no",V3,14.996902


## evaluate_and_plot_all_columns

In [86]:
evaluate_and_plot_all_columns(df_test, df_predictions_autogluon)

No. of completed missions:
- R^2 Score: -0.21615311248434121


## explain_predictions

In [87]:
def explain_predictions(predictors):
    # X_test, y_test = split_df_to_X_y(df_test)
    
    for column, predictor in zip(COLUMNS_OUTPUT, predictors):
        # df_test_predictor = pd.concat([X_test, y_test[[column]]], axis=1)
        if column not in (
            'Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)',
            'Output of simulation (execution): Collision rate',
        ):
            continue
        
        for model in 'LightGBM', 'XGBoost':
            tree_model = predictor._trainer.load_model(model)
            if model == 'LightGBM':
                tree_importance = tree_model.model.feature_importance(importance_type='gain')
            elif model == 'XGBoost':
                tree_importance = tree_model.model.feature_importances_
            else:
                raise ValueError(model)
            # show(tree_importance, column)
                
            
            feature_names = predictor.feature_metadata.get_features()
            df = pd.DataFrame.from_dict(
                {name: {'importance': value} for name, value in zip(feature_names, tree_importance)},
                orient='index'
            )
            # print(df.index)
            # Group indexes by their base name before `#`
            df['group'] = df.index.str.extract(r'^(.+?)(?:#\d+)?$', expand=False)
            df = df.groupby('group')['importance'].sum().to_frame()
            
            df.sort_values(by='importance', ascending=False, inplace=True)
            show(df, f'{model}: {column}')
        
        
explain_predictions(predictors)