In [1]:
import ast
import datetime
import os
import random
import warnings

import autogluon.tabular
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 200

warnings.filterwarnings("ignore", message="Can't initialize NVML")

In [2]:
DIRECTORY_DATA = 'data/model-autogluon'
os.makedirs(DIRECTORY_DATA, exist_ok=True)

# Preparing data

In [3]:
# FILENAME_DF_ALL_CLEAN = None
#FILENAME_DF_ALL_CLEAN = 'data/20241230_173555/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250128_094430/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250220_094622_halfway/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250220_094622/df_all_clean.csv'
FILENAME_DF_ALL_CLEAN = 'data/20250223_150717_halfway/df_all_clean.csv'

In [4]:
SEPARATOR_COL = ': '


def col2parts(col: str) -> tuple[str, str]:
    parts = col.split(SEPARATOR_COL)
    if len(parts) == 1:
        return '', parts[0]
    assert len(parts) == 2
    return parts[0], parts[1]


if FILENAME_DF_ALL_CLEAN is not None:
    df_all = pd.read_csv(FILENAME_DF_ALL_CLEAN, header=list(range(2)), index_col=0)
    df_all.columns = [SEPARATOR_COL.join(col) for col in df_all.columns]
else:
    runnames = [
        '20241203_170129_all600',
        '20241213_104400_racing',
        '20241214_122216_racing_passhum',
    ]
    df_all = pd.concat(
        [pd.read_csv(f'data/{runname}/df_all.csv') 
         for runname in runnames],
        keys=runnames,
        names=['runname', 'row']
    )
    assert all(SEPARATOR_COL not in col for col in df_all.columns), df_all.columns

print(FILENAME_DF_ALL_CLEAN)
df_all.info()

data/20250223_150717_halfway/df_all_clean.csv
<class 'pandas.core.frame.DataFrame'>
Index: 1733 entries, map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing change of priorities to map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%
Columns: 438 entries, Violation type: Priority violation to Output of simulation (execution): Near-miss rate
dtypes: bool(6), float64(408), int64(22), object(2)
memory usage: 5.7+ MB


In [5]:
# df_all = df_all[~df_all.index.str.contains('50% stops')]
# df_all.info()

In [6]:
def series2values(series):
    dtype = series.dtype
    if dtype == 'bool':
        return series.astype('int').values
    if dtype in ('int64', 'float64'):
        return series.values
    if dtype == 'object':
        return series.astype('category').cat.codes
    raise TypeError(f'{dtype} is not supported')
    

if FILENAME_DF_ALL_CLEAN is not None:
    COLUMNS_NONSPLIT = {
        col: series2values
        for col in [
            'Static map features: ~ Position',  # includes i_map, i_position
        ]
    }
    COLUMNS_INPUT = {col: series2values 
                     # for col in (
                     #    'Static map features: Connectivity',
                     # )}
                     for col in df_all.columns 
                     if col2parts(col)[0] != 'Output of simulation (execution)'
                     and not col2parts(col)[1].startswith('~ ')}
    # COLUMNS_INPUT.pop('Static map features: No. of OPs')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for MV (V0)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V1)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V2)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V3)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Total Mission length for AVs (V1-V3)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Total Mission length for AVs (V1-V3)')  # TODO
    # COLUMNS_INPUT = {key: value for key, value in COLUMNS_INPUT.items()
    #                  if 'No. of OPs' not in key and 'POD' not in key}   
    COLUMNS_INPUT = {key: value for key, value in COLUMNS_INPUT.items()
                     if 'POD C' not in key}   
    COLUMNS_OUTPUT = {col: series2values
                      for col in (
                        'Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)',
                        'Output of simulation (execution): Collision rate'
                      )}
                      # for col in df_all.columns 
                      # if col2parts(col)[0] == 'Output of simulation (execution)'
                      # and not col2parts(col)[1].startswith('~ ')}
else:
    COLUMNS_NONSPLIT = {
        'Scenario ID': lambda series: series.values,
    }
    COLUMNS_INPUT = {
        'Coordination strategy': lambda series: series.astype('category').cat.codes.values,
        'isCanPassFirstHum': lambda series: series.astype('int').values,
        'isRacingThroughCrossroadAllowed': lambda series: series.astype('int').values,
        
        'Vehicle ID': lambda series: series.values,
        'Linearization C': lambda series: [
            series.apply(lambda x: -1 if x is None else x[i])
            for i in range(len(series.dropna().iloc[0]))
        ],   
    }
    COLUMNS_OUTPUT = {
        'traveled total, m': lambda series: series.values,
        'No. of completed missions': lambda series: series.values,
        'No. of collisions': lambda series: series.values,
        'No. of near-misses': lambda series: series.values,
    }

COLUMNS_ALL = {**COLUMNS_NONSPLIT, **COLUMNS_INPUT, **COLUMNS_OUTPUT}
COLUMNS_ALL

{'Static map features: ~ Position': <function __main__.series2values(series)>,
 'Violation type: Priority violation': <function __main__.series2values(series)>,
 'Violation type: Speed violation': <function __main__.series2values(series)>,
 'Violation type: Priority violation and Speed violation': <function __main__.series2values(series)>,
 'Coordination strategy: Change of priorities': <function __main__.series2values(series)>,
 'Coordination strategy: Stops': <function __main__.series2values(series)>,
 'Coordination strategy: Rerouting': <function __main__.series2values(series)>,
 'Static map features: No. of OPs': <function __main__.series2values(series)>,
 'Static map features: Connectivity': <function __main__.series2values(series)>,
 'Output of simulation (planning): POD score for MV (V0)': <function __main__.series2values(series)>,
 'Output of simulation (planning): POD score for AV (V1)': <function __main__.series2values(series)>,
 'Output of simulation (planning): POD score fo

In [7]:
df_inout = df_all[list(COLUMNS_ALL)]
df_inout

Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing change of priorities",1-1,True,False,False,True,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,32,0.333
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing ignoring human",1-1,True,False,False,False,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,36,0.318
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",1-1,False,False,False,False,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,35,0.000
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops",1-1,True,False,False,False,True,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.312
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops 50%",1-1,True,False,False,False,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing ignoring human",9-9,False,False,True,False,False,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,21,2.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing no",9-9,False,True,False,False,False,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,17,0.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops",9-9,False,False,True,False,True,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,11,0.500
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops 50%",9-9,False,False,True,False,False,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,12,0.667


In [8]:
def parse_tuple_string(tuple_string):
    if pd.isna(tuple_string):
        return None
    return ast.literal_eval(tuple_string)


def preprocess_inout(df_inout):
    df_inout = df_inout.copy()
    
    linearization_columns = [col for col in df_inout.columns if isinstance(col, str) and col.startswith('Linearization')]
    for col in linearization_columns:
        df_inout[col] = df_inout[col].apply(parse_tuple_string)
        
    dict_preprocessed = {}
    for col, series2data in COLUMNS_ALL.items():
        data = series2data(df_inout[col])
        if not isinstance(data, list):
            dict_preprocessed[col] = data
        else:
            for i, series in enumerate(data):
                dict_preprocessed[f'{col}#{i}'] = series
    
    return pd.DataFrame(dict_preprocessed)
    

df_preprocessed = preprocess_inout(df_inout)
df_preprocessed

Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing change of priorities",0,1,0,0,1,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,32,0.333
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing ignoring human",0,1,0,0,0,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,36,0.318
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",0,0,0,0,0,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,35,0.000
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops",0,1,0,0,0,1,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.312
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops 50%",0,1,0,0,0,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing ignoring human",62,0,0,1,0,0,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,21,2.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing no",62,0,1,0,0,0,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,17,0.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops",62,0,0,1,0,1,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,11,0.500
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops 50%",62,0,0,1,0,0,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,12,0.667


In [9]:
def show(obj, title=None):
    if title is not None:
        display(HTML(f"<h3>{title}</h3>"))
    display(obj)


def shuffle_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    # Shuffle by Scenario ID
    unique_scenarios = df['Scenario ID'].unique()  # Get unique Scenario IDs
    shuffled_scenarios = pd.Series(unique_scenarios).sample(frac=1, random_state=1).tolist()  # Shuffle Scenario IDs
    
    # Reorder the dataframe by the shuffled Scenario IDs
    df_shuffled = pd.concat([df[df['Scenario ID'] == scenario] for scenario in shuffled_scenarios])
    
    # Reset index (optional)
    df_shuffled = df_shuffled.reset_index(drop=True)
    
    return df_shuffled
    """
    return df.sample(frac=1, random_state=1)


def split_train_test(df):
    test_size = 0.2
    
    if not COLUMNS_NONSPLIT:
        return sklearn.model_selection.train_test_split(df, test_size=test_size, random_state=1)
    
    gss = sklearn.model_selection.GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=1)
    column, = list(COLUMNS_NONSPLIT)
    groups = df[column]
    
    # Split the data
    for train_idx, test_idx in gss.split(df, groups=groups):
        return shuffle_df(df.iloc[train_idx]), shuffle_df(df.iloc[test_idx])


df_train, df_test = split_train_test(df_preprocessed)
show(df_train, 'df_train')
show(df_test, 'df_test')
if 'Static map features: ~ Position' in df_train.columns:
    assert not set(df_train['Static map features: ~ Position']) & set(df_test['Static map features: ~ Position']) 

Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_without_bridges/scenario3-1.json, passhum 0, slowness no, forcing change of priorities",17,1,0,0,1,0,0,1,1,0.770,0.997,0.829,0.077,0.692,706,908,612,588,2108,702.667,20,0.545
"map-generator/generated-maps/3_without_bridges/scenario3-1.json, passhum 0, slowness no, forcing no",17,0,0,0,0,0,0,1,1,0.770,0.997,0.829,0.077,0.692,706,908,612,588,2108,702.667,21,0.000
"map-generator/generated-maps/3_without_bridges/scenario3-6.json, passhum 0, slowness without rerouting, forcing no",23,0,1,0,0,0,0,1,1,0.531,0.624,0.304,0.037,0.372,706,908,657,550,2115,705.000,19,0.000
"map-generator/generated-maps/3_without_bridges/scenario3-8.json, passhum 0, slowness without rerouting, forcing stops 50%",25,0,0,1,0,0,0,1,1,0.753,0.846,0.618,0.172,0.523,706,908,540,978,2426,808.667,11,0.333
"map-generator/generated-maps/3_with_bridges/scenario1-4.json, passhum 0, slowness with rerouting, forcing stops 50%",4,0,0,1,0,0,1,2,0,0.244,0.055,0.039,0.165,0.106,380,443,535,1017,1995,665.000,19,0.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_with_bridges/scenario9-3.json, passhum 0, slowness with rerouting, forcing change of priorities",56,0,0,1,1,0,1,1,0,0.645,0.656,0.033,0.038,0.379,891,1360,525,572,2457,819.000,14,0.000
"map-generator/generated-maps/3_without_bridges/scenario2-1.json, passhum 0, slowness no, forcing change of priorities",10,1,0,0,1,0,0,1,1,0.164,0.190,0.048,0.067,0.110,479,638,521,452,1611,537.000,23,0.286
"map-generator/generated-maps/3_without_bridges/scenario4-2.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%",28,0,0,1,0,0,0,1,1,0.478,0.129,0.379,0.437,0.347,510,587,971,1076,2634,878.000,11,0.250
"map-generator/generated-maps/3_with_bridges/scenario2-5.json, passhum 0, slowness without rerouting, forcing stops",14,0,0,1,0,1,0,1,0,0.278,0.107,0.056,0.230,0.158,425,556,431,1010,1997,665.667,16,0.167


Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario7-1.json, passhum 0, slowness without rerouting, forcing stops 50%",42,0,0,1,0,0,0,1,0,0.570,0.024,0.483,0.683,0.496,455,492,821,1302,2615,871.667,10,1.000
"map-generator/generated-maps/3_with_bridges/scenario7-9.json, passhum 0, slowness no, forcing change of priorities",51,1,0,0,1,0,0,1,0,0.515,0.069,0.381,0.637,0.438,455,537,1076,1302,2915,971.667,16,0.520
"map-generator/generated-maps/3_with_bridges/scenario5-6.json, passhum 0, slowness with rerouting, forcing change of priorities",38,0,0,1,1,0,1,1,0,0.729,0.043,0.069,0.838,0.543,626,354,531,1453,2338,779.333,17,0.250
"map-generator/generated-maps/3_without_bridges/scenario7-1.json, passhum 0, slowness no, forcing change of priorities",42,1,0,0,1,0,0,1,1,0.743,0.078,0.641,0.836,0.634,454,509,900,1373,2782,927.333,16,0.263
"map-generator/generated-maps/3_without_bridges/scenario5-9.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%",41,0,0,1,0,0,0,1,1,0.875,0.195,0.433,0.991,0.655,627,608,931,1449,2988,996.000,13,0.333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario1-2.json, passhum 0, slowness no, forcing change of priorities",2,1,0,0,1,0,0,2,1,0.151,0.158,0.063,0.005,0.058,754,260,523,533,1316,438.667,36,0.250
"map-generator/generated-maps/3_without_bridges/scenario5-3.json, passhum 0, slowness no, forcing stops 50%",35,1,0,0,0,0,0,1,1,0.042,0.055,0.098,0.124,0.098,627,354,506,607,1467,489.000,25,0.455
"map-generator/generated-maps/3_with_bridges/scenario5-3.json, passhum 0, slowness no, forcing stops 50%",35,1,0,0,0,0,0,1,0,0.011,0.661,0.774,0.063,0.530,626,586,700,531,1817,605.667,21,0.143
"map-generator/generated-maps/3_without_bridges/scenario3-7.json, passhum 0, slowness without rerouting, forcing no",24,0,1,0,0,0,0,1,1,0.935,1.340,1.149,1.187,1.226,706,908,1023,612,2543,847.667,12,0.000


# Logistic regression (as a baseline)

In [10]:
def split_df_to_X_y(df):
    columns_input_df = [col for col in df.columns
                        if (col if not isinstance(col, str) else col.split('#')[0]) in COLUMNS_INPUT]
    columns_output_df = list(COLUMNS_OUTPUT)
    assert set(COLUMNS_NONSPLIT) | set(columns_input_df) | set(columns_output_df) == set(df.columns)
    
    X = df[columns_input_df]
    y = df[columns_output_df]    
    return X, y


def run_regression(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    model = sklearn.linear_model.LinearRegression()
    model.fit(X_train, y_train)
    ndarray_predictions = model.predict(X_test)
    df_predictions = pd.DataFrame(ndarray_predictions, columns=y_test.columns)
    return df_predictions


df_predictions_regression = run_regression(df_train, df_test)
df_predictions_regression

Unnamed: 0,Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
0,14.468898,0.865981
1,14.974296,0.300123
2,14.959468,0.334636
3,12.882458,0.477166
4,11.407137,1.156926
...,...,...
348,34.324273,0.118958
349,28.019767,0.463639
350,24.274538,0.289288
351,15.385102,0.496161


In [11]:
def save_and_show(fig, basename):  # to avoid inlining large image data into the notebook file
    filename = f'{DIRECTORY_DATA}/{basename}-{random.random()}.png'
    fig.savefig(filename)
    
    # The `random` is because of https://stackoverflow.com/a/43640705.
    display(HTML(f'<img src="{filename}?{random.random()}" alt="{basename}" title="{datetime.datetime.now()}" />'))
    
    plt.close(fig)
    
    return filename


def evaluate_and_plot_column(df_test, df_predictions, column):
    y_test_column = df_test[column]
    predictions_column = df_predictions[column]
    
    r2 = sklearn.metrics.r2_score(y_test_column, predictions_column)
    name = col2parts(column)[1]
    print(f"{name}:")
    print(f"- R^2 Score: {r2}")

    # Plot results for each output column
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(y_test_column, predictions_column, color='blue', alpha=0.5)
    plt.plot([y_test_column.min(), y_test_column.max()], [y_test_column.min(), y_test_column.max()], 'k--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted Values for {name}')
    plt.grid(True)
    save_and_show(fig, f'Actual_vs_Predicted_Values_{name}')


def evaluate_and_plot_all_columns(df_test, df_predictions):
    for column in COLUMNS_OUTPUT:
        evaluate_and_plot_column(df_test, df_predictions, column)
        
        
evaluate_and_plot_all_columns(df_test, df_predictions_regression)

Total No. of completed missions for AVs (V1-V3):
- R^2 Score: 0.5728686074820595


Collision rate:
- R^2 Score: 0.2437010600260331


# AutoGluon

In [16]:
def run_autogluon(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    # Train AutoGluon models
    predictors = []
    df_predictions = pd.DataFrame()
    for column in COLUMNS_OUTPUT:
        print(f'{column=}:')
        df_train_predictor = pd.concat([X_train, y_train[[column]]], axis=1)
        
        predictor = autogluon.tabular.TabularPredictor(
            label=column, 
            eval_metric='r2', 
            problem_type='regression',
        ).fit(
            df_train_predictor,
            presets='medium',  # medium (~1 min.), good (~15 min.), high (~2 h)
            hyperparameters={
                'GBM': {},       # LightGBM (TODO: something like `GBMLarge`)
                'XGB': {},       # XGBoost
                'RF': {},        # Random Forest
                'XT': {},        # Extra Trees
                # 'CAT': {},      # CatBoost, omitted if slow
                # 'NN': {},       # Neural net, if you want it
                # 'LR': {},       # Linear model
                # 'KNN': {},      # K-Nearest Neighbors
            },
        )
        predictors.append(predictor)
        
        df_predictions[column] = predictor.predict(X_test)

        # Leaderboard - Display a table of different models and their performance
        df_test_predictor = pd.concat([X_test, y_test[[column]]], axis=1)
        leaderboard = predictor.leaderboard(df_test_predictor, silent=True)
        leaderboard = leaderboard.sort_values(by="score_val", ascending=False)
        show(leaderboard, f'Leaderboard for {column}')
        
        # Feature importance on training data
        # show(
        #     predictor.feature_importance(df_train_predictor),
        #     'feature_importance(df_train_predictor)'
        # )
        # 
        # # Feature importance on test data
        # show(
        #     predictor.feature_importance(df_test_predictor),
        #     'feature_importance(df_test_predictor)'
        # )
        # Example: SHAP values for a specific model
        # shap_values = predictor.get_model_shap_values(df_test_predictor, model='LightGBM')
        # show(shap_values, 'shap_values')  # SHAP values for each feature and each prediction
        
    return predictors, df_predictions


predictors, df_predictions_autogluon = run_autogluon(df_train, df_test)
df_predictions_autogluon

No path specified. Models will be saved in: "AutogluonModels/ag-20250224_102048"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #141~20.04.1-Ubuntu SMP Thu Jan 16 18:38:51 UTC 2025
CPU Count:          16
Memory Avail:       2.46 GB / 31.09 GB (7.9%)
Disk Space Avail:   222.58 GB / 693.60 GB (32.1%)
Presets specified: ['medium']
Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250224_102048"
Train Data Rows:    1380
Train Data Columns: 19
Label Column:       Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2519.91 MB
	Train Da

column='Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)':


	0.8309	 = Validation score   (r2)
	1.78s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForest ...
	0.8055	 = Validation score   (r2)
	1.61s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: ExtraTrees ...
	0.8057	 = Validation score   (r2)
	0.87s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: XGBoost ...
	0.8385	 = Validation score   (r2)
	1.19s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'XGBoost': 0.917, 'LightGBM': 0.083}
	0.8385	 = Validation score   (r2)
	0.12s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 6.06s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 28608.1 rows/s (276 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250224_102048")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
3,WeightedEnsemble_L2,0.509815,0.838506,r2,0.037381,0.009648,3.08454,0.003758,0.002002,0.115752,2,True,5
4,XGBoost,0.493668,0.838457,r2,0.023731,0.005043,1.185129,0.023731,0.005043,1.185129,1,True,4
2,LightGBM,0.606271,0.830914,r2,0.009892,0.002603,1.783659,0.009892,0.002603,1.783659,1,True,1
0,ExtraTrees,0.659531,0.805711,r2,0.119452,0.067788,0.86903,0.119452,0.067788,0.86903,1,True,3
1,RandomForest,0.606487,0.805469,r2,0.166607,0.087835,1.605533,0.166607,0.087835,1.605533,1,True,2


No path specified. Models will be saved in: "AutogluonModels/ag-20250224_102055"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #141~20.04.1-Ubuntu SMP Thu Jan 16 18:38:51 UTC 2025
CPU Count:          16
Memory Avail:       2.44 GB / 31.09 GB (7.9%)
Disk Space Avail:   222.54 GB / 693.60 GB (32.1%)
Presets specified: ['medium']
Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250224_102055"
Train Data Rows:    1380
Train Data Columns: 19
Label Column:       Output of simulation (execution): Collision rate
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2499.32 MB
	Train Data (Original)  Memory Usage: 0.19

column='Output of simulation (execution): Collision rate':


	0.1375	 = Validation score   (r2)
	3.16s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForest ...
	-0.0543	 = Validation score   (r2)
	1.39s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: ExtraTrees ...
	-0.0455	 = Validation score   (r2)
	1.29s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: XGBoost ...
	0.169	 = Validation score   (r2)
	1.49s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'XGBoost': 1.0}
	0.169	 = Validation score   (r2)
	0.13s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 7.95s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 35061.6 rows/s (276 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250224_102055")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
3,XGBoost,0.260203,0.168989,r2,0.029401,0.006299,1.493655,0.029401,0.006299,1.493655,1,True,4
4,WeightedEnsemble_L2,0.260203,0.168989,r2,0.032343,0.007872,1.621386,0.002942,0.001573,0.12773,2,True,5
2,LightGBM,0.290151,0.137468,r2,0.009381,0.00134,3.163147,0.009381,0.00134,3.163147,1,True,1
0,ExtraTrees,0.31617,-0.045535,r2,0.139879,0.087547,1.286079,0.139879,0.087547,1.286079,1,True,3
1,RandomForest,0.297557,-0.054266,r2,0.201707,0.087936,1.385014,0.201707,0.087936,1.385014,1,True,2


Unnamed: 0,Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario7-1.json, passhum 0, slowness without rerouting, forcing stops 50%",15.690575,0.644654
"map-generator/generated-maps/3_with_bridges/scenario7-9.json, passhum 0, slowness no, forcing change of priorities",14.889368,0.443364
"map-generator/generated-maps/3_with_bridges/scenario5-6.json, passhum 0, slowness with rerouting, forcing change of priorities",17.791101,0.584382
"map-generator/generated-maps/3_without_bridges/scenario7-1.json, passhum 0, slowness no, forcing change of priorities",15.577618,0.547927
"map-generator/generated-maps/3_without_bridges/scenario5-9.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%",12.507632,1.123047
...,...,...
"map-generator/generated-maps/3_without_bridges/scenario1-2.json, passhum 0, slowness no, forcing change of priorities",41.034592,0.208075
"map-generator/generated-maps/3_without_bridges/scenario5-3.json, passhum 0, slowness no, forcing stops 50%",29.879717,0.304194
"map-generator/generated-maps/3_with_bridges/scenario5-3.json, passhum 0, slowness no, forcing stops 50%",20.489149,0.385849
"map-generator/generated-maps/3_without_bridges/scenario3-7.json, passhum 0, slowness without rerouting, forcing no",16.140797,0.069769


## evaluate_and_plot_all_columns

In [13]:
evaluate_and_plot_all_columns(df_test, df_predictions_autogluon)

Total No. of completed missions for AVs (V1-V3):
- R^2 Score: 0.509814977645874


Collision rate:
- R^2 Score: 0.2602025140794544


## explain_predictions

In [14]:
def explain_predictions(predictors):
    # X_test, y_test = split_df_to_X_y(df_test)
    
    for column, predictor in zip(COLUMNS_OUTPUT, predictors):
        # df_test_predictor = pd.concat([X_test, y_test[[column]]], axis=1)
        if column not in (
            'Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)',
            'Output of simulation (execution): Collision rate',
        ):
            continue
        
        for model in 'LightGBM', 'XGBoost':
            tree_model = predictor._trainer.load_model(model)
            if model == 'LightGBM':
                tree_importance = tree_model.model.feature_importance(importance_type='gain')
            elif model == 'XGBoost':
                tree_importance = tree_model.model.feature_importances_
            else:
                raise ValueError(model)
            # show(tree_importance, column)
                
            
            feature_names = predictor.feature_metadata.get_features()
            df = pd.DataFrame.from_dict(
                {name: {'importance': value} for name, value in zip(feature_names, tree_importance)},
                orient='index'
            )
            # print(df.index)
            # Group indexes by their base name before `#`
            df['group'] = df.index.str.extract(r'^(.+?)(?:#\d+)?$', expand=False)
            df = df.groupby('group')['importance'].sum().to_frame()
            
            df.sort_values(by='importance', ascending=False, inplace=True)
            show(df, f'{model}: {column}')
        
        
explain_predictions(predictors)

Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Output of simulation (planning): Total Mission length for AVs (V1-V3),306969.69001
Output of simulation (planning): Mission length for AV (V1),163450.296439
Output of simulation (planning): Mission length for MV (V0),66354.399702
Output of simulation (planning): POD score for AV (V3),19657.928445
Coordination strategy: Stops,18080.628994
Output of simulation (planning): POD score for AV (V1),17665.392795
Violation type: Priority violation and Speed violation,13867.033366
Output of simulation (planning): Mission length for AV (V3),13103.372194
Output of simulation (planning): Mission length for AV (V2),9191.64781
Output of simulation (planning): POD score for MV (V0),8608.064239


Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Output of simulation (planning): Total Mission length for AVs (V1-V3),0.490346
Output of simulation (planning): Mission length for AV (V1),0.244702
Output of simulation (planning): Mission length for MV (V0),0.130978
Output of simulation (planning): POD score for AV (V3),0.015557
Static map features: No. of OPs,0.015465
Output of simulation (planning): Mean POD score for AVs (V1-V3),0.015386
Output of simulation (planning): Mission length for AV (V3),0.014884
Output of simulation (planning): POD score for AV (V1),0.013385
Violation type: Priority violation and Speed violation,0.010381
Coordination strategy: Stops,0.008849


Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Output of simulation (planning): POD score for MV (V0),351.272452
Violation type: Speed violation,289.704714
Violation type: Priority violation,187.246248
Coordination strategy: Rerouting,145.563947
Violation type: Priority violation and Speed violation,139.128687
Output of simulation (planning): Mission length for AV (V2),108.751707
Coordination strategy: Change of priorities,105.610147
Coordination strategy: Stops,104.224404
Output of simulation (planning): POD score for AV (V2),86.690973
Output of simulation (planning): Total Mission length for AVs (V1-V3),45.662314


Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Violation type: Speed violation,0.278002
Violation type: Priority violation and Speed violation,0.087282
Output of simulation (planning): POD score for MV (V0),0.086112
Output of simulation (planning): Mission length for AV (V2),0.082604
Violation type: Priority violation,0.066791
Coordination strategy: Rerouting,0.055312
Coordination strategy: Change of priorities,0.042674
Output of simulation (planning): POD score for AV (V2),0.042038
Output of simulation (planning): Mission length for MV (V0),0.03547
Coordination strategy: Stops,0.034041
