In [1]:
import ast
import datetime
import os
import random
import warnings

import autogluon.tabular
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

import dynmodel

pd.options.display.max_columns = 100
pd.options.display.max_colwidth = 200

warnings.filterwarnings("ignore", message="Can't initialize NVML")

In [2]:
DIRECTORY_DATA = 'data/model-autogluon'
os.makedirs(DIRECTORY_DATA, exist_ok=True)

# Preparing data

In [3]:
# FILENAME_DF_ALL_CLEAN = None
#FILENAME_DF_ALL_CLEAN = 'data/20241230_173555/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250128_094430/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250220_094622_halfway/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250220_094622/df_all_clean.csv'
# FILENAME_DF_ALL_CLEAN = 'data/20250223_150717_halfway/df_all_clean.csv'
FILENAME_DF_ALL_CLEAN = 'data/20250223_150717/df_all_clean.csv'

In [4]:
SEPARATOR_COL = ': '


def col2parts(col: str) -> tuple[str, str]:
    parts = col.split(SEPARATOR_COL)
    if len(parts) == 1:
        return '', parts[0]
    assert len(parts) == 2
    return parts[0], parts[1]


if FILENAME_DF_ALL_CLEAN is not None:
    df_all = pd.read_csv(FILENAME_DF_ALL_CLEAN, header=list(range(2)), index_col=0)
    df_all.columns = [SEPARATOR_COL.join(col) for col in df_all.columns]
else:
    runnames = [
        '20241203_170129_all600',
        '20241213_104400_racing',
        '20241214_122216_racing_passhum',
    ]
    df_all = pd.concat(
        [pd.read_csv(f'data/{runname}/df_all.csv') 
         for runname in runnames],
        keys=runnames,
        names=['runname', 'row']
    )
    assert all(SEPARATOR_COL not in col for col in df_all.columns), df_all.columns

print(FILENAME_DF_ALL_CLEAN)
df_all.info()

data/20250223_150717/df_all_clean.csv
<class 'pandas.core.frame.DataFrame'>
Index: 2886 entries, map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing change of priorities to map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%
Columns: 438 entries, Violation type: Priority violation to Output of simulation (execution): Near-miss rate
dtypes: bool(6), float64(408), int64(22), object(2)
memory usage: 9.6+ MB


In [5]:
# df_all = df_all[~df_all.index.str.contains('50% stops')]
# df_all.info()

In [6]:
def series2values(series):
    dtype = series.dtype
    if dtype == 'bool':
        return series.astype('int').values
    if dtype in ('int64', 'float64'):
        return series.values
    if dtype == 'object':
        return series.astype('category').cat.codes
    raise TypeError(f'{dtype} is not supported')
    

if FILENAME_DF_ALL_CLEAN is not None:
    COLUMNS_NONSPLIT = {
        col: series2values
        for col in [
            'Static map features: ~ Position',  # includes i_map, i_position
        ]
    }
    COLUMNS_INPUT = {col: series2values 
                     # for col in (
                     #    'Static map features: Connectivity',
                     # )}
                     for col in df_all.columns 
                     if col2parts(col)[0] != 'Output of simulation (execution)'
                     and not col2parts(col)[1].startswith('~ ')}
    # COLUMNS_INPUT.pop('Static map features: No. of OPs')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for MV (V0)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V1)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V2)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Mission length for AV (V3)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Total Mission length for AVs (V1-V3)')  # TODO
    # COLUMNS_INPUT.pop('Output of simulation (planning): Total Mission length for AVs (V1-V3)')  # TODO
    # COLUMNS_INPUT = {key: value for key, value in COLUMNS_INPUT.items()
    #                  if 'No. of OPs' not in key and 'POD' not in key}   
    COLUMNS_INPUT = {key: value for key, value in COLUMNS_INPUT.items()
                     if 'POD C' not in key}   
    COLUMNS_OUTPUT = {col: series2values
                      for col in (
                        'Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)',
                        'Output of simulation (execution): Collision rate'
                      )}
                      # for col in df_all.columns 
                      # if col2parts(col)[0] == 'Output of simulation (execution)'
                      # and not col2parts(col)[1].startswith('~ ')}
else:
    COLUMNS_NONSPLIT = {
        'Scenario ID': lambda series: series.values,
    }
    COLUMNS_INPUT = {
        'Coordination strategy': lambda series: series.astype('category').cat.codes.values,
        'isCanPassFirstHum': lambda series: series.astype('int').values,
        'isRacingThroughCrossroadAllowed': lambda series: series.astype('int').values,
        
        'Vehicle ID': lambda series: series.values,
        'Linearization C': lambda series: [
            series.apply(lambda x: -1 if x is None else x[i])
            for i in range(len(series.dropna().iloc[0]))
        ],   
    }
    COLUMNS_OUTPUT = {
        'traveled total, m': lambda series: series.values,
        'No. of completed missions': lambda series: series.values,
        'No. of collisions': lambda series: series.values,
        'No. of near-misses': lambda series: series.values,
    }

COLUMNS_ALL = {**COLUMNS_NONSPLIT, **COLUMNS_INPUT, **COLUMNS_OUTPUT}
COLUMNS_ALL

{'Static map features: ~ Position': <function __main__.series2values(series)>,
 'Violation type: Priority violation': <function __main__.series2values(series)>,
 'Violation type: Speed violation': <function __main__.series2values(series)>,
 'Violation type: Priority violation and Speed violation': <function __main__.series2values(series)>,
 'Coordination strategy: Change of priorities': <function __main__.series2values(series)>,
 'Coordination strategy: Stops': <function __main__.series2values(series)>,
 'Coordination strategy: Rerouting': <function __main__.series2values(series)>,
 'Static map features: No. of OPs': <function __main__.series2values(series)>,
 'Static map features: Connectivity': <function __main__.series2values(series)>,
 'Output of simulation (planning): POD score for MV (V0)': <function __main__.series2values(series)>,
 'Output of simulation (planning): POD score for AV (V1)': <function __main__.series2values(series)>,
 'Output of simulation (planning): POD score fo

In [7]:
df_inout = df_all[list(COLUMNS_ALL)]
df_inout

Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing change of priorities",1-1,True,False,False,True,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,32,0.333
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing ignoring human",1-1,True,False,False,False,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,36,0.318
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",1-1,False,False,False,False,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,35,0.000
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops",1-1,True,False,False,False,True,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.312
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops 50%",1-1,True,False,False,False,False,False,2,high,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing ignoring human",9-9,False,False,True,False,False,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,21,2.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing no",9-9,False,True,False,False,False,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,17,0.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops",9-9,False,False,True,False,True,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,11,0.500
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops 50%",9-9,False,False,True,False,False,False,1,low,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,12,0.667


In [8]:
def parse_tuple_string(tuple_string):
    if pd.isna(tuple_string):
        return None
    return ast.literal_eval(tuple_string)


def preprocess_inout(df_inout):
    df_inout = df_inout.copy()
    
    linearization_columns = [col for col in df_inout.columns if isinstance(col, str) and col.startswith('Linearization')]
    for col in linearization_columns:
        df_inout[col] = df_inout[col].apply(parse_tuple_string)
        
    dict_preprocessed = {}
    for col, series2data in COLUMNS_ALL.items():
        data = series2data(df_inout[col])
        if not isinstance(data, list):
            dict_preprocessed[col] = data
        else:
            for i, series in enumerate(data):
                dict_preprocessed[f'{col}#{i}'] = series
    
    return pd.DataFrame(dict_preprocessed)
    

df_preprocessed = preprocess_inout(df_inout)
df_preprocessed

Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing change of priorities",0,1,0,0,1,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,32,0.333
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing ignoring human",0,1,0,0,0,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,36,0.318
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing no",0,0,0,0,0,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,35,0.000
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops",0,1,0,0,0,1,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.312
"map-generator/generated-maps/3_with_bridges/scenario1-1.json, passhum 0, slowness no, forcing stops 50%",0,1,0,0,0,0,0,2,0,0.195,0.104,0.058,0.149,0.115,721,227,547,1017,1791,597.000,31,0.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing ignoring human",99,0,0,1,0,0,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,21,2.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing no",99,0,1,0,0,0,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,17,0.000
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops",99,0,0,1,0,1,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,11,0.500
"map-generator/generated-maps/3_without_bridges/scenario9-9.json, passhum 0, slowness without rerouting, forcing stops 50%",99,0,0,1,0,0,0,1,1,0.670,0.652,0.348,0.033,0.406,744,1057,808,572,2437,812.333,12,0.667


In [9]:
def show(obj, title=None):
    if title is not None:
        display(HTML(f"<h3>{title}</h3>"))
    display(obj)


def shuffle_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    # Shuffle by Scenario ID
    unique_scenarios = df['Scenario ID'].unique()  # Get unique Scenario IDs
    shuffled_scenarios = pd.Series(unique_scenarios).sample(frac=1, random_state=1).tolist()  # Shuffle Scenario IDs
    
    # Reorder the dataframe by the shuffled Scenario IDs
    df_shuffled = pd.concat([df[df['Scenario ID'] == scenario] for scenario in shuffled_scenarios])
    
    # Reset index (optional)
    df_shuffled = df_shuffled.reset_index(drop=True)
    
    return df_shuffled
    """
    return df.sample(frac=1, random_state=1)


def split_train_test(df):
    test_size = 0.2
    
    if not COLUMNS_NONSPLIT:
        return sklearn.model_selection.train_test_split(df, test_size=test_size, random_state=1)
    
    gss = sklearn.model_selection.GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=1)
    column, = list(COLUMNS_NONSPLIT)
    groups = df[column]
    
    # Split the data
    for train_idx, test_idx in gss.split(df, groups=groups):
        return shuffle_df(df.iloc[train_idx]), shuffle_df(df.iloc[test_idx])


df_train, df_test = split_train_test(df_preprocessed)
show(df_train, 'df_train')
show(df_test, 'df_test')
if 'Static map features: ~ Position' in df_train.columns:
    assert not set(df_train['Static map features: ~ Position']) & set(df_test['Static map features: ~ Position']) 

Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_without_bridges/scenario10-4.json, passhum 0, slowness no, forcing no",14,0,0,0,0,0,0,2,1,0.294,0.066,0.103,0.283,0.194,714,402,607,1180,2189,729.667,26,0.000
"map-generator/generated-maps/3_without_bridges/scenario2-6.json, passhum 0, slowness without rerouting, forcing change of priorities",26,0,0,1,1,0,0,1,1,0.618,0.102,0.411,0.469,0.381,328,452,985,1086,2523,841.000,15,0.167
"map-generator/generated-maps/3_with_bridges/scenario4-9.json, passhum 0, slowness no, forcing stops 50% with forcing 50%",49,1,0,0,0,0,0,1,0,0.332,0.193,0.035,0.253,0.187,560,711,513,1108,2332,777.333,7,0.000
"map-generator/generated-maps/3_with_bridges/scenario1-6.json, passhum 0, slowness with rerouting, forcing stops 50%",6,0,0,1,0,0,1,2,0,0.032,0.020,0.037,0.022,0.026,659,522,523,585,1630,543.333,22,0.000
"map-generator/generated-maps/3_with_bridges/scenario7-10.json, passhum 0, slowness without rerouting, forcing stops",71,0,0,1,0,1,0,1,0,0.361,0.819,0.854,0.508,0.689,455,791,808,1302,2901,967.000,3,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_with_bridges/scenario6-8.json, passhum 0, slowness without rerouting, forcing stops",68,0,0,1,0,1,0,2,0,0.254,0.132,0.172,0.010,0.117,698,450,877,517,1844,614.667,20,0.167
"map-generator/generated-maps/3_with_bridges/scenario6-4.json, passhum 0, slowness with rerouting, forcing stops 50% with forcing 50%",64,0,0,1,0,0,1,2,0,0.181,0.028,0.178,0.010,0.099,565,328,877,517,1722,574.000,25,0.000
"map-generator/generated-maps/3_with_bridges/scenario7-7.json, passhum 0, slowness without rerouting, forcing stops",77,0,0,1,0,1,0,1,0,0.529,0.072,0.408,0.649,0.455,455,537,1000,1302,2839,946.333,11,0.333
"map-generator/generated-maps/3_with_bridges/scenario10-4.json, passhum 0, slowness without rerouting, forcing stops",14,0,0,1,0,1,0,2,0,0.137,0.023,0.025,0.149,0.093,709,321,506,1018,1845,615.000,23,0.400


Unnamed: 0,Static map features: ~ Position,Violation type: Priority violation,Violation type: Speed violation,Violation type: Priority violation and Speed violation,Coordination strategy: Change of priorities,Coordination strategy: Stops,Coordination strategy: Rerouting,Static map features: No. of OPs,Static map features: Connectivity,Output of simulation (planning): POD score for MV (V0),Output of simulation (planning): POD score for AV (V1),Output of simulation (planning): POD score for AV (V2),Output of simulation (planning): POD score for AV (V3),Output of simulation (planning): Mean POD score for AVs (V1-V3),Output of simulation (planning): Mission length for MV (V0),Output of simulation (planning): Mission length for AV (V1),Output of simulation (planning): Mission length for AV (V2),Output of simulation (planning): Mission length for AV (V3),Output of simulation (planning): Total Mission length for AVs (V1-V3),Output of simulation (planning): Mean Mission length for AVs (V1-V3),Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario8-10.json, passhum 0, slowness with rerouting, forcing stops",81,0,0,1,0,1,1,1,0,0.589,0.405,0.066,0.195,0.233,812,711,487,1063,2261,753.667,13,0.750
"map-generator/generated-maps/3_without_bridges/scenario8-4.json, passhum 0, slowness without rerouting, forcing ignoring human",84,0,0,1,0,0,0,1,1,0.411,0.253,0.056,0.250,0.213,756,846,485,1184,2515,838.333,21,1.500
"map-generator/generated-maps/3_with_bridges/scenario8-10.json, passhum 0, slowness without rerouting, forcing stops",81,0,0,1,0,1,0,1,0,0.589,0.405,0.066,0.195,0.233,812,711,487,1063,2261,753.667,13,0.750
"map-generator/generated-maps/3_without_bridges/scenario3-10.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%",31,0,0,1,0,0,0,1,1,0.610,0.485,0.051,0.237,0.285,706,908,550,1085,2543,847.667,15,0.333
"map-generator/generated-maps/3_without_bridges/scenario3-9.json, passhum 0, slowness without rerouting, forcing ignoring human",39,0,0,1,0,0,0,1,1,0.330,0.086,0.038,0.225,0.145,417,552,512,1085,2149,716.333,23,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"map-generator/generated-maps/3_with_bridges/scenario4-4.json, passhum 0, slowness without rerouting, forcing change of priorities",44,0,0,1,1,0,0,1,0,0.296,0.188,0.065,0.216,0.170,560,711,561,996,2268,756.000,15,0.500
"map-generator/generated-maps/3_with_bridges/scenario5-10.json, passhum 0, slowness with rerouting, forcing stops 50% with forcing 50%",51,0,0,1,0,0,1,1,0,0.852,0.073,0.382,0.963,0.613,626,531,955,1453,2939,979.667,5,0.000
"map-generator/generated-maps/3_with_bridges/scenario3-3.json, passhum 0, slowness no, forcing stops 50% with forcing 50%",33,1,0,0,0,0,0,1,0,0.368,0.505,0.506,0.333,0.458,570,955,486,553,1994,664.667,20,1.000
"map-generator/generated-maps/3_with_bridges/scenario7-8.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%",78,0,0,1,0,0,0,1,0,0.796,0.917,0.923,0.875,0.901,455,871,900,1302,3073,1024.333,5,0.600


# Logistic regression (as a baseline)

In [10]:
def split_df_to_X_y(df):
    columns_input_df = [col for col in df.columns
                        if (col if not isinstance(col, str) else col.split('#')[0]) in COLUMNS_INPUT]
    columns_output_df = list(COLUMNS_OUTPUT)
    assert set(COLUMNS_NONSPLIT) | set(columns_input_df) | set(columns_output_df) == set(df.columns)
    
    X = df[columns_input_df]
    y = df[columns_output_df]    
    return X, y


def run_regression(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    model = sklearn.linear_model.LinearRegression()
    model.fit(X_train, y_train)
    ndarray_predictions = model.predict(X_test)
    df_predictions = pd.DataFrame(ndarray_predictions, columns=y_test.columns)
    return df_predictions


df_predictions_regression = run_regression(df_train, df_test)
df_predictions_regression

Unnamed: 0,Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
0,15.370481,0.371699
1,16.688643,0.795117
2,16.120481,0.641230
3,14.662047,0.804095
4,17.108100,0.706827
...,...,...
572,17.831023,0.314386
573,13.417226,0.708664
574,19.291232,0.566192
575,7.286849,0.921654


In [11]:
def save_and_show(fig, basename):  # to avoid inlining large image data into the notebook file
    filename = f'{DIRECTORY_DATA}/{basename}-{random.random()}.png'
    fig.savefig(filename)
    
    # The `random` is because of https://stackoverflow.com/a/43640705.
    display(HTML(f'<img src="{filename}?{random.random()}" alt="{basename}" title="{datetime.datetime.now()}" />'))
    
    plt.close(fig)
    
    return filename


def evaluate_and_plot_column(df_test, df_predictions, column, *, is_plot=True):
    y_test_column = df_test[column]
    predictions_column = df_predictions[column]
    
    r2 = sklearn.metrics.r2_score(y_test_column, predictions_column)
    name = col2parts(column)[1]
    print(f"{name}:")
    print(f"- R^2 Score: {r2}")
    
    if not is_plot:
        return

    # Plot results for each output column
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(y_test_column, predictions_column, color='blue', alpha=0.5)
    plt.plot([y_test_column.min(), y_test_column.max()], [y_test_column.min(), y_test_column.max()], 'k--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted Values for {name}')
    plt.grid(True)
    save_and_show(fig, f'Actual_vs_Predicted_Values_{name}')


def evaluate_and_plot_all_columns(df_test, df_predictions):
    for column in COLUMNS_OUTPUT:
        evaluate_and_plot_column(df_test, df_predictions, column, is_plot=False)
        
        
evaluate_and_plot_all_columns(df_test, df_predictions_regression)

Total No. of completed missions for AVs (V1-V3):
- R^2 Score: 0.5769145011354597
Collision rate:
- R^2 Score: 0.23176187829461603


# AutoGluon

In [12]:
def run_autogluon(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    # Train AutoGluon models
    predictors = []
    df_predictions = pd.DataFrame()
    for column in COLUMNS_OUTPUT:
        print(f'{column=}:')
        df_train_predictor = pd.concat([X_train, y_train[[column]]], axis=1)
        preset = 'good'  # medium (~1 min.), good (~15 min.), high (~2 h)
        
        predictor = autogluon.tabular.TabularPredictor(
            label=column, 
            eval_metric='r2', 
            problem_type='regression',
        ).fit(
            df_train_predictor,
            presets=preset,
            hyperparameters={
                'GBM': {},       # LightGBM (TODO: something like `GBMLarge`)
                'XGB': {},       # XGBoost
                'RF': {},        # Random Forest
                'XT': {},        # Extra Trees
                # 'CAT': {},      # CatBoost, omitted if slow
                # 'NN': {},       # Neural net, if you want it
                # 'LR': {},       # Linear model
                # 'KNN': {},      # K-Nearest Neighbors
            },
        )
        predictors.append(predictor)
        
        df_predictions[column] = predictor.predict(X_test)

        # Leaderboard - Display a table of different models and their performance
        df_test_predictor = pd.concat([X_test, y_test[[column]]], axis=1)
        leaderboard = predictor.leaderboard(df_test_predictor, silent=True)
        dynmodel.process_leaderboard(leaderboard, os.path.dirname(FILENAME_DF_ALL_CLEAN), column, preset)
        
        # Feature importance on training data
        # show(
        #     predictor.feature_importance(df_train_predictor),
        #     'feature_importance(df_train_predictor)'
        # )
        # 
        # # Feature importance on test data
        # show(
        #     predictor.feature_importance(df_test_predictor),
        #     'feature_importance(df_test_predictor)'
        # )
        # Example: SHAP values for a specific model
        # shap_values = predictor.get_model_shap_values(df_test_predictor, model='LightGBM')
        # show(shap_values, 'shap_values')  # SHAP values for each feature and each prediction
        
    return predictors, df_predictions


predictors, df_predictions_autogluon = run_autogluon(df_train, df_test)
df_predictions_autogluon

No path specified. Models will be saved in: "AutogluonModels/ag-20250225_141500"
Preset alias specified: 'good' maps to 'good_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #141~20.04.1-Ubuntu SMP Thu Jan 16 18:38:51 UTC 2025
CPU Count:          16
Memory Avail:       16.71 GB / 31.09 GB (53.7%)
Disk Space Avail:   214.51 GB / 693.60 GB (30.9%)
Presets specified: ['good']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyStack is enabled 

column='Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)':


	Running DyStack sub-fit in a ray process to avoid memory leakage. Enabling ray logging (enable_ray_logging=True). Specify `ds_args={'enable_ray_logging': False}` if you experience logging issues.
2025-02-25 15:15:04,730	INFO worker.py:1810 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m
		Context path: "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250225_141500/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=11453)[0m Running DyStack sub-fit ...
[36m(_dystack pid=11453)[0m Beginning AutoGluon training ... Time limit = 894s
[36m(_dystack pid=11453)[0m AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250225_141500/ds_sub_fit/sub_fit_ho"
[36m(_dystack pid=11453)[0m Train Data Rows:    2052
[36m(_dystack pid=11453)[0m Train Data Columns: 19
[36m(_dystack pid=11453)[0m Label Column:       Output of simulation (execution): Total No. of completed missions for AVs (V1-V

[36m(_ray_fit pid=13372)[0m [1000]	valid_set's l2: 9.81572	valid_set's r2: 0.824617
[36m(_ray_fit pid=13375)[0m [5000]	valid_set's l2: 8.24334	valid_set's r2: 0.884871[32m [repeated 11x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m
[36m(_ray_fit pid=13375)[0m [10000]	valid_set's l2: 8.2296	valid_set's r2: 0.885062[32m [repeated 5x across cluster][0m


[36m(_dystack pid=11453)[0m 	0.8419	 = Validation score   (r2)
[36m(_dystack pid=11453)[0m 	13.91s	 = Training   runtime
[36m(_dystack pid=11453)[0m 	0.4s	 = Validation runtime
[36m(_dystack pid=11453)[0m Fitting model: RandomForest_BAG_L2 ... Training model for up to 868.11s of the 868.11s of remaining time.
[36m(_dystack pid=11453)[0m 	0.8396	 = Validation score   (r2)
[36m(_dystack pid=11453)[0m 	0.78s	 = Training   runtime
[36m(_dystack pid=11453)[0m 	0.13s	 = Validation runtime
[36m(_dystack pid=11453)[0m Fitting model: ExtraTrees_BAG_L2 ... Training model for up to 865.48s of the 865.48s of remaining time.
[36m(_dystack pid=11453)[0m 	0.8602	 = Validation score   (r2)
[36m(_dystack pid=11453)[0m 	0.49s	 = Training   runtime
[36m(_dystack pid=11453)[0m 	0.14s	 = Validation runtime
[36m(_dystack pid=11453)[0m Fitting model: XGBoost_BAG_L2 ... Training model for up to 864.77s of the 864.77s of remaining time.
[36m(_dystack pid=11453)[0m 	Fitting 8 child mod

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
7,WeightedEnsemble_L2,,0.842,r2,,0.072,4.378,,0.002,0.086,2,False,5
8,LightGBM_BAG_L1,,0.84,r2,,0.026,1.566,,0.026,1.566,1,False,1
9,XGBoost_BAG_L1,,0.839,r2,,0.044,2.726,,0.044,2.726,1,False,4
5,RandomForest_BAG_L1,0.607,0.801,r2,0.078,0.137,0.594,0.078,0.137,0.594,1,True,2
0,ExtraTrees_BAG_L1,0.624,0.801,r2,0.067,0.142,0.509,0.067,0.142,0.509,1,True,3
1,ExtraTrees_BAG_L1_FULL,0.624,,r2,0.068,0.142,0.509,0.068,0.142,0.509,1,True,8
2,LightGBM_BAG_L1_FULL,0.618,,r2,0.004,,0.3,0.004,,0.3,1,True,6
3,WeightedEnsemble_L2_FULL,0.609,,r2,0.012,,0.503,0.002,,0.086,2,True,10
4,RandomForest_BAG_L1_FULL,0.607,,r2,0.071,0.137,0.594,0.071,0.137,0.594,1,True,7
6,XGBoost_BAG_L1_FULL,0.59,,r2,0.006,,0.117,0.006,,0.117,1,True,9


No path specified. Models will be saved in: "AutogluonModels/ag-20250225_141602"
Preset alias specified: 'good' maps to 'good_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #141~20.04.1-Ubuntu SMP Thu Jan 16 18:38:51 UTC 2025
CPU Count:          16
Memory Avail:       15.97 GB / 31.09 GB (51.4%)
Disk Space Avail:   214.36 GB / 693.60 GB (30.9%)
Presets specified: ['good']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Note: `save_bag_folds=False`! This will greatly reduce peak disk usage during fit (by ~8x), but runs the risk of an out-of-memory error during model refit if memory is small relative to the data size.
	You can avoid this risk by setting `save_bag_folds=True`.
DyStack is enabled 

column='Output of simulation (execution): Collision rate':


Leaderboard on holdout data (DyStack):
                      model  score_holdout  score_val eval_metric  pred_time_test  pred_time_val  fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0       XGBoost_BAG_L1_FULL       0.375371   0.275698          r2        0.005755            NaN  0.105888                 0.005755                     NaN           0.105888            1       True          4
1  WeightedEnsemble_L2_FULL       0.375371   0.275698          r2        0.007282            NaN  0.160722                 0.001527                     NaN           0.054834            2       True          5
2    ExtraTrees_BAG_L2_FULL       0.361733   0.660048          r2        0.203051            NaN  2.116416                 0.058431                0.135679           0.482371            2       True          8
3  WeightedEnsemble_L3_FULL       0.330680   0.681881          r2        0.240569            NaN  3.441085               

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
7,XGBoost_BAG_L1,,0.289,r2,,0.027,1.798,,0.027,1.798,1,False,4
8,WeightedEnsemble_L2,,0.289,r2,,0.028,1.893,,0.002,0.095,2,False,5
9,LightGBM_BAG_L1,,0.233,r2,,0.009,1.306,,0.009,1.306,1,False,1
1,ExtraTrees_BAG_L1,0.308,-0.099,r2,0.075,0.133,0.452,0.075,0.133,0.452,1,True,3
3,RandomForest_BAG_L1,0.299,-0.099,r2,0.081,0.153,0.652,0.081,0.153,0.652,1,True,2
0,ExtraTrees_BAG_L1_FULL,0.308,,r2,0.065,0.133,0.452,0.065,0.133,0.452,1,True,8
2,RandomForest_BAG_L1_FULL,0.299,,r2,0.068,0.153,0.652,0.068,0.153,0.652,1,True,7
4,LightGBM_BAG_L1_FULL,0.279,,r2,0.002,,0.168,0.002,,0.168,1,True,6
5,XGBoost_BAG_L1_FULL,0.279,,r2,0.006,,0.025,0.006,,0.025,1,True,9
6,WeightedEnsemble_L2_FULL,0.279,,r2,0.007,,0.12,0.001,,0.095,2,True,10


Unnamed: 0,Output of simulation (execution): Total No. of completed missions for AVs (V1-V3),Output of simulation (execution): Collision rate
"map-generator/generated-maps/3_with_bridges/scenario8-10.json, passhum 0, slowness with rerouting, forcing stops",11.267862,0.359292
"map-generator/generated-maps/3_without_bridges/scenario8-4.json, passhum 0, slowness without rerouting, forcing ignoring human",15.974314,0.659959
"map-generator/generated-maps/3_with_bridges/scenario8-10.json, passhum 0, slowness without rerouting, forcing stops",11.399417,0.503254
"map-generator/generated-maps/3_without_bridges/scenario3-10.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%",15.524302,0.863647
"map-generator/generated-maps/3_without_bridges/scenario3-9.json, passhum 0, slowness without rerouting, forcing ignoring human",18.494358,0.617190
...,...,...
"map-generator/generated-maps/3_with_bridges/scenario4-4.json, passhum 0, slowness without rerouting, forcing change of priorities",15.344516,0.235205
"map-generator/generated-maps/3_with_bridges/scenario5-10.json, passhum 0, slowness with rerouting, forcing stops 50% with forcing 50%",11.155468,0.453900
"map-generator/generated-maps/3_with_bridges/scenario3-3.json, passhum 0, slowness no, forcing stops 50% with forcing 50%",20.375473,0.465794
"map-generator/generated-maps/3_with_bridges/scenario7-8.json, passhum 0, slowness without rerouting, forcing stops 50% with forcing 50%",8.675309,1.284885


## evaluate_and_plot_all_columns

In [13]:
evaluate_and_plot_all_columns(df_test, df_predictions_autogluon)

Total No. of completed missions for AVs (V1-V3):
- R^2 Score: 0.6092866063117981
Collision rate:
- R^2 Score: 0.27895974745846974


## explain_predictions

In [14]:
def explain_predictions(predictors):
    # X_test, y_test = split_df_to_X_y(df_test)
    
    for column, predictor in zip(COLUMNS_OUTPUT, predictors):
        # df_test_predictor = pd.concat([X_test, y_test[[column]]], axis=1)
        if column not in (
            'Output of simulation (execution): Total No. of completed missions for AVs (V1-V3)',
            'Output of simulation (execution): Collision rate',
        ):
            continue
        
        for model in 'LightGBM', 'XGBoost':
            tree_model = predictor._trainer.load_model(model)
            if model == 'LightGBM':
                tree_importance = tree_model.model.feature_importance(importance_type='gain')
            elif model == 'XGBoost':
                tree_importance = tree_model.model.feature_importances_
            else:
                raise ValueError(model)
            # show(tree_importance, column)
                
            
            feature_names = predictor.feature_metadata.get_features()
            df = pd.DataFrame.from_dict(
                {name: {'importance': value} for name, value in zip(feature_names, tree_importance)},
                orient='index'
            )
            # print(df.index)
            # Group indexes by their base name before `#`
            df['group'] = df.index.str.extract(r'^(.+?)(?:#\d+)?$', expand=False)
            df = df.groupby('group')['importance'].sum().to_frame()
            
            df.sort_values(by='importance', ascending=False, inplace=True)
            show(df, f'{model}: {column}')
        
        
explain_predictions(predictors)

ValueError: Model does not exist: (model=LightGBM)