In [1]:
import ast
import os
import random
import warnings

import autogluon.tabular
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.linear_model
import sklearn.metrics
import sklearn.model_selection

# Suppress warnings whose message starts with "Can't initialize NVML".
# NOTE(review): presumably emitted by a GPU-aware dependency on machines
# without a usable NVIDIA driver — confirm the source before relying on this.
warnings.filterwarnings("ignore", message="Can't initialize NVML")

In [2]:
# Directory that receives the files written by this notebook
# (the PNG figures saved by `save_and_show`).
DIRECTORY_DATA = 'data/model-autogluon'
os.makedirs(DIRECTORY_DATA, exist_ok=True)  # exist_ok: re-running the cell is harmless

# Preparing data

In [3]:
# Each table maps a raw column name to a converter `Series -> data`.
# `preprocess_inout` calls the converter and stores the result either as one
# column (when it is not a list) or as several columns named '<name>#<i>'
# (when it is a list of Series).

# Columns carried through preprocessing but used neither as model input nor
# as target; 'Scenario ID' is the grouping key for the train/test split.
COLUMNS_NONSPLIT = {
    'Scenario ID': lambda series: series.values,
}
# Model inputs (features).
COLUMNS_INPUT = {
    # Strategy names are replaced by their integer category codes.
    'Coordination strategy': lambda series: series.astype('category').cat.codes.values,
    # Flags are cast to integers.
    'isCanPassFirstHum': lambda series: series.astype('int').values,
    'isRacingThroughCrossroadAllowed': lambda series: series.astype('int').values,
    
    'Vehicle ID': lambda series: series.values,
    # Expects already-parsed sequences (or None for missing rows). Produces one
    # Series per element position, with -1 standing in for missing rows. The
    # number of positions is taken from the first non-null row, so at least one
    # non-null row is required.
    'Linearization C': lambda series: [
        series.apply(lambda x: -1 if x is None else x[i])
        for i in range(len(series.dropna().iloc[0]))
    ],   
}
# Regression targets; one model is trained per entry.
COLUMNS_OUTPUT = {
    'traveled total, m': lambda series: series.values,
    'No. of completed missions': lambda series: series.values,
    'No. of collisions': lambda series: series.values,
    'No. of near-misses': lambda series: series.values,
}

# Insertion order (ID, inputs, outputs) determines the column order of the
# preprocessed frame.
COLUMNS_ALL = {**COLUMNS_NONSPLIT, **COLUMNS_INPUT, **COLUMNS_OUTPUT}

In [4]:
# Names of the run directories under `data/`; each one is read from
# `data/<runname>/df_all.csv`.
runnames = [
    '20241203_170129_all600',
    '20241213_104400_racing',
    '20241214_122216_racing_passhum',
]
# Stack all runs into one frame. `keys`/`names` produce a two-level index
# (runname, row) so every row stays traceable to the run it came from.
df_all = pd.concat(
    [pd.read_csv(f'data/{runname}/df_all.csv') 
     for runname in runnames],
    keys=runnames,
    names=['runname', 'row']
)
df_all.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 7176 entries, ('20241203_170129_all600', 0) to ('20241214_122216_racing_passhum', 2383)
Data columns (total 83 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Unnamed: 0                         7176 non-null   int64  
 1   i_map                              7176 non-null   int64  
 2   are_bridges                        7176 non-null   bool   
 3   Positions variant                  7176 non-null   int64  
 4   configuration                      7176 non-null   object 
 5   Coordination strategy              7176 non-null   object 
 6   probabilityForcingForHuman         7176 non-null   float64
 7   filename_screenshot                7176 non-null   object 
 8   isCanPassFirstHum                  7176 non-null   bool   
 9   isCanPassFirstAut                  7176 non-null   bool   
 10  Date                               7176 non-null   objec

In [5]:
# Keep only the columns that have a converter in COLUMNS_ALL
# (scenario ID, model inputs, model outputs), in that order.
df_inout = df_all[list(COLUMNS_ALL)]
df_inout

Unnamed: 0_level_0,Unnamed: 1_level_0,Scenario ID,Coordination strategy,isCanPassFirstHum,isRacingThroughCrossroadAllowed,Vehicle ID,Linearization C,"traveled total, m",No. of completed missions,No. of collisions,No. of near-misses
runname,row,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
20241203_170129_all600,0,map-generator/generated-maps/2024-11-28_13:17:...,stops,False,False,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9091.8,11,7,0
20241203_170129_all600,1,map-generator/generated-maps/2024-11-28_13:17:...,change of priorities,False,False,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9089.8,11,6,2
20241203_170129_all600,2,map-generator/generated-maps/2024-11-28_13:17:...,baseline,False,False,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4764.1,6,0,0
20241203_170129_all600,3,map-generator/generated-maps/2024-11-28_13:17:...,stops,False,False,1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6388.6,24,5,0
20241203_170129_all600,4,map-generator/generated-maps/2024-11-28_13:17:...,change of priorities,False,False,1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6427.6,25,4,2
...,...,...,...,...,...,...,...,...,...,...,...
20241214_122216_racing_passhum,2379,map-generator/generated-maps/2024-11-28_13:19:...,change of priorities,True,True,2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8491.8,7,2,0
20241214_122216_racing_passhum,2380,map-generator/generated-maps/2024-11-28_13:19:...,stops,True,True,2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7389.1,6,4,0
20241214_122216_racing_passhum,2381,map-generator/generated-maps/2024-11-28_13:19:...,baseline,True,True,3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8878.6,14,0,0
20241214_122216_racing_passhum,2382,map-generator/generated-maps/2024-11-28_13:19:...,change of priorities,True,True,3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8870.0,13,0,1


In [6]:
def parse_tuple_string(tuple_string):
    """Turn the textual form of a literal (e.g. ``"(0.0, 1.5)"``) into a Python object.

    Args:
        tuple_string: A string holding a Python literal, or a missing value.

    Returns:
        ``None`` when `pd.isna` reports the value as missing, otherwise the
        result of `ast.literal_eval` on the string.
    """
    return None if pd.isna(tuple_string) else ast.literal_eval(tuple_string)


def preprocess_inout(df_inout):
    """Build the model-ready frame from the selected raw columns.

    Works on a copy of `df_inout`. Columns whose name starts with
    'Linearization' are first parsed with `parse_tuple_string`; then every
    converter in COLUMNS_ALL is applied to its column. A converter returning a
    list contributes one output column per list item, named '<column>#<i>';
    any other return value becomes a single column under the original name.

    Args:
        df_inout: Frame containing every column named in COLUMNS_ALL.

    Returns:
        A new DataFrame with the converted (and expanded) columns.
    """
    parsed = df_inout.copy()

    # Parse every column whose name starts with 'Linearization'.
    for name in parsed.columns:
        if name.startswith('Linearization'):
            parsed[name] = parsed[name].apply(parse_tuple_string)

    converted_columns = {}
    for name, series2data in COLUMNS_ALL.items():
        converted = series2data(parsed[name])
        if isinstance(converted, list):
            # One output column per element position.
            converted_columns.update(
                {f'{name}#{i}': part for i, part in enumerate(converted)}
            )
        else:
            converted_columns[name] = converted

    return pd.DataFrame(converted_columns)
    

# Apply the per-column converters; 'Linearization C' is expanded into
# 'Linearization C#0', 'Linearization C#1', ...
df_preprocessed = preprocess_inout(df_inout)
df_preprocessed

Unnamed: 0_level_0,Unnamed: 1_level_0,Scenario ID,Coordination strategy,isCanPassFirstHum,isRacingThroughCrossroadAllowed,Vehicle ID,Linearization C#0,Linearization C#1,Linearization C#2,Linearization C#3,Linearization C#4,...,Linearization C#94,Linearization C#95,Linearization C#96,Linearization C#97,Linearization C#98,Linearization C#99,"traveled total, m",No. of completed missions,No. of collisions,No. of near-misses
runname,row,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
20241203_170129_all600,0,map-generator/generated-maps/2024-11-28_13:17:...,2,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9091.8,11,7,0
20241203_170129_all600,1,map-generator/generated-maps/2024-11-28_13:17:...,1,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9089.8,11,6,2
20241203_170129_all600,2,map-generator/generated-maps/2024-11-28_13:17:...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4764.1,6,0,0
20241203_170129_all600,3,map-generator/generated-maps/2024-11-28_13:17:...,2,0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6388.6,24,5,0
20241203_170129_all600,4,map-generator/generated-maps/2024-11-28_13:17:...,1,0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6427.6,25,4,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20241214_122216_racing_passhum,2379,map-generator/generated-maps/2024-11-28_13:19:...,1,1,1,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8491.8,7,2,0
20241214_122216_racing_passhum,2380,map-generator/generated-maps/2024-11-28_13:19:...,2,1,1,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,7389.1,6,4,0
20241214_122216_racing_passhum,2381,map-generator/generated-maps/2024-11-28_13:19:...,0,1,1,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8878.6,14,0,0
20241214_122216_racing_passhum,2382,map-generator/generated-maps/2024-11-28_13:19:...,1,1,1,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8870.0,13,0,1


In [7]:
def show(obj, title=None):
    """Render `obj` with rich display, preceded by an <h3> heading if `title` is given."""
    heading = [] if title is None else [HTML(f"<h3>{title}</h3>")]
    for item in (*heading, obj):
        display(item)


def shuffle_df(df):
    """Reorder `df` so that scenarios appear in a seeded random order.

    Rows sharing a 'Scenario ID' stay contiguous and keep their original
    relative order; only the order of the scenarios is shuffled (fixed seed,
    so the result is reproducible). Rows whose 'Scenario ID' is missing are
    dropped, because they match no scenario.

    Args:
        df: Frame with a 'Scenario ID' column.

    Returns:
        A new frame with a fresh 0..n-1 index. An empty input yields an empty
        frame with the same columns.

    Raises:
        KeyError: If `df` has no 'Scenario ID' column.
    """
    scenario_ids = df['Scenario ID']
    if scenario_ids.empty:
        # Nothing to reorder; concatenating zero pieces would raise.
        return df.reset_index(drop=True)

    # Shuffle the distinct scenario IDs (seeded for reproducibility).
    shuffled_scenarios = (
        pd.Series(scenario_ids.unique()).sample(frac=1, random_state=1).tolist()
    )

    # Row positions of every scenario, collected in a single pass instead of
    # building one full boolean mask per scenario.
    positions_by_scenario = df.groupby('Scenario ID', sort=False).indices
    positions = [
        position
        for scenario in shuffled_scenarios
        for position in positions_by_scenario.get(scenario, ())
    ]

    return df.iloc[positions].reset_index(drop=True)


def split_train_test(df):
    """Split `df` into train and test parts without splitting any scenario.

    All rows sharing a 'Scenario ID' land on the same side. The split is a
    single seeded `GroupShuffleSplit` with `test_size=0.2`; both parts are
    then passed through `shuffle_df`.

    Args:
        df: Frame with a 'Scenario ID' column.

    Returns:
        Tuple `(df_train, df_test)`.
    """
    splitter = sklearn.model_selection.GroupShuffleSplit(
        n_splits=1, test_size=0.2, random_state=1
    )
    # n_splits=1, so the first (and only) split is the one we want.
    train_idx, test_idx = next(splitter.split(df, groups=df['Scenario ID']))
    return shuffle_df(df.iloc[train_idx]), shuffle_df(df.iloc[test_idx])


# Scenario-grouped train/test split of the preprocessed data; display both parts.
df_train, df_test = split_train_test(df_preprocessed)
show(df_train, 'df_train')
show(df_test, 'df_test')

Unnamed: 0,Scenario ID,Coordination strategy,isCanPassFirstHum,isRacingThroughCrossroadAllowed,Vehicle ID,Linearization C#0,Linearization C#1,Linearization C#2,Linearization C#3,Linearization C#4,...,Linearization C#94,Linearization C#95,Linearization C#96,Linearization C#97,Linearization C#98,Linearization C#99,"traveled total, m",No. of completed missions,No. of collisions,No. of near-misses
0,map-generator/generated-maps/2024-11-28_13:17:...,2,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.746372,0.746372,0.746372,0.746372,0.82686,0.82686,75.5,0,0,0
1,map-generator/generated-maps/2024-11-28_13:17:...,2,0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,9096.5,11,0,0
2,map-generator/generated-maps/2024-11-28_13:17:...,2,0,0,2,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,9460.0,6,0,0
3,map-generator/generated-maps/2024-11-28_13:17:...,2,0,0,3,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,80.7,0,0,0
4,map-generator/generated-maps/2024-11-28_13:17:...,2,0,1,0,0.0,0.0,0.0,0.0,0.0,...,0.746372,0.746372,0.746372,0.746372,0.82686,0.82686,75.5,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5731,map-generator/generated-maps/2024-11-28_13:19:...,2,0,1,3,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,7340.8,6,1,0
5732,map-generator/generated-maps/2024-11-28_13:19:...,2,1,1,0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,9002.3,11,7,1
5733,map-generator/generated-maps/2024-11-28_13:19:...,2,1,1,1,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,8637.3,14,0,0
5734,map-generator/generated-maps/2024-11-28_13:19:...,2,1,1,2,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.00000,0.00000,8374.3,8,2,1


Unnamed: 0,Scenario ID,Coordination strategy,isCanPassFirstHum,isRacingThroughCrossroadAllowed,Vehicle ID,Linearization C#0,Linearization C#1,Linearization C#2,Linearization C#3,Linearization C#4,...,Linearization C#94,Linearization C#95,Linearization C#96,Linearization C#97,Linearization C#98,Linearization C#99,"traveled total, m",No. of completed missions,No. of collisions,No. of near-misses
0,map-generator/generated-maps/2024-11-28_13:19:...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3910.4,4,0,0
1,map-generator/generated-maps/2024-11-28_13:19:...,0,0,0,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,8252.6,7,0,0
2,map-generator/generated-maps/2024-11-28_13:19:...,0,0,0,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5514.5,8,0,0
3,map-generator/generated-maps/2024-11-28_13:19:...,0,0,0,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9415.9,5,0,0
4,map-generator/generated-maps/2024-11-28_13:19:...,0,0,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,3910.4,4,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1435,map-generator/generated-maps/2024-11-28_13:19:...,0,0,1,3,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9165.6,6,0,0
1436,map-generator/generated-maps/2024-11-28_13:19:...,0,1,1,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,6178.3,9,1,2
1437,map-generator/generated-maps/2024-11-28_13:19:...,0,1,1,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9177.4,10,0,2
1438,map-generator/generated-maps/2024-11-28_13:19:...,0,1,1,2,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,9144.0,9,0,0


# Linear regression (as a baseline)

In [8]:
def split_df_to_X_y(df):
    """Separate a preprocessed frame into features `X` and targets `y`.

    A column is a feature when the part of its name before the first '#' is a
    key of COLUMNS_INPUT (this covers expanded columns such as
    'Linearization C#3'). Targets are the keys of COLUMNS_OUTPUT. The
    assertion checks that every column of `df` is accounted for as an ID,
    input or output column, and that none of the expected ones is missing.

    Args:
        df: Frame produced by the preprocessing step.

    Returns:
        Tuple `(X, y)` of DataFrames.
    """
    input_columns = [name for name in df.columns if name.split('#')[0] in COLUMNS_INPUT]
    output_columns = [*COLUMNS_OUTPUT]

    accounted_for = {*COLUMNS_NONSPLIT, *input_columns, *output_columns}
    assert accounted_for == set(df.columns)

    return df[input_columns], df[output_columns]


def run_regression(df_train, df_test):
    """Fit a multi-output linear regression baseline and predict on the test set.

    Args:
        df_train: Preprocessed training frame.
        df_test: Preprocessed test frame.

    Returns:
        DataFrame of predictions with one column per output column and the
        same index as the test features, so rows line up with `df_test`
        (consistent with the predictions produced by `run_autogluon`).
    """
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)

    model = sklearn.linear_model.LinearRegression()
    model.fit(X_train, y_train)
    ndarray_predictions = model.predict(X_test)

    # Carry over the test index instead of silently falling back to 0..n-1.
    return pd.DataFrame(
        ndarray_predictions,
        columns=y_test.columns,
        index=X_test.index,
    )


# Baseline predictions for every output column on the held-out test set.
df_predictions_regression = run_regression(df_train, df_test)
df_predictions_regression

Unnamed: 0,"traveled total, m",No. of completed missions,No. of collisions,No. of near-misses
0,8200.618662,9.702666,5.302953,0.689518
1,7122.290812,4.317306,4.415833,0.030768
2,6467.878294,9.055693,-0.847103,-0.699063
3,8015.317654,9.562449,0.025095,-0.385185
4,8290.341475,9.825543,4.769176,0.703584
...,...,...,...,...
1435,8817.628800,5.587069,-0.148499,-0.204783
1436,8303.159628,12.154781,3.326117,0.599862
1437,8782.795955,10.857288,-0.328054,0.117620
1438,9195.174844,9.485071,-1.326640,-0.040280


In [9]:
def save_and_show(fig, basename):  # to avoid inlining large image data into the notebook file
    """Save `fig` as a PNG under DIRECTORY_DATA and display it via an <img> tag.

    The figure is referenced by path instead of being embedded, which keeps
    the notebook file small. The figure is closed afterwards.

    Args:
        fig: Matplotlib figure to save.
        basename: Stem of the file name; also used as the image's alt text.
            It may contain spaces, commas or other characters that are special
            in URLs/HTML — they are escaped in the generated tag.

    Returns:
        Path of the written PNG file (unescaped).
    """
    import html
    import urllib.parse

    filename = f'{DIRECTORY_DATA}/{basename}-{random.random()}.png'
    fig.savefig(filename)

    # The random query string defeats browser caching of the image,
    # see https://stackoverflow.com/a/43640705.
    # The path is percent-encoded and the alt text HTML-escaped so that
    # characters from `basename` cannot break the URL or the attribute.
    src = f'{urllib.parse.quote(filename)}?{random.random()}'
    display(HTML(f'<img src="{src}" alt="{html.escape(basename)}" />'))

    plt.close(fig)

    return filename


def evaluate_and_plot_column(df_test, df_predictions, column):
    """Print the R^2 score for one output column and plot actual vs. predicted.

    Args:
        df_test: Frame holding the true values in `column`.
        df_predictions: Frame holding the predicted values in `column`.
        column: Name of the output column to evaluate.
    """
    actual = df_test[column]
    predicted = df_predictions[column]

    score = sklearn.metrics.r2_score(actual, predicted)
    print(f"{column}:")
    print(f"- R^2 Score: {score}")

    # Scatter of predictions against ground truth, with the identity line
    # (perfect prediction) drawn across the range of the actual values.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(actual, predicted, color='blue', alpha=0.5)
    lowest, highest = actual.min(), actual.max()
    ax.plot([lowest, highest], [lowest, highest], 'k--', lw=2)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(f'Actual vs Predicted Values for {column}')
    ax.grid(True)
    save_and_show(fig, f'Actual_vs_Predicted_Values_{column}')


def evaluate_and_plot_all_columns(df_test, df_predictions):
    """Run `evaluate_and_plot_column` once for every key of COLUMNS_OUTPUT, in order."""
    for output_name in COLUMNS_OUTPUT.keys():
        evaluate_and_plot_column(df_test, df_predictions, output_name)
        
        
# Score and plot the linear-regression baseline on the test set.
evaluate_and_plot_all_columns(df_test, df_predictions_regression)

traveled total, m:
- R^2 Score: 0.3807744825804721


No. of completed missions:
- R^2 Score: 0.5264902565566945


No. of collisions:
- R^2 Score: 0.21391640787085497


No. of near-misses:
- R^2 Score: -0.5370411652677056


# AutoGluon

In [10]:
def run_autogluon(df_train, df_test):
    """Train one AutoGluon regressor per output column and predict on the test set.

    For every key of COLUMNS_OUTPUT a separate `TabularPredictor` is fitted on
    the training features plus that single target, its test predictions are
    stored, and its leaderboard (scored on the test data) is displayed.

    Args:
        df_train: Preprocessed training frame.
        df_test: Preprocessed test frame.

    Returns:
        Tuple `(predictors, df_predictions)`: the fitted predictors in
        COLUMNS_OUTPUT order, and a DataFrame with one prediction column per
        output column.
    """
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)

    predictors = []
    df_predictions = pd.DataFrame()
    for column in COLUMNS_OUTPUT:
        print(f'{column=}:')

        # Features plus the one target this predictor is responsible for.
        train_frame = pd.concat([X_train, y_train[[column]]], axis=1)

        predictor = autogluon.tabular.TabularPredictor(
            label=column,
            eval_metric='r2',
            problem_type='regression',
        )
        predictor.fit(
            train_frame,
            presets='medium',
            # Tree-based model families only, each with default settings.
            # Not enabled: 'CAT' (CatBoost, omitted if slow), 'NN' (neural
            # net), 'LR' (linear model), 'KNN' (k-nearest neighbours).
            hyperparameters={
                'GBM': {},  # LightGBM (TODO: something like `GBMLarge`)
                'XGB': {},  # XGBoost
                'RF': {},   # Random Forest
                'XT': {},   # Extra Trees
            },
        )
        predictors.append(predictor)

        df_predictions[column] = predictor.predict(X_test)

        # Leaderboard: per-model performance, scored on the test data.
        test_frame = pd.concat([X_test, y_test[[column]]], axis=1)
        show(
            predictor.leaderboard(test_frame, silent=True),
            f'Leaderboard for {column}',
        )

        # NOTE: per-predictor feature importance (e.g.
        # `predictor.feature_importance(...)` on the train/test frames) and
        # SHAP inspection were prototyped here and are currently disabled.

    return predictors, df_predictions


# Train the per-output AutoGluon predictors; keep them for the later
# feature-importance analysis and show the test-set predictions.
predictors, df_predictions_autogluon = run_autogluon(df_train, df_test)
df_predictions_autogluon

No path specified. Models will be saved in: "AutogluonModels/ag-20250108_151246"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
CPU Count:          16
Memory Avail:       3.14 GB / 31.09 GB (10.1%)
Disk Space Avail:   299.62 GB / 693.60 GB (43.2%)
Presets specified: ['medium']


column='traveled total, m':


Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151246"
Train Data Rows:    5736
Train Data Columns: 104
Label Column:       traveled total, m
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3210.47 MB
	Train Data (Original)  Memory Usage: 4.51 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 2 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDupli

[1000]	valid_set's l2: 347665	valid_set's r2: 0.94342
[2000]	valid_set's l2: 323701	valid_set's r2: 0.94732


	0.9476	 = Validation score   (r2)
	8.14s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForest ...
	0.9566	 = Validation score   (r2)
	4.89s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: ExtraTrees ...
	0.9594	 = Validation score   (r2)
	2.44s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: XGBoost ...
	0.9502	 = Validation score   (r2)
	18.7s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'ExtraTrees': 0.7, 'XGBoost': 0.3}
	0.9614	 = Validation score   (r2)
	0.06s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 36.75s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 5015.5 rows/s (574 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151246")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.845546,0.961371,r2,0.248911,0.114445,21.193663,0.002863,0.000883,0.059708,2,True,5
1,ExtraTrees,0.8417,0.959371,r2,0.175364,0.091735,2.437653,0.175364,0.091735,2.437653,1,True,3
2,RandomForest,0.83697,0.956584,r2,0.218093,0.035407,4.893469,0.218093,0.035407,4.893469,1,True,2
3,LightGBM,0.828601,0.947624,r2,0.051555,0.011955,8.137978,0.051555,0.011955,8.137978,1,True,1
4,XGBoost,0.79014,0.950212,r2,0.070683,0.021827,18.696301,0.070683,0.021827,18.696301,1,True,4


No path specified. Models will be saved in: "AutogluonModels/ag-20250108_151324"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
CPU Count:          16
Memory Avail:       2.74 GB / 31.09 GB (8.8%)
Disk Space Avail:   299.39 GB / 693.60 GB (43.2%)
Presets specified: ['medium']
Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151324"
Train Data Rows:    5736
Train Data Columns: 104
Label Column:       No. of completed missions
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2804.89 MB
	Train Data (Original)  Memory Usage: 4.51 MB (0.2% of available 

column='No. of completed missions':


Data preprocessing and feature engineering runtime = 0.18s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 5162, Val Rows: 574
User-specified model hyperparameters to be fit:
{
	'GBM': [{}],
	'XGB': [{}],
	'RF': [{}],
	'XT': [{}],
}
Fitting 4 L1 models, fit_strategy="sequential" ...
Fitting model: LightGBM ...


[1000]	valid_set's l2: 0.764503	valid_set's r2: 0.958766
[2000]	valid_set's l2: 0.681422	valid_set's r2: 0.963247
[3000]	valid_set's l2: 0.680454	valid_set's r2: 0.963299
[4000]	valid_set's l2: 0.673706	valid_set's r2: 0.963663
[5000]	valid_set's l2: 0.670637	valid_set's r2: 0.963829
[6000]	valid_set's l2: 0.667256	valid_set's r2: 0.964011
[7000]	valid_set's l2: 0.664964	valid_set's r2: 0.964135
[8000]	valid_set's l2: 0.667529	valid_set's r2: 0.963997


	0.9642	 = Validation score   (r2)
	23.77s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: RandomForest ...
	0.9667	 = Validation score   (r2)
	6.0s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: ExtraTrees ...
	0.9656	 = Validation score   (r2)
	1.87s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: XGBoost ...
	0.9748	 = Validation score   (r2)
	19.97s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'XGBoost': 0.778, 'ExtraTrees': 0.222}
	0.9757	 = Validation score   (r2)
	0.05s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 54.25s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 6200.9 rows/s (574 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151324")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTrees,0.907104,0.965611,r2,0.072996,0.06986,1.869576,0.072996,0.06986,1.869576,1,True,3
1,WeightedEnsemble_L2,0.902336,0.975672,r2,0.128832,0.092568,21.892047,0.002034,0.000629,0.053733,2,True,5
2,LightGBM,0.901814,0.964176,r2,0.136398,0.058014,23.765363,0.136398,0.058014,23.765363,1,True,1
3,XGBoost,0.89357,0.974843,r2,0.053802,0.022079,19.968738,0.053802,0.022079,19.968738,1,True,4
4,RandomForest,0.868722,0.966657,r2,0.072397,0.057879,6.004179,0.072397,0.057879,6.004179,1,True,2


No path specified. Models will be saved in: "AutogluonModels/ag-20250108_151418"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
CPU Count:          16
Memory Avail:       2.88 GB / 31.09 GB (9.3%)
Disk Space Avail:   299.27 GB / 693.60 GB (43.1%)
Presets specified: ['medium']
Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151418"
Train Data Rows:    5736
Train Data Columns: 104
Label Column:       No. of collisions
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2952.86 MB
	Train Data (Original)  Memory Usage: 4.51 MB (0.2% of available memory)


column='No. of collisions':
[1000]	valid_set's l2: 1.34678	valid_set's r2: 0.866105


	0.869	 = Validation score   (r2)
	6.03s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: RandomForest ...


[2000]	valid_set's l2: 1.32713	valid_set's r2: 0.868059


	0.881	 = Validation score   (r2)
	5.83s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: ExtraTrees ...
	0.8741	 = Validation score   (r2)
	1.85s	 = Training   runtime
	0.07s	 = Validation runtime
Fitting model: XGBoost ...
	0.8922	 = Validation score   (r2)
	13.59s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'XGBoost': 0.706, 'RandomForest': 0.294}
	0.8945	 = Validation score   (r2)
	0.07s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 29.64s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 7464.0 rows/s (574 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151418")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTrees,0.680489,0.874059,r2,0.10314,0.073524,1.850991,0.10314,0.073524,1.850991,1,True,3
1,RandomForest,0.665915,0.881007,r2,0.088049,0.06021,5.826284,0.088049,0.06021,5.826284,1,True,2
2,WeightedEnsemble_L2,0.662793,0.894472,r2,0.136564,0.076902,19.485552,0.003052,0.000949,0.069427,2,True,5
3,LightGBM,0.648759,0.869029,r2,0.042716,0.016002,6.029978,0.042716,0.016002,6.029978,1,True,1
4,XGBoost,0.64096,0.89216,r2,0.045462,0.015744,13.589841,0.045462,0.015744,13.589841,1,True,4


No path specified. Models will be saved in: "AutogluonModels/ag-20250108_151449"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
CPU Count:          16
Memory Avail:       3.13 GB / 31.09 GB (10.1%)
Disk Space Avail:   299.15 GB / 693.60 GB (43.1%)
Presets specified: ['medium']
Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151449"
Train Data Rows:    5736
Train Data Columns: 104
Label Column:       No. of near-misses
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    3207.46 MB
	Train Data (Original)  Memory Usage: 4.51 MB (0.1% of available memory

column='No. of near-misses':


Automatically generating train/validation split with holdout_frac=0.1, Train Rows: 5162, Val Rows: 574
User-specified model hyperparameters to be fit:
{
	'GBM': [{}],
	'XGB': [{}],
	'RF': [{}],
	'XT': [{}],
}
Fitting 4 L1 models, fit_strategy="sequential" ...
Fitting model: LightGBM ...
	0.6504	 = Validation score   (r2)
	3.74s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForest ...


[1000]	valid_set's l2: 0.281864	valid_set's r2: 0.64617


	0.6491	 = Validation score   (r2)
	6.29s	 = Training   runtime
	0.09s	 = Validation runtime
Fitting model: ExtraTrees ...
	0.6648	 = Validation score   (r2)
	1.96s	 = Training   runtime
	0.08s	 = Validation runtime
Fitting model: XGBoost ...
	0.6613	 = Validation score   (r2)
	9.04s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	Ensemble Weights: {'ExtraTrees': 0.471, 'XGBoost': 0.412, 'LightGBM': 0.118}
	0.6753	 = Validation score   (r2)
	0.05s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 23.37s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 5623.2 rows/s (574 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20250108_151449")


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTrees,-0.634064,0.664817,r2,0.064721,0.083189,1.963291,0.064721,0.083189,1.963291,1,True,3
1,WeightedEnsemble_L2,-0.691048,0.675298,r2,0.110409,0.102077,14.784313,0.002306,0.000595,0.048179,2,True,5
2,XGBoost,-0.803467,0.661332,r2,0.022964,0.008614,9.035498,0.022964,0.008614,9.035498,1,True,4
3,LightGBM,-1.042679,0.650432,r2,0.020417,0.009679,3.737345,0.020417,0.009679,3.737345,1,True,1
4,RandomForest,-1.180127,0.649087,r2,0.066401,0.092659,6.292499,0.066401,0.092659,6.292499,1,True,2


Unnamed: 0,"traveled total, m",No. of completed missions,No. of collisions,No. of near-misses
0,5182.628906,6.920036,0.088024,-0.046428
1,8134.416992,7.474953,0.927449,0.003770
2,6122.679199,8.166824,-0.155424,0.005856
3,9235.293945,6.408742,1.123160,0.040558
4,5250.327148,7.155142,0.072012,0.002781
...,...,...,...,...
1435,9153.595703,6.034895,0.010371,0.008559
1436,5946.552246,8.723999,-0.133251,0.970391
1437,9064.640625,9.477354,-0.009502,0.910473
1438,8956.281250,9.097409,-0.011393,0.033908


## evaluate_and_plot_all_columns

In [11]:
# Same evaluation as for the baseline, now on the AutoGluon predictions.
evaluate_and_plot_all_columns(df_test, df_predictions_autogluon)

traveled total, m:
- R^2 Score: 0.8455460668569312


No. of completed missions:
- R^2 Score: 0.9023364782333374


No. of collisions:
- R^2 Score: 0.6627932786941528


No. of near-misses:
- R^2 Score: -0.6910476684570312


## explain_predictions

In [12]:
def explain_predictions(predictors, df_test):
    """Display LightGBM gain importances, summed per original input column.

    For each (output column, predictor) pair the predictor's 'LightGBM' model
    is loaded and its gain-based feature importances are read. Expanded
    features such as 'Linearization C#17' are grouped under the name before
    '#<digits>', the importances are summed per group, and the result is shown
    sorted from most to least important.

    NOTE(review): importances are paired with `feature_metadata.get_features()`
    by position; this assumes both are in the same order and of equal length
    (`zip` silently truncates otherwise) — confirm.

    Args:
        predictors: Fitted predictors, in COLUMNS_OUTPUT order.
        df_test: Currently unused.
    """
    for column, predictor in zip(COLUMNS_OUTPUT, predictors):
        lightgbm_model = predictor._trainer.load_model('LightGBM')
        gains = lightgbm_model.model.feature_importance(importance_type='gain')
        feature_names = predictor.feature_metadata.get_features()

        per_feature = pd.DataFrame.from_dict(
            {name: {'importance': gain} for name, gain in zip(feature_names, gains)},
            orient='index',
        )

        # Strip an optional '#<digits>' suffix to recover the source column.
        per_feature['group'] = per_feature.index.str.extract(
            r'^(.+?)(?:#\d+)?$', expand=False
        )
        per_group = (
            per_feature.groupby('group')['importance']
            .sum()
            .to_frame()
            .sort_values(by='importance', ascending=False)
        )
        show(per_group, column)
        
        
# One importance table per output column, in COLUMNS_OUTPUT order.
explain_predictions(predictors, df_test)

Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Linearization C,248623700000.0
Coordination strategy,51750460000.0
Vehicle ID,30875880000.0
isCanPassFirstHum,1256243000.0
isRacingThroughCrossroadAllowed,458281300.0


Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Linearization C,837631.333164
Coordination strategy,93680.758545
Vehicle ID,50492.565954
isCanPassFirstHum,2634.737267
isRacingThroughCrossroadAllowed,1058.428485


Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Linearization C,301076.57698
Coordination strategy,121063.380209
Vehicle ID,107510.925374
isRacingThroughCrossroadAllowed,12720.081166
isCanPassFirstHum,1810.975779


Unnamed: 0_level_0,importance
group,Unnamed: 1_level_1
Linearization C,34877.544957
Coordination strategy,3336.126039
Vehicle ID,2188.727619
isRacingThroughCrossroadAllowed,642.932879
isCanPassFirstHum,221.631843
