In [1]:
import ast
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from autogluon.tabular import TabularPredictor
from IPython.display import display, HTML

DIRECTORY_IMAGES = 'images/model-autogluon'

COLUMNS_INPUT = [
    'Coordination strategy',
    'Vehicle ID', 
    'Linearization C', 
]
COLUMNS_OUTPUT = [
    'traveled total, m', 
    'No. of violations',
]

In [2]:
df_inout = pd.read_csv('data/df_all.csv')[COLUMNS_INPUT + COLUMNS_OUTPUT]
df_inout

Unnamed: 0,Coordination strategy,Vehicle ID,Linearization C,"traveled total, m",No. of violations
0,stops,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9091.8,15
1,change of priorities,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",9089.8,22
2,baseline,0,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4764.1,0
3,stops,1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6388.6,0
4,change of priorities,1,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6427.6,0
...,...,...,...,...,...
2395,change of priorities,2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",7950.6,0
2396,baseline,2,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8891.2,0
2397,stops,3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8313.1,0
2398,change of priorities,3,"(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",8734.3,0


In [3]:
def parse_tuple_string(tuple_string):
    return np.array(ast.literal_eval(tuple_string))
    

def preprocess_inout(df_inout):
    df_inout = df_inout.copy()
    
    # Find the column that starts with 'Linearization'
    linearization_column, = [col for col in df_inout.columns if col.startswith('Linearization')]
    df_inout[linearization_column] = df_inout[linearization_column].apply(parse_tuple_string)
    size_linearization = len(df_inout[linearization_column].iloc[0])
    
    # Create DataFrame with all preprocessed features
    df_preprocessed = pd.DataFrame({
        'Coordination strategy': df_inout['Coordination strategy'].astype('category').cat.codes.values,
        'Vehicle ID': df_inout['Vehicle ID'].values,
        **{f'{linearization_column}#{i}': df_inout[linearization_column].apply(lambda x: x[i]) 
           for i in range(size_linearization)},

        **{col: df_inout[col].values for col in COLUMNS_OUTPUT}
    })
    
    return df_preprocessed
    

df_preprocessed = preprocess_inout(df_inout)
df_preprocessed

Unnamed: 0,Coordination strategy,Vehicle ID,Linearization C#0,Linearization C#1,Linearization C#2,Linearization C#3,Linearization C#4,Linearization C#5,Linearization C#6,Linearization C#7,...,Linearization C#92,Linearization C#93,Linearization C#94,Linearization C#95,Linearization C#96,Linearization C#97,Linearization C#98,Linearization C#99,"traveled total, m",No. of violations
0,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9091.8,15
1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9089.8,22
2,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4764.1,0
3,2,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6388.6,0
4,1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6427.6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7950.6,0
2396,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8891.2,0
2397,2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8313.1,0
2398,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8734.3,0


In [4]:
def show(obj, title=None):
    if title is not None:
        display(HTML(f"<h3>{title}</h3>"))
    display(obj)

df_train, df_test = train_test_split(df_preprocessed, test_size=0.2, random_state=1)
show(df_train, 'df_train')
show(df_test, 'df_test')

Unnamed: 0,Coordination strategy,Vehicle ID,Linearization C#0,Linearization C#1,Linearization C#2,Linearization C#3,Linearization C#4,Linearization C#5,Linearization C#6,Linearization C#7,...,Linearization C#92,Linearization C#93,Linearization C#94,Linearization C#95,Linearization C#96,Linearization C#97,Linearization C#98,Linearization C#99,"traveled total, m",No. of violations
2199,0,1,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8582.5,0
1070,1,0,0.0,0.0,0.0,0.0,0.747253,0.747253,0.747253,0.747253,...,1.202295,1.202295,1.202295,1.202295,1.202295,1.202295,1.202295,1.202295,577.6,3
1264,2,1,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7142.5,0
1938,0,2,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8460.9,0
1737,2,3,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7485.9,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,1,0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.845786,0.845786,0.845786,0.845786,0.845786,0.845786,0.845786,0.924357,5113.5,6
905,2,1,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6611.4,0
1096,1,1,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8775.8,0
235,2,2,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8149.7,0


Unnamed: 0,Coordination strategy,Vehicle ID,Linearization C#0,Linearization C#1,Linearization C#2,Linearization C#3,Linearization C#4,Linearization C#5,Linearization C#6,Linearization C#7,...,Linearization C#92,Linearization C#93,Linearization C#94,Linearization C#95,Linearization C#96,Linearization C#97,Linearization C#98,Linearization C#99,"traveled total, m",No. of violations
1348,2,1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7375.4,0
468,1,0,0.0,0.0,0.0,0.0,0.0,0.365729,0.365729,0.365729,...,0.855524,0.855524,0.855524,0.855524,0.855524,0.855524,0.855524,0.855524,294.7,1
1463,1,3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8818.6,0
2267,1,3,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,7746.5,0
943,2,2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8500.5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1386,0,2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9182.7,0
1384,1,1,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,8579.9,0
902,2,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.455238,0.455238,0.455238,0.455238,0.000000,0.000000,0.000000,0.000000,8890.8,14
2036,0,2,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,9233.5,0


In [5]:
def split_df_to_X_y(df):
    columns_input_df = [col for col in df.columns
                        if col.split('#')[0] in COLUMNS_INPUT]
    columns_output_df = COLUMNS_OUTPUT
    assert set(columns_input_df + columns_output_df) == set(df.columns)
    
    X = df[columns_input_df]
    y = df[columns_output_df]
    return X, y


def run_regression(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    ndarray_predictions = model.predict(X_test)
    df_predictions = pd.DataFrame(ndarray_predictions, columns=y_test.columns)
    return df_predictions


df_predictions_regression = run_regression(df_train, df_test)
df_predictions_regression

Unnamed: 0,"traveled total, m",No. of violations
0,7730.994273,3.713093
1,3225.839960,6.641540
2,6304.365614,-0.591070
3,8139.033548,-0.566931
4,7885.619272,1.162159
...,...,...
475,8704.660951,-0.365057
476,8315.777138,0.670042
477,6972.419314,8.864282
478,8621.142460,-1.036848


In [6]:
def save_and_show(fig, basename):  # to avoid inlining large image data into the notebook file
    filename = f'{DIRECTORY_IMAGES}/{basename}.png'
    fig.savefig(filename)
    
    # The `random` is because of https://stackoverflow.com/a/43640705.
    display(HTML(f'<img src="{filename}?{random.random()}" alt="{basename}" />'))
    
    plt.close(fig)
    
    return filename


def evaluate_and_plot_column(df_test, df_predictions, column):
    y_test_column = df_test[column]
    predictions_column = df_predictions[column]
    
    r2 = r2_score(y_test_column, predictions_column)
    print(f"{column}:")
    print(f"- R^2 Score: {r2}")

    # Plot results for each output column
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(y_test_column, predictions_column, color='blue', alpha=0.5)
    plt.plot([y_test_column.min(), y_test_column.max()], [y_test_column.min(), y_test_column.max()], 'k--', lw=2)
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'Actual vs Predicted Values for {column}')
    plt.grid(True)
    save_and_show(fig, f'Actual_vs_Predicted_Values_{column}')


def evaluate_and_plot_all_columns(df_test, df_predictions):
    for column in COLUMNS_OUTPUT:
        evaluate_and_plot_column(df_test, df_predictions, column)
        
        
evaluate_and_plot_all_columns(df_test, df_predictions_regression)

traveled total, m:
- R^2 Score: 0.5047606611635149


No. of violations:
- R^2 Score: 0.5218750516149837


In [7]:
def run_autogluon(df_train, df_test):
    X_train, y_train = split_df_to_X_y(df_train)
    X_test, y_test = split_df_to_X_y(df_test)
    
    # Train AutoGluon model
    predictors = []
    for column in COLUMNS_OUTPUT:
        print(f'{column=}:')
        df_train_predictor = pd.concat([X_train, y_train[[column]]], axis=1)
        predictor = TabularPredictor(
            label=column, 
            eval_metric='r2', 
            problem_type='regression',
        ).fit(
            df_train_predictor,
            presets='medium',
        )
        predictors.append(predictor)
    
    # Make predictions
    df_predictions = pd.DataFrame()
    for column, predictor in zip(COLUMNS_OUTPUT, predictors):
        df_predictions[column] = predictor.predict(X_test)

        # Leaderboard - Display a table of different models and their performance
        df_test_predictor = pd.concat([X_test, y_test[[column]]], axis=1)
        leaderboard = predictor.leaderboard(df_test_predictor, silent=True)
        show(leaderboard, f'Leaderboard for {column}')
        
    return df_predictions


df_predictions_autogluon = run_autogluon(df_train, df_test)
df_predictions_autogluon

No path specified. Models will be saved in: "AutogluonModels/ag-20241206_153034"
Preset alias specified: 'medium' maps to 'medium_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
CPU Count:          16
Memory Avail:       5.53 GB / 31.09 GB (17.8%)
Disk Space Avail:   321.99 GB / 693.60 GB (46.4%)
Presets specified: ['medium']
Beginning AutoGluon training ...
AutoGluon will save models to "/home/olga/coordination_oru/scenario-analysis/AutogluonModels/ag-20241206_153034"
Train Data Rows:    1920
Train Data Columns: 102
Label Column:       traveled total, m
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5661.00 MB
	Train Data (Original)  Memory Usage: 1.48 MB (0.0% of available memory)

column='traveled total, m':


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', []) : 100 | ['Linearization C#0', 'Linearization C#1', 'Linearization C#2', 'Linearization C#3', 'Linearization C#4', ...]
		('int', [])   :   2 | ['Coordination strategy', 'Vehicle ID']
	Types of features in processed data (raw dtype, special dtypes):
		('float', []) : 100 | ['Linearization C#0', 'Linearization C#1', 'Linearization C#2', 'Linearization C#3', 'Linearization C#4', ...]
		('int', [])   :   2 | ['Coordination strategy', 'Vehicle ID']
	0.1s = Fit runtime
	102 features in original data used to generate 102 features in processed data.
	Train Data (Processed) Memory Usage: 1.48 MB (0.0% of available memory)

column='No. of violations':


Data preprocessing and feature engineering runtime = 0.19s ...
AutoGluon will gauge predictive performance using evaluation metric: 'r2'
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.2, Train Rows: 1536, Val Rows: 384
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': [{}],
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 'min_data_in_leaf': 3, 'ag_args': {'name_suffix': 'Large', 'priority': 0, 'hyperparameter_tune_kwargs': None}}],
	'CAT': [{}],
	'XGB': [{}],
	'FASTAI': [{}],
	'RF': [{'criterion': 'gini', 'ag_args': {'name_suffix': 'Gini', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'entropy', 'ag_args': {'name_suffix': 'Entr', 'problem_types': ['binary', 'multiclass']}}, {'criterion': 'squared_error', 'ag_args': {'name_suffix': 'MSE', 'problem_types': ['regression', 'quantile']}}],
	

Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.844377,0.803744,r2,0.173843,0.099519,191.952489,0.002551,0.001254,0.156714,2,True,12
1,CatBoost,0.842957,0.793449,r2,0.036811,0.007267,177.677811,0.036811,0.007267,177.677811,1,True,6
2,ExtraTreesMSE,0.818142,0.786002,r2,0.077709,0.035586,0.77109,0.077709,0.035586,0.77109,1,True,7
3,RandomForestMSE,0.816173,0.774118,r2,0.110368,0.034504,1.818202,0.110368,0.034504,1.818202,1,True,5
4,NeuralNetFastAI,0.796733,0.676345,r2,0.017613,0.009062,1.752168,0.017613,0.009062,1.752168,1,True,8
5,LightGBM,0.786161,0.74111,r2,0.007645,0.001722,0.923828,0.007645,0.001722,0.923828,1,True,4
6,LightGBMLarge,0.769238,0.737839,r2,0.009151,0.003524,3.852309,0.009151,0.003524,3.852309,1,True,11
7,NeuralNetTorch,0.761487,0.681827,r2,0.056772,0.055413,13.346874,0.056772,0.055413,13.346874,1,True,10
8,XGBoost,0.761129,0.696758,r2,0.015361,0.004202,3.429209,0.015361,0.004202,3.429209,1,True,9
9,LightGBMXT,0.704957,0.571774,r2,0.007043,0.001826,0.996371,0.007043,0.001826,0.996371,1,True,3


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,LightGBM,0.954721,0.957638,r2,0.002654,0.001832,1.200268,0.002654,0.001832,1.200268,1,True,4
1,WeightedEnsemble_L2,0.951125,0.965484,r2,0.112921,0.067646,75.809819,0.003836,0.001099,0.174867,2,True,12
2,CatBoost,0.950006,0.958417,r2,0.01287,0.00352,5.423127,0.01287,0.00352,5.423127,1,True,6
3,RandomForestMSE,0.948938,0.950117,r2,0.074716,0.052119,1.039677,0.074716,0.052119,1.039677,1,True,5
4,XGBoost,0.943916,0.960997,r2,0.011922,0.004128,2.455669,0.011922,0.004128,2.455669,1,True,9
5,LightGBMLarge,0.942817,0.955789,r2,0.006787,0.002264,4.290669,0.006787,0.002264,4.290669,1,True,11
6,NeuralNetFastAI,0.942185,0.942224,r2,0.018634,0.011361,2.04996,0.018634,0.011361,2.04996,1,True,8
7,ExtraTreesMSE,0.939494,0.949903,r2,0.070424,0.047139,0.534425,0.070424,0.047139,0.534425,1,True,7
8,LightGBMXT,0.927756,0.944752,r2,0.004696,0.002106,1.026854,0.004696,0.002106,1.026854,1,True,3
9,NeuralNetTorch,0.910702,0.953141,r2,0.063004,0.045706,64.505928,0.063004,0.045706,64.505928,1,True,10


Unnamed: 0,"traveled total, m",No. of violations
1348,7064.013184,0.034199
468,4182.948730,6.281535
1463,8513.636719,0.034743
2267,7952.044434,0.039039
943,8228.262695,0.035899
...,...,...
1386,9101.824219,0.036325
1384,7988.605469,0.037073
902,8807.481445,13.952355
2036,9053.058594,0.036298


In [8]:
evaluate_and_plot_all_columns(df_test, df_predictions_autogluon)

traveled total, m:
- R^2 Score: 0.8443770315708323


No. of violations:
- R^2 Score: 0.951124906539917
