In [13]:
import ast
import os
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from autogluon.tabular import TabularPredictor
import IPython.display

# Directory (relative path) into which save_and_show() writes its PNG figures.
DIRECTORY_IMAGES = 'images/model-autogluon'


def load_data(filepath):
    """Read a CSV file and keep only the rows of the baseline strategy.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Passed straight to ``pd.read_csv``. The data must contain a
        'Coordination strategy' column.

    Returns
    -------
    pandas.DataFrame
        The rows whose 'Coordination strategy' equals 'baseline'; the index
        labels of the surviving rows are those assigned by ``read_csv``.
    """
    frame = pd.read_csv(filepath)
    is_baseline = frame['Coordination strategy'] == 'baseline'
    # A further `frame['Vehicle ID'] != 0` condition is currently disabled.
    return frame[is_baseline]


def prepare_data(df, input_columns, output_columns):
    """Split a frame into independent feature and target frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table.
    input_columns : list
        Column labels to use as model inputs.
    output_columns : list
        Column labels to use as model outputs.

    Returns
    -------
    tuple
        ``(inputs, outputs)``; both are copies, so modifying them does not
        touch ``df``.
    """
    return df[input_columns].copy(), df[output_columns].copy()


def parse_tuple_string(tuple_string):
    """Turn the textual form of a Python literal into a NumPy array.

    Parameters
    ----------
    tuple_string : str
        Text such as ``"(1.0, 2.0, 3.0)"``; it is evaluated with
        ``ast.literal_eval``, which accepts literals only.

    Returns
    -------
    numpy.ndarray
        ``np.array`` applied to the evaluated literal.
    """
    parsed_literal = ast.literal_eval(tuple_string)
    return np.array(parsed_literal)


def preprocess_input(df_input):
    """Build the numeric feature matrix from the raw input frame.

    The 'Linearization A' column holds string-encoded tuples; each one is
    parsed with ``parse_tuple_string`` and the results are stacked row-wise.
    The 'Vehicle ID' values are prepended as the first column.

    Unlike the previous version, ``df_input`` is NOT modified: the parsed
    arrays are kept in a local Series. This makes the function idempotent, so
    re-running the cell on the same frame no longer fails when
    ``ast.literal_eval`` is handed an already-parsed array.

    Parameters
    ----------
    df_input : pandas.DataFrame
        Must contain the columns 'Vehicle ID' and 'Linearization A'.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of ``df_input``: the vehicle ID
        followed by the parsed 'Linearization A' entries.
    """
    column = 'Linearization A'
    # Parse into a local Series instead of writing back into the caller's frame.
    parsed = df_input[column].apply(parse_tuple_string)
    vehicle_ids = df_input['Vehicle ID'].values.reshape(-1, 1)
    X = np.concatenate([
        vehicle_ids,
        np.stack(parsed.values)
    ], axis=1)
    return X


def split_data(X, y, test_size=0.2, random_state=42):
    """Randomly partition features and targets into train and test subsets.

    Parameters
    ----------
    X, y : array-like
        Features and targets.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    list
        Whatever ``train_test_split`` returns for two inputs, i.e.
        ``[X_train, X_test, y_train, y_test]``.
    """
    split_options = {'test_size': test_size, 'random_state': random_state}
    return train_test_split(X, y, **split_options)


def standardize_data(X_train, X_test):
    """Standardize both splits using statistics learned from the training split.

    A fresh ``StandardScaler`` is fitted on ``X_train`` only and then applied
    to ``X_test``. The fitted scaler itself is not returned.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)``.
    """
    scaler = StandardScaler()
    scaled_train = scaler.fit_transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test


def save_and_show(fig, basename):  # to avoid inlining large image data into the notebook file
    """Save a figure as a PNG file and display it through an ``<img>`` tag.

    The figure is written to ``{DIRECTORY_IMAGES}/{basename}.png`` and shown
    in the notebook by reference, so the image bytes are not embedded in the
    notebook file. The figure is closed afterwards.

    Parameters
    ----------
    fig : matplotlib.figure.Figure
        Figure to save; anything with a ``savefig(path)`` method works.
    basename : str
        File name without extension; also used as the ``alt`` text.

    Returns
    -------
    str
        Path of the written PNG file.
    """
    # Create the output directory on demand; otherwise savefig raises
    # FileNotFoundError on a fresh checkout where the directory is missing.
    os.makedirs(DIRECTORY_IMAGES, exist_ok=True)

    filename = f'{DIRECTORY_IMAGES}/{basename}.png'
    fig.savefig(filename)

    # The `random` is because of https://stackoverflow.com/a/43640705.
    # (Presumably the changing query string keeps the browser from showing a cached image.)
    IPython.display.display(IPython.display.HTML(f'<img src="{filename}?{random.random()}" alt="{basename}" />'))

    plt.close(fig)

    return filename


def run_autogluon(X_train, X_test, y_train, y_test):
    """Fit AutoGluon on the training split, report test metrics and plot the fit.

    Steps, in order: wrap the arrays in DataFrames with a 'target' column,
    fit a ``TabularPredictor`` (metric 'r2'), predict on the test features,
    print MSE and R^2, compute the leaderboard on the test data, and save and
    show an actual-vs-predicted scatter plot via ``save_and_show``.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices.
    y_train, y_test : array-like
        Targets; ``y_test`` must provide ``.min()`` and ``.max()``.

    Returns
    -------
    The object returned by ``predictor.leaderboard(test_data, silent=True)``.
    """
    label = 'target'

    # AutoGluon takes a single table in which the label is just another column.
    train_data = pd.DataFrame(X_train).assign(**{label: y_train})
    test_data = pd.DataFrame(X_test).assign(**{label: y_test})

    # Fit with default settings, scoring candidate models by R^2.
    predictor = TabularPredictor(label=label, eval_metric='r2')
    predictor = predictor.fit(train_data)

    # Predict from the feature columns only.
    test_features = test_data.drop(columns=[label])
    predictions = predictor.predict(test_features)

    # Hold-out metrics.
    mse = mean_squared_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    print(f"Mean Squared Error: {mse}")
    print(f"R^2 Score: {r2}")

    # Per-model comparison table, scored on the test data.
    leaderboard = predictor.leaderboard(test_data, silent=True)

    # Scatter of actual vs. predicted values with the y = x reference line.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(y_test, predictions, color='blue', alpha=0.5)
    value_range = [y_test.min(), y_test.max()]
    ax.plot(value_range, value_range, 'k--', lw=2)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title('Actual vs Predicted Values')
    ax.grid(True)
    save_and_show(fig, 'Actual vs Predicted Values')

    return leaderboard


# Load the dataset (load_data keeps only the 'baseline' coordination-strategy rows)
# and separate it into model inputs and the single output column.
df = load_data('data/df_all.csv')
input_columns = ['Vehicle ID', 'Linearization A']
output_columns = ['traveled total, m']
df_input, df_output = prepare_data(df, input_columns, output_columns)

# Build the numeric feature matrix; flatten the one-column target frame into a 1-D array.
X = preprocess_input(df_input)
y = df_output.values.flatten()

# Split data into training and testing sets (split_data defaults: test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_data(X, y)

# Standardize the features (the scaler is fitted on the training split only)
X_train, X_test = standardize_data(X_train, X_test)

# Run AutoGluon to choose the best model; as the last expression of the cell,
# the returned leaderboard is what the notebook displays.
run_autogluon(X_train, X_test, y_train, y_test)


No path specified. Models will be saved in: "AutogluonModels/ag-20241206_073127"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.12.2
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #135~20.04.1-Ubuntu SMP Mon Oct 7 13:56:22 UTC 2024
CPU Count:          16
Memory Avail:       8.41 GB / 31.09 GB (27.0%)
Disk Space Avail:   322.90 GB / 693.60 GB (46.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong acc

Mean Squared Error: 483733.52682304505
R^2 Score: 0.8984530454899395


Unnamed: 0,model,score_test,score_val,eval_metric,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,ExtraTreesMSE,0.902673,0.831752,r2,0.050347,0.034466,0.390823,0.050347,0.034466,0.390823,1,True,7
1,WeightedEnsemble_L2,0.898453,0.882129,r2,0.054772,0.036829,4.814109,0.002114,0.000638,0.088797,2,True,12
2,LightGBM,0.889104,0.868395,r2,0.001638,0.000721,0.60556,0.001638,0.000721,0.60556,1,True,4
3,CatBoost,0.888327,0.878706,r2,0.007601,0.001914,0.485107,0.007601,0.001914,0.485107,1,True,6
4,LightGBMXT,0.887426,0.862109,r2,0.00179,0.000842,0.578544,0.00179,0.000842,0.578544,1,True,3
5,XGBoost,0.887105,0.852467,r2,0.008735,0.002085,0.427995,0.008735,0.002085,0.427995,1,True,9
6,NeuralNetTorch,0.886164,0.849136,r2,0.043419,0.033556,3.634645,0.043419,0.033556,3.634645,1,True,10
7,RandomForestMSE,0.886151,0.828967,r2,0.049078,0.035204,0.474757,0.049078,0.035204,0.474757,1,True,5
8,LightGBMLarge,0.869599,0.828983,r2,0.004427,0.000864,0.951037,0.004427,0.000864,0.951037,1,True,11
9,KNeighborsDist,0.86377,0.742775,r2,0.001714,0.001654,0.016916,0.001714,0.001654,0.016916,1,True,2
