In [None]:
# Utility imports
import pickle
from collections import defaultdict
import src.features.preprocessing as prep

# Math and matrix manipulation imports
import numpy as np

# Machine learning imports
import river
from river import ensemble
from river import linear_model
from river import preprocessing
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error

In [None]:
# Folder the traffic data files are read from (relative to the working directory).
DATA_PATH = "./datasets/traffic/"

# Pipeline: list the files under DATA_PATH (sorted), load them into `df`,
# then derive `train_df` / `test_df` using split_point=20000.
# NOTE(review): exact semantics of these helpers live in
# src.features.preprocessing and are not visible here — confirm that
# split_point is a row position.
list_of_datafiles = prep.get_list_of_datafiles(DATA_PATH, sort=True)
df = prep.load_data(DATA_PATH, list_of_datafiles)
train_df, test_df = prep.make_train_test_datasets(df, split_point=20000)

In [None]:
def smape(A, F):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``100 / len(A) * sum(2 * |F - A| / (|A| + |F|))``.

    Parameters
    ----------
    A : array-like
        Actual (ground-truth) values.
    F : array-like
        Forecast values, broadcastable against ``A``.

    Returns
    -------
    float
        The sMAPE score scaled by 100.

    Raises
    ------
    ZeroDivisionError
        If ``A`` is empty.

    Notes
    -----
    A sample whose actual and forecast values are both zero would evaluate
    to ``0 / 0``. Such a sample is a perfect prediction, so it contributes
    a zero error instead of turning the whole score into NaN.
    """
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)

    doubled_error = 2 * np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is non-zero; the remaining slots
    # keep the 0.0 they were pre-filled with.
    ratios = np.divide(
        doubled_error,
        scale,
        out=np.zeros_like(doubled_error),
        where=scale != 0,
    )
    return 100 / len(actual) * np.sum(ratios)

def mda(y_true, y_pred):
    """Mean directional accuracy of a forecast.

    For every step ``t >= 1`` the direction of the actual move
    ``sign(y_true[t] - y_true[t-1])`` is compared with the direction of the
    forecast move ``sign(y_pred[t] - y_true[t-1])``; the score is the
    fraction of steps where both directions agree.

    Parameters
    ----------
    y_true : array-like
        Actual values, ordered in time.
    y_pred : array-like
        Forecast values aligned with ``y_true``.

    Returns
    -------
    float
        Share of steps (between 0 and 1) whose direction was forecast
        correctly. With fewer than two samples there is nothing to compare
        and the result is NaN.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)

    # Both directions are measured from the same baseline: the previous
    # actual observation.
    previous_actual = actual[:-1]
    actual_direction = np.sign(actual[1:] - previous_actual)
    forecast_direction = np.sign(forecast[1:] - previous_actual)

    return np.mean(actual_direction == forecast_direction)

def calculate_metrics(y_real, y_pred):
    """Score a forecast with every metric used in this notebook.

    Parameters
    ----------
    y_real : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values aligned with ``y_real``.

    Returns
    -------
    dict
        Maps "MAPE", "sMAPE", "MAE" and "MDA" to the value returned by the
        corresponding scorer. "sMAPE" comes from ``smape`` and is scaled by
        100. NOTE(review): "MAPE" is whatever sklearn's
        ``mean_absolute_percentage_error`` returns — presumably a fraction,
        not a percentage; confirm before comparing it with "sMAPE".
    """
    # Order matters only for the key order of the resulting dict.
    scorers = (
        ("MAPE", mean_absolute_percentage_error),
        ("sMAPE", smape),
        ("MAE", mean_absolute_error),
        ("MDA", mda),
    )
    return {label: scorer(y_real, y_pred) for label, scorer in scorers}

In [None]:
def train_incremental_regressor(n_input_size: int, n_output_size: int, target: str, verbose: bool = False):
    """Run a test-then-train evaluation of a scaled online linear regression.

    The module-level ``df[target]`` series is turned into (input, output)
    samples by ``prep.split_sequence``. Each sample is first predicted and
    scored, and only afterwards used to update the model.

    Parameters
    ----------
    n_input_size : int
        Forwarded to ``prep.split_sequence`` as ``n_input_steps``.
    n_output_size : int
        Forwarded to ``prep.split_sequence`` as ``n_output_steps``. Only the
        first output step (``yi[0]``) is predicted, learned and scored.
    target : str
        Column of ``df`` to model; "->" is replaced by "-" in the name of
        the results file.
    verbose : bool, default False
        When True, print ground truth and prediction for every sample.
        Off by default because one line per sample floods the notebook.

    Returns
    -------
    tuple
        ``(data, metrics)`` where ``data`` holds "y_real" (first-step
        targets, 1-D), "y_pred" (1-D predictions) and "metric" (the running
        river SMAPE object), and ``metrics`` is the dict produced by
        ``calculate_metrics``. ``metrics`` is also pickled under
        ``./results/``.
    """
    X_online, y_online = prep.split_sequence(
        sequence=df[target],
        n_input_steps=n_input_size,
        n_output_steps=n_output_size,
    )
    X_online = X_online.reshape((X_online.shape[0], X_online.shape[1]))

    online_model = river.compose.Pipeline(
        river.preprocessing.StandardScaler(),
        river.linear_model.LinearRegression(river.optim.SGD(lr=0.00025)),
    )
    online_metric = river.metrics.SMAPE()

    online_predicted = list()
    for xi, yi in river.stream.iter_array(X_online, y_online):
        # Predict before learning so the sample is still unseen when scored.
        yi_pred = online_model.predict_one(xi)
        online_metric.update(yi[0], yi_pred)
        online_predicted.append(yi_pred)
        if verbose:
            print(f"ground-truth {yi[0]} | predicted {yi_pred}")
        online_model.learn_one(xi, yi[0])

    # Score against the first output step only — the value the model was
    # actually trained on. A plain reshape to (n_samples,) fails as soon as
    # there is more than one output step per sample.
    # NOTE(review): assumes y_online has samples on its first axis — confirm
    # against prep.split_sequence.
    y_first_step = y_online.reshape(y_online.shape[0], -1)[:, 0]
    y_predicted = np.ravel(online_predicted)

    data = {
        "y_real": y_first_step,
        "y_pred": y_predicted,
        "metric": online_metric
    }

    metrics = calculate_metrics(y_first_step, y_predicted)

    with open(f"./results/online_default_in{n_input_size}_out{n_output_size}_t{target.replace('->', '-')}.pkl", "wb") as f:
        pickle.dump(metrics, f)

    return data, metrics

In [None]:
def train_incremental_greedy_regressor(n_input_size: int, n_output_size: int, target: str, verbose: bool = False):
    """Run a test-then-train evaluation of an epsilon-greedy model selector.

    The selector is a river ``EpsilonGreedyRegressor`` over five
    ``LinearRegression`` models that differ only in their SGD learning rate,
    placed behind a ``StandardScaler``. Samples come from
    ``prep.split_sequence`` applied to the module-level ``df[target]``; each
    one is predicted and scored before it is used for learning.

    Parameters
    ----------
    n_input_size : int
        Forwarded to ``prep.split_sequence`` as ``n_input_steps``.
    n_output_size : int
        Forwarded to ``prep.split_sequence`` as ``n_output_steps``. Only the
        first output step (``yi[0]``) is predicted, learned and scored.
    target : str
        Column of ``df`` to model; "->" is replaced by "-" in the name of
        the results file.
    verbose : bool, default False
        When True, print ground truth and prediction for every sample.
        Off by default because one line per sample floods the notebook.

    Returns
    -------
    tuple
        ``(data, metrics)`` where ``data`` holds "y_real" (``y_online`` as
        returned by ``prep.split_sequence``), "y_pred" (list of predictions)
        and "metric" (the running river SMAPE object), and ``metrics`` is
        the dict produced by ``calculate_metrics``. ``metrics`` is also
        pickled under ``./results/``.
    """
    X_online, y_online = prep.split_sequence(
        sequence=df[target],
        n_input_steps=n_input_size,
        n_output_steps=n_output_size,
    )
    X_online = X_online.reshape((X_online.shape[0], X_online.shape[1]))

    # Candidate models: identical except for the optimizer learning rate.
    online_models = [
        river.linear_model.LinearRegression(optimizer=river.optim.SGD(lr=lr))
        for lr in [0.0001, 0.00025, 0.001, 1e-05, 0.01]
    ]

    online_model = (
        river.preprocessing.StandardScaler() |
        river.model_selection.EpsilonGreedyRegressor(
            online_models,
            epsilon=0.1,
            decay=0.001,
            burn_in=100,
            seed=1
        )
    )

    online_metric = river.metrics.SMAPE()

    online_predicted = list()
    for xi, yi in river.stream.iter_array(X_online, y_online):
        # Test the current model on the new "unobserved" sample
        yi_pred = online_model.predict_one(xi)

        # Update the running metric with the prediction and ground truth value
        online_metric.update(yi[0], yi_pred)
        online_predicted.append(yi_pred)
        if verbose:
            print(f"g-t {yi[0]} | pred {yi_pred}")

        # Train the model with the new sample
        online_model.learn_one(xi, yi[0])

    data = {
        "y_real": y_online,
        "y_pred": online_predicted,
        "metric": online_metric
    }

    # Score against the first output step only — the value the model was
    # actually trained on. A plain reshape to (n_samples,) fails as soon as
    # there is more than one output step per sample.
    # NOTE(review): assumes y_online has samples on its first axis — confirm
    # against prep.split_sequence.
    y_first_step = y_online.reshape(y_online.shape[0], -1)[:, 0]
    metrics = calculate_metrics(y_first_step, np.ravel(online_predicted))

    with open(f"./results/online_greedy_in{n_input_size}_out{n_output_size}_t{target.replace('->', '-')}.pkl", "wb") as f:
        pickle.dump(metrics, f)

    return data, metrics

In [None]:
def train_incremental_bagging_regressor(n_input_size: int, n_output_size: int, target: str, verbose: bool = False):
    """Run a test-then-train evaluation of a bagged online linear regression.

    The model is a river ``BaggingRegressor`` of five ``LinearRegression``
    learners behind a ``StandardScaler``. Samples come from
    ``prep.split_sequence`` applied to the module-level ``df[target]``; each
    one is predicted and scored before it is used for learning.

    Parameters
    ----------
    n_input_size : int
        Forwarded to ``prep.split_sequence`` as ``n_input_steps``.
    n_output_size : int
        Forwarded to ``prep.split_sequence`` as ``n_output_steps``. Only the
        first output step (``yi[0]``) is predicted, learned and scored.
    target : str
        Column of ``df`` to model; "->" is replaced by "-" in the name of
        the results file.
    verbose : bool, default False
        When True, print ground truth and prediction for every sample.

    Returns
    -------
    tuple
        ``(data, metrics)`` where ``data`` holds "y_real" (``y_online`` as
        returned by ``prep.split_sequence``), "y_pred" (list of predictions)
        and "metric" (the running river SMAPE object), and ``metrics`` is
        the dict produced by ``calculate_metrics``. ``metrics`` is also
        pickled under ``./results/``.
    """
    X_online, y_online = prep.split_sequence(
        sequence=df[target],
        n_input_steps=n_input_size,
        n_output_steps=n_output_size,
    )
    X_online = X_online.reshape((X_online.shape[0], X_online.shape[1]))

    online_metric = river.metrics.SMAPE()
    online_model = preprocessing.StandardScaler()
    # NOTE(review): only intercept_lr is set here, while the other trainers
    # in this notebook pass their learning rate through river.optim.SGD —
    # confirm that leaving the weight optimizer at its default is intended.
    online_model |= ensemble.BaggingRegressor(
        model=linear_model.LinearRegression(intercept_lr=0.00025),
        n_models=5,
        seed=42
    )

    online_predicted = list()
    for xi, yi in river.stream.iter_array(X_online, y_online):
        # Test the current online_model on the new "unobserved" sample
        yi_pred = online_model.predict_one(xi)

        # Update the running online_metric with the prediction and ground truth value
        online_metric.update(yi[0], yi_pred)
        online_predicted.append(yi_pred)
        if verbose:
            print(f"g-t {yi[0]} | pred {yi_pred}")

        # Train the online_model with the new sample
        online_model.learn_one(xi, yi[0])

    data = {
        "y_real": y_online,
        "y_pred": online_predicted,
        "metric": online_metric
    }

    # Score against the first output step only — the value the model was
    # actually trained on. A plain reshape to (n_samples,) fails as soon as
    # there is more than one output step per sample.
    # NOTE(review): assumes y_online has samples on its first axis — confirm
    # against prep.split_sequence.
    y_first_step = y_online.reshape(y_online.shape[0], -1)[:, 0]
    metrics = calculate_metrics(y_first_step, np.ravel(online_predicted))

    with open(f"./results/online_bagging_forest_regressor_in{n_input_size}_out{n_output_size}_t{target.replace('->', '-')}.pkl", "wb") as f:
        pickle.dump(metrics, f)

    return data, metrics

In [None]:
# Experiment grid: values for the trainers' n_input_size, n_output_size and
# target arguments.
list_n_input_steps = [25]
list_n_output_steps = [1]
list_targets = ["5->8", "8->5", "5->12", "8->12"]

# Sweep over the grid — currently disabled. To run it, uncomment the loop and
# keep exactly one trainer call active.
# NOTE(review): train_incremental_rfr_regressor is not defined in the cells
# visible here — confirm it exists before enabling that line.
# for n_input_steps in list_n_input_steps:
#     for n_output_steps in list_n_output_steps:
#         for target in list_targets:
#             payload, data_metrics = train_incremental_regressor(n_input_steps, n_output_steps, target)
#             # payload, data_metrics = train_incremental_greedy_regressor(n_input_steps, n_output_steps, target)
#             # payload, data_metrics = train_incremental_rfr_regressor(n_input_steps, n_output_steps, target)
#             # payload, data_metrics = train_incremental_bagging_regressor(n_input_steps, n_output_steps, target)