In [None]:
import sys, os
# Make the project's feature modules (../src/features) importable from this notebook.
# Guarded so that re-running the cell does not append the same entry again.
_features_dir = os.path.abspath(os.path.join('..', 'src', 'features'))
if _features_dir not in sys.path:
    sys.path.append(_features_dir)

In [None]:
# Utility imports
import pickle
from preprocessing import get_list_of_datapaths, load_data, make_train_test_datasets, split_sequence
from metrics import calculate_metrics

# Math and matrix manipulation imports
import numpy as np
import pandas as pd

# Graphing imports
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning imports
import tensorflow as tf
import river
from river import ensemble
from river import linear_model
from river import metrics
from river import optim
from river import preprocessing
from river import tree
from sklearn.metrics import mean_absolute_percentage_error, mean_absolute_error

In [None]:
# Ask TensorFlow to allocate GPU memory on demand instead of reserving it all up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured on every GPU before the runtime
        # initializes the devices.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Listing logical devices initializes the runtime, so it must happen only
        # after the loop; doing it inside the loop made set_memory_growth fail
        # for every GPU after the first one.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), 'Physical GPUs,', len(logical_gpus), 'Logical GPUs')
    except RuntimeError as e:
        # Raised when memory growth is changed after the GPUs were initialized.
        print(e)

In [None]:
# Input data location and output directory for experiment results.
DATA_PATH = "../datasets/traffic/"
RESULTS_PATH = "../results"
os.makedirs(RESULTS_PATH, exist_ok=True)
# `os.join` does not exist; path joining lives in `os.path`.
os.makedirs(os.path.join(RESULTS_PATH, "classical"), exist_ok=True)

# Collect the data files under DATA_PATH (sort=True requested) and load them into `df`.
list_of_datafiles = get_list_of_datapaths(DATA_PATH, sort=True)
df = load_data(list_of_datafiles)

# Split at split_point=20000 into the train and test frames used in the cells below.
train_df, test_df = make_train_test_datasets(
    df,
    split_point=20000,
)

In [None]:
# Summary statistics of the full dataset (before the train/test split).
df.describe()

In [None]:
# First rows of the training split.
train_df.head()

In [None]:
# First rows of the test split.
test_df.head()

In [None]:
# Side-by-side box plots of the train (left) and test (right) frames.
fig, axs = plt.subplots(ncols=2, figsize=(15, 7))
train_ax, test_ax = axs

sns.boxplot(data=train_df, ax=train_ax)
train_ax.set_title('Train dataset')

sns.boxplot(data=test_df, ax=test_ax)
test_ax.set_title('Test dataset')

In [None]:
def plot_data(target: str):
    """Plot one column of the train, test and full datasets and save the figure.

    Draws three stacked line plots of column ``target`` taken from the
    notebook-level ``train_df``, ``test_df`` and ``df`` (in that order), each
    against the row position ``0..len-1``, and saves the figure in the current
    working directory.

    Parameters
    ----------
    target : str
        Name of the column to plot, e.g. ``'5->8'``. The ``'>'`` character is
        stripped when building the output file name (``plot_5-8`` for
        ``'5->8'``; no extension is given explicitly).
    """
    panels = (
        ("Train", train_df),
        ("Test", test_df),
        ("Full", df),
    )
    fig, axs = plt.subplots(ncols=1, nrows=len(panels), figsize=(22, 10))

    for ax, (label, frame) in zip(axs, panels):
        sns.lineplot(data=frame, x=np.arange(frame.shape[0]), y=target, ax=ax)
        ax.set_title(f"{label} dataset visualization {target}")
        # Set after plotting so every panel ends up with the same y label.
        ax.set(ylabel="Transfer value in node")

    # Compute the layout only once titles and labels exist; calling it on the
    # empty axes leaves no room for the text that is added afterwards.
    fig.tight_layout()
    fig.savefig(f"plot_{target.replace('>', '')}")

In [None]:
# Plot column '5->8' for the train, test and full datasets (also saves the figure).
plot_data('5->8')

In [None]:
# Plot column '8->5' for the train, test and full datasets (also saves the figure).
plot_data('8->5')

In [None]:
# Plot column '5->12' for the train, test and full datasets (also saves the figure).
plot_data('5->12')

In [None]:
# Plot column '8->12' for the train, test and full datasets (also saves the figure).
plot_data('8->12')

In [None]:
# Summary statistics of the training split.
train_df.describe()

In [None]:
# Summary statistics of the test split.
test_df.describe()

In [None]:
def run_online_experiment(df, n_input_size, n_output_size, target, model_type):
    """Run one predict-then-learn online regression experiment and pickle the result.

    The ``target`` column of ``df`` is turned into (input window, output window)
    pairs with ``split_sequence``. The pairs are streamed one at a time: the
    model first predicts, the prediction is scored with SMAPE, and only then
    the model learns from the sample. Only the first output step (``yi[0]``)
    is used for scoring and learning.

    Parameters
    ----------
    df
        Data container indexed by column name (``df[target]`` is read).
    n_input_size
        Forwarded to ``split_sequence`` as ``n_input_steps``.
    n_output_size
        Forwarded to ``split_sequence`` as ``n_output_steps``.
    target : str
        Column to forecast. ``'->'`` is replaced by ``'-'`` in the file name.
    model_type : str
        Selects the river model:

        * ``"default"`` - ``StandardScaler`` followed by ``LinearRegression``
          optimized with ``SGD(lr=0.3)``.
        * ``"bagging_regressor"`` - ``StandardScaler`` followed by a
          ``BaggingRegressor`` of 5 ``LinearRegression(intercept_lr=0.3)``
          models (``seed=42``).
        * ``"greedy"`` - ``StandardScaler`` followed by an
          ``EpsilonGreedyRegressor`` choosing between four linear models
          with different SGD learning rates (``seed=1``).

        Any other value is treated like ``"greedy"`` and triggers a warning,
        because the result file is still named after ``model_type``.

    Returns
    -------
    None
        The result dict ``{"y_real", "y_pred", "metric"}`` is pickled to
        ``RESULTS_PATH/classical/online_<model_type>_in<n_input_size>_out<n_output_size>_t<target>.pkl``.
    """
    import warnings

    # Imported explicitly: attribute access such as `river.compose` only works
    # when the submodule has already been imported somewhere else.
    from river import compose, model_selection, stream

    X_online, y_online = split_sequence(
        sequence=df[target],
        n_input_steps=n_input_size,
        n_output_steps=n_output_size,
    )
    # Keep exactly two axes (samples, steps); presumably this drops a trailing
    # singleton axis returned by split_sequence - TODO confirm.
    X_online = X_online.reshape((X_online.shape[0], X_online.shape[1]))

    if model_type == "default":
        online_model = compose.Pipeline(
            preprocessing.StandardScaler(),
            linear_model.LinearRegression(optim.SGD(lr=0.3)),
        )
    elif model_type == "bagging_regressor":
        online_model = preprocessing.StandardScaler() | ensemble.BaggingRegressor(
            model=linear_model.LinearRegression(intercept_lr=0.3),
            n_models=5,
            seed=42,
        )
    else:
        if model_type != "greedy":
            # Kept as a fallback so existing callers keep working, but make the
            # substitution visible instead of silently mislabeling the results.
            warnings.warn(
                f"Unknown model_type {model_type!r}; "
                "falling back to the 'greedy' configuration.",
                stacklevel=2,
            )
        candidate_models = [
            linear_model.LinearRegression(optimizer=optim.SGD(lr=lr))
            for lr in [0.0001, 0.001, 1e-05, 0.01]
        ]
        online_model = preprocessing.StandardScaler() | model_selection.EpsilonGreedyRegressor(
            candidate_models,
            epsilon=0.1,
            decay=0.001,
            burn_in=100,
            seed=1,
        )

    online_metric = metrics.SMAPE()
    online_predicted = []
    for xi, yi in stream.iter_array(X_online, y_online):
        # Predict before learning so each sample is scored while still unseen.
        yi_pred = online_model.predict_one(xi)
        online_metric.update(yi[0], yi_pred)
        online_predicted.append(yi_pred)
        online_model.learn_one(xi, yi[0])

    data = {
        "y_real": y_online,
        "y_pred": online_predicted,
        "metric": online_metric,
    }

    # Join with os.path so the file lands inside RESULTS_PATH/classical; plain
    # string concatenation produced "../resultsclassical/...".
    output_file = os.path.join(
        RESULTS_PATH,
        "classical",
        f"online_{model_type}_in{n_input_size}_out{n_output_size}_t{target.replace('->', '-')}.pkl",
    )
    with open(output_file, "wb") as f:
        pickle.dump(data, f)

In [None]:
# Experiment grid: every model type x target column x input window size,
# always forecasting a single output step.
df_targets = ["5->8", "8->5", "5->12", "8->12"]
in_sizes = [1, 2, 5, 10, 25, 50, 100]
# NOTE(review): run_online_experiment has no random-forest model; any name other
# than "default"/"bagging_regressor" runs the epsilon-greedy configuration, so
# "random_forest_regressor" repeats the "greedy" runs under a different file
# name. Confirm which model was intended here.
exp_models = ["greedy", "random_forest_regressor"]

experiment_grid = [
    (model, target, in_size)
    for model in exp_models
    for target in df_targets
    for in_size in in_sizes
]
for model, target, in_size in experiment_grid:
    run_online_experiment(df, in_size, 1, target, model)