Testing experiments

In [1]:
from simulate_module import *

import pandas as pd
from pathlib import Path
import numpy as np
np.random.seed(1234)


In [2]:
# Noise scenarios keyed by name. Each value is [sigma_between, sigma_within]:
# element 0 is passed as sigma_between and the last element as sigma_within
# when the data are generated with random_functions.
sigma_setting = dict(
    high_bw=[10, .2],
    medium_bw=[1, .2],
    low_bw=[.5, .2],
)

In [126]:
# Experiment configuration read by the cells that follow
# (task count, output location, model type, target split size).
args = dict(
    n_tasks=15,
    conservative=True,
    target_test_size=0.4,
    model_type="lm",
    base_output_dir="test",
)

In [127]:
# Pick the model wrapper and the matching loss function for the configured
# model type.
if args["model_type"] == "lm":
    model_class = lm()
    loss_fn = mse
elif args["model_type"] == "nn":
    model_class = nn()
    loss_fn = torch.nn.MSELoss()
else:
    # Fail loudly: without this branch an unknown model type would leave
    # model_class / loss_fn undefined, or silently reuse stale values left in
    # the kernel by an earlier run of this cell.
    raise ValueError(
        f"Unsupported model_type {args['model_type']!r}; expected 'lm' or 'nn'."
    )

In [128]:
s = "high_bw"

# Output location:
#   <base_output_dir>/model_<model_type>/[conservative_]derived_data/<setting>
data_path = Path(args["base_output_dir"]) / ("model_" + args["model_type"]) / (
    "conservative_derived_data" if args["conservative"] else "derived_data"
)
working_path = data_path / s
working_path.mkdir(parents = True, exist_ok = True)

# generate data ------------------------------------------------
np.random.seed(1234)
f, betas, zs = random_functions(
    args["n_tasks"],
    6,
    sigma_between = sigma_setting[s][0],
    sigma_within = sigma_setting[s][-1],
)


def _simulate_task(task_id, task_fn):
    """Sample 100 inputs uniformly on [0, 1) for one task and return them with
    the task function's values and noisy observations (normal noise, sd 0.1)."""
    x = np.random.uniform(0, 1, 100)
    return {
        "task": task_id,
        "x": x,
        "f": task_fn(x),
        "y": task_fn(x) + np.random.normal(0, .1, len(x)),
    }


# Tasks are simulated in order so the draws from the global NumPy RNG follow
# the same sequence on every run.
result = [_simulate_task(i, fi) for i, fi in enumerate(f)]

# save data
data_df = pd.concat([pd.DataFrame(r) for r in result])
data_df.to_csv(working_path / "tasks.csv", index = False)
# reset_index() keeps the former per-task row index as an "index" column, so
# that column is present in data_dict but not in the CSV written above.
data_df = data_df.reset_index()

# One row per task: task id, cluster label, then the beta coefficients.
betas_df = pd.DataFrame(
    np.column_stack([np.arange(args["n_tasks"]), np.array(zs), betas]),
    columns = ["task", "cluster"] + [f"beta{i}" for i in range(betas.shape[1])],
)
betas_df.to_csv(working_path / "betas.csv", index = False)

data_dict = data_df.to_dict(orient = "list")

In [68]:
# Export the second column of val_x together with val_y for inspection.
# NOTE(review): val_x / val_y are only assigned further down in this notebook
# (the bandit-loop cell), so this cell raises NameError on Restart & Run All —
# confirm where it is meant to run.
(
    pd.DataFrame({"x": val_x[:, 1], "y": val_y})
    .to_csv(working_path / "subset_data.csv")
)

In [129]:
# Build the model inputs with task 5 as the target task and preprocessing
# enabled, then persist the returned "data_dict" entry next to the raw tasks.
input_data = prepare_input(
    data_dict,
    target_task = 5,
    target_test_size = args["target_test_size"],
    preprocess = True,
)
pd.DataFrame(input_data["data_dict"]).to_csv(
    working_path / "tasks_processed.csv",
    index = False,
)

  processed = (raw - raw.min(axis = 0)) / (raw.max(axis = 0) - raw.min(axis = 0))


In [140]:
class lm():
    """
    Linear regression model

    Thin wrapper around a ``LinearRegression`` estimator exposing the
    fit / evaluate / combine / save steps used by the training loop.
    NOTE(review): ``LinearRegression`` is presumably scikit-learn's (its import
    is not visible in this notebook) — confirm.
    """
    def __init__(self):
        self.model = LinearRegression()

    def initialize(self):
        """Replace the wrapped estimator with a fresh, unfitted one; return self."""
        self.model = LinearRegression()
        return self

    def prepare_data(self, x, y):
        """Return ``(x, y)``; an ``x`` with at most one dimension is expanded
        into a two-column matrix ``[ones, x]``, anything else passes through."""
        if len(x.shape) <= 1:
            x = np.array([np.ones(x.shape), np.array(x)]).T
        return x, y

    def fit(self, x_train, y_train, loss_fn = None):
        """Fit the wrapped estimator and return it. ``loss_fn`` is accepted but
        not used by this model."""
        self.model.fit(x_train, y_train)
        return self.model

    def evaluate(self, x_test, y_test, loss_fn):
        """Return ``loss_fn(y_test, y_hat)`` for predictions on ``x_test``."""
        y_hat = self.model.predict(x_test)
        return loss_fn(y_test, y_hat)

    def combine_with_old(self, model_old, decay_rate = .5):
        """Blend the current fitted parameters with those of ``model_old``.

        Each parameter becomes
        ``decay_rate * old + (1 - decay_rate) * new``. Returns the updated
        wrapped estimator.
        """
        weight_new = 1 - decay_rate
        self.model.coef_ = decay_rate * model_old.coef_ + weight_new * self.model.coef_
        # The intercept is stored separately from coef_ and is part of the
        # prediction, so it has to be blended as well; otherwise the combined
        # model silently keeps the new fit's intercept.
        if hasattr(model_old, "intercept_") and hasattr(self.model, "intercept_"):
            self.model.intercept_ = (decay_rate * model_old.intercept_
                                     + weight_new * self.model.intercept_)
        return self.model

    def pred(self, x_new):
        """Return the wrapped estimator's predictions for ``x_new``."""
        return self.model.predict(x_new)

    def save(self, path = ".", x_new = None, y_new = None, para = True):
        """Write model artefacts as CSV files under ``path`` (created if needed).

        ``coef.csv`` holds ``coef_`` when ``para is True``; ``fitted.csv`` holds
        the second column of ``x_new``, ``y_new`` and the predictions when
        ``x_new`` is given. Returns ``para``.
        """
        path = Path(path)
        path.mkdir(parents = True, exist_ok = True)
        if para is True:
            pd.DataFrame.from_dict({"coef": self.model.coef_}).to_csv(path / Path("coef.csv"))
        if x_new is not None:
            fitted = {"x": x_new[:, 1], "y": y_new, "y_hat": self.pred(x_new)}
            pd.DataFrame.from_dict(fitted).to_csv(path / Path("fitted.csv"))
        return para

In [141]:
# Settings for the bandit training loop.
model = lm()
n_it = 100        # number of bandit iterations
batch_size = 8    # source samples drawn (with replacement) per iteration
decay_rate = .5   # weight given to the previous model when combining
# Follow the experiment config: args["conservative"] already selected the
# output directory, so a separate hard-coded flag here would write results of
# one mode into the other mode's folder.
conservative = args["conservative"]

In [142]:
bandit_selects = [None]
# initialize hyperparameters
# Build one independent list per source task. dict.fromkeys(keys, [1]) would
# bind every key to the *same* list object, so any in-place update for one
# task would show up under all of them.
alpha = {task: [1] for task in input_data["source_task"]}
beta = {task: [1] for task in input_data["source_task"]}
pi = {task: [0] for task in input_data["source_task"]}

mod = model
val_x, val_y = mod.prepare_data(input_data["X_target_val"], input_data["y_target_val"])

# initialize model from target training data
X_current, y_current = mod.prepare_data(input_data["X_target_train"], input_data["y_target_train"])
mod.fit(X_current, y_current)
# Baseline validation loss before any source data is used.
l = mod.evaluate(val_x, val_y, loss_fn = loss_fn)
losses = [l]
# Keep the fitted estimator so the first loop iteration can blend with it.
model_old = mod.model

for t in range(n_it):
    # select bandit
    bandit_current, pi = get_bandit(input_data, alpha, beta, t, pi)
    bandit_selects.append(bandit_current)
    # set training data at this iteration: a random batch (drawn with
    # replacement) from the selected source task
    X_current, y_current, _ = subset_data(input_data["source_dict"],
                                          key_value = bandit_current,
                                          key_name = "task", test_size = 0)
    batch_id = random.choices(range(len(y_current)), k = batch_size)
    X_current, y_current = X_current[batch_id, :], y_current[batch_id]

    # Augment the source batch with the target *training* split. The
    # validation split is reserved for the evaluation below; training on it
    # would leak into the loss that is recorded and passed to
    # update_hyper_para.
    X_current = np.concatenate((X_current, input_data["X_target_train"]), axis = 0)
    y_current = np.concatenate((y_current, input_data["y_target_train"]), axis = 0)
    X_current, y_current = mod.prepare_data(X_current, y_current)
    # train model on a freshly initialised estimator
    mod = model.initialize()
    mod.fit(X_current, y_current, loss_fn = loss_fn)
    # combine parameters with previous model
    mod.combine_with_old(model_old, decay_rate = decay_rate)
    # evaluate model
    l = mod.evaluate(val_x, val_y, loss_fn = loss_fn)
    losses.append(l)
    model_old = mod.model
    # update bandit parameters
    if conservative:
        thres = 100000
    else:
        thres = avg_loss(bandit_selects, losses, bandit_current)
    alpha, beta = update_hyper_para(alpha, beta, t, losses,
                                    bandit_current,
                                    thres = thres
                                   )
    # Persist the fit on this iteration's training data and on the validation data.
    mod.save(path = working_path / ("current" + str(t)), x_new = X_current, y_new = y_current, para = True)
    mod.save(path = working_path / str(t), x_new = val_x, y_new = val_y, para = True)

In [135]:
# Persist the bandit state and loss history for this setting.
# NOTE(review): `pi` is passed for both of the last two arguments — confirm
# against save_files' signature that this is intended.
save_files(
    working_path,
    alpha,
    beta,
    losses,
    bandit_selects,
    pi,
    pi,
)