This notebook tests the functions defined in `simulate_module.py`.

In [1]:
from simulate_module import *

import pandas as pd
from pathlib import Path
import numpy as np
np.random.seed(1234)


In [2]:
# Simulate noisy observations for every task.
n_tasks = 15
np.random.seed(1234)  # seed again here (the import cell also seeds with 1234)
f, betas, zs = random_functions(n_tasks, 6, 1, .1)

result = []
for i, fi in enumerate(f):
    # 100 inputs drawn uniformly from [0, 1) for this task
    x = np.random.uniform(0, 1, 100)
    f_clean = fi(x)
    # response = task function plus Gaussian noise (sd 0.1)
    y_noisy = fi(x) + np.random.normal(0, .1, len(x))
    result.append(dict(task=i, x=x, f=f_clean, y=y_noisy))

# Stack the per-task frames into one long table; reset_index() (without
# drop=True) keeps the old within-task row number as an "index" column.
frames = [pd.DataFrame(record) for record in result]
data_df = pd.concat(frames).reset_index()


# One row per task: [task id, cluster label, beta coefficients...].
# NOTE: np.hstack produces a single array with one common dtype, so the task
# and cluster columns take the dtype of `betas` (presumably float — confirm).
betas_df = np.hstack([np.arange(n_tasks)[:, np.newaxis], np.array(zs)[:, np.newaxis], betas])
betas_df = pd.DataFrame(betas_df)
betas_df.columns = ["task", "cluster"] + [f"beta{i}" for i in range(betas.shape[1])]
# relationship between tasks (bandits) and their original clusters:
# d maps each cluster label to the list of tasks that belong to it.
# A fresh list is created per cluster (dict.fromkeys(keys, []) would make
# every key share the *same* list object), and tasks are appended in place
# instead of copying the list on every step.
d = {}
for k, v in zip(betas_df.cluster, betas_df.task):
    d.setdefault(k, []).append(v)

# Column-oriented copy of the data: {column: {row_index: value}}.
data_dict = data_df.to_dict()

# Add key "cluster" to `data_dict`. Unlike the other columns (dicts keyed by
# row index) this one is a plain list, filled in row order.
cluster_column = []
data_dict["cluster"] = cluster_column

task_column = data_dict["task"]
for task in task_column.values():
    # presumably get_key returns the cluster whose task list contains `task`,
    # or a sentinel string when nothing matches — confirm in simulate_module
    cluster = get_key(d, task)
    if cluster == "There is no such key":
        # Report the offending task and stop; the "cluster" list is then
        # shorter than the other columns.
        print("task = " + str(task))
        break
    cluster_column.append(cluster)

In [17]:
# Scratch check: type of the literal 0.33 (the same literal used as the
# default `test_size` of subset_data).
type(0.33)

float

In [48]:
def subset_data(data_dict, key_name="task", key_value=0, test_size=0.33):
    """
    Subset the data by the value of a key and build a design matrix.

    Parameters
    ---
    data_dict: dict
        the dictionary one wants to subset; it must hold the columns
        "task", "x" and "y" in addition to `key_name`. The `key_name`
        column may be a list or a dict of values; the other columns are
        indexed by row position, so a list or a dict keyed 0..n-1 works
    key_name: str
        the key one wants to subset on
    key_value: list / int / str
        the value of the key desirable in the output subset; a list keeps
        every row whose value is contained in the list
    test_size: float
        how to split the resulting subset; if set to zero, then the output
        won't be split

    Returns
    ---
    If `test_size == 0`:
        X: np.ndarray of shape (n_selected, 2), a column of ones next to x
        y: np.ndarray of shape (n_selected,)
        tasks: list with the task of every selected row
    Otherwise:
        X_train, X_test, y_train, y_test as returned by `train_test_split`
        with `random_state = 123` (the task list is not returned)
    """
    column = data_dict[key_name]
    values = column if isinstance(column, list) else list(column.values())

    # Row positions whose key matches. No arithmetic is done on the values,
    # so string-valued keys and empty columns are handled as well.
    if isinstance(key_value, list):
        idx_task = [i for i, v in enumerate(values) if v in key_value]
    else:
        idx_task = [i for i, v in enumerate(values) if v == key_value]

    tasks = [data_dict['task'][i] for i in idx_task]

    # Design matrix: intercept column of ones followed by x.
    x = [data_dict['x'][i] for i in idx_task]
    X = np.array([np.ones(len(idx_task)), np.array(x)]).T

    y = np.array([data_dict['y'][i] for i in idx_task])

    if test_size == 0:
        return X, y, tasks

    # NOTE: train_test_split is not imported in this cell; it has to be in
    # scope already (presumably scikit-learn's — confirm).
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=test_size,
                                                        random_state=123)
    return X_train, X_test, y_train, y_test

In [49]:
# Pool the rows of tasks 0 and 1; test_size=0 skips the train/test split, so
# the design matrix, targets and task ids come back unsplit.
X, y, tasks = subset_data(
    data_dict,
    key_name="task",
    key_value=[0, 1],
    test_size=0,
)