This notebook tests the functions in `simulate_module.py`.

In [1]:
from simulate_module import *

import pandas as pd
from pathlib import Path
import numpy as np
# Seed NumPy's global random number generator so repeated runs draw the same numbers.
np.random.seed(1234)


In [48]:
def subset_data(data_dict, key_name="task", key_value=0, test_size=0.33):
    """
    Subset the data by the value of a key and build a design matrix.

    Parameters
    ---
    data_dict: dict
        the dictionary one wants to subset; besides `key_name` it must
        contain the entries 'task', 'x' and 'y'
    key_name: str
        the key one wants to subset on
    key_value: list / int / str
        the value of the key desirable in the output subset; when a list is
        given, every observation whose value is in the list is kept
    test_size: float
        how to split the resulting subset; if set to zero, then the output
        won't be split

    Returns
    ---
    X, y, tasks
        returned when `test_size` is zero: `X` is an array whose first column
        is all ones and whose second column holds the selected 'x' values,
        `y` is an array of the selected 'y' values and `tasks` is a list of
        the selected 'task' entries
    X_train, X_test, y_train, y_test
        returned otherwise: the result of
        `train_test_split(X, y, test_size=test_size, random_state=123)`
    """
    column = data_dict[key_name]
    # A column that is not a list is treated as a mapping and only its values
    # are used, in iteration order.
    values = column if isinstance(column, list) else list(column.values())

    # Positions of the observations to keep. No arithmetic is done on the
    # values themselves, so string-valued keys work as well as integer ones.
    if isinstance(key_value, list):
        idx_task = [i for i, v in enumerate(values) if v in key_value]
    else:
        idx_task = [i for i, v in enumerate(values) if v == key_value]

    # NOTE(review): the entries below are indexed by position; if they are
    # mappings this presumably relies on their keys being 0..n-1 -- confirm.
    tasks = [data_dict['task'][i] for i in idx_task]
    x = [data_dict['x'][i] for i in idx_task]
    y = np.array([data_dict['y'][i] for i in idx_task])

    # Prepend a column of ones to the selected 'x' values.
    X = np.array([np.ones(len(idx_task)), np.array(x)]).T

    if test_size == 0:
        return X, y, tasks

    # Fixed random_state keeps the split identical from call to call.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=123
    )
    return X_train, X_test, y_train, y_test

In [49]:
# Keep the observations whose task is 0 or 1; test_size = 0 returns the
# subset without a train/test split.
X, y, tasks = subset_data(
    data_dict,
    key_name="task",
    key_value=[0, 1],
    test_size=0,
)