This notebook is meant to generate a Hugging Face dataset containing all of the required data to run the benchmark and to compute the RCRPS from samples.

In [None]:
from cik_benchmark import ALL_TASKS, TASK_NAME_TO_WEIGHT
from cik_benchmark.metrics.constraints import ListConstraint, MaxConstraint, MinConstraint, VariableMaxConstraint
from cik_benchmark.metrics.scaling_cache import DefaultScalingCache

from datasets import Dataset, DatasetDict, Features, Value, Sequence

In [None]:
def roi_to_list(region_of_interest, length=None):
    """Expand a region of interest into an explicit list of integer indices.

    Args:
        region_of_interest: ``None`` (no region of interest), a ``slice``, or a
            ``list`` of indices.
        length: Optional length of the sequence the slice applies to. When
            given, the slice is resolved with ``slice.indices(length)``, so
            open-ended and negative bounds become concrete non-negative
            indices. When omitted, explicit bounds are passed to ``range``
            unchanged, and open-ended bounds are filled in only where that is
            unambiguous.

    Returns:
        A new list of indices; empty when ``region_of_interest`` is ``None``.

    Raises:
        ValueError: If ``region_of_interest`` has an unsupported type, or if it
            is a slice whose open-ended bound cannot be resolved without
            ``length``.
    """
    if region_of_interest is None:
        return []

    if isinstance(region_of_interest, slice):
        start, stop = region_of_interest.start, region_of_interest.stop
        # A missing (or zero) step has always been treated as 1 here.
        step = region_of_interest.step or 1

        if length is not None:
            return list(range(*slice(start, stop, step).indices(length)))

        unresolved = ValueError(
            f"Cannot expand open-ended region_of_interest {region_of_interest!r} without a length"
        )
        if step > 0:
            # An open stop needs the sequence length; an open start paired with
            # a negative stop would silently expand to an empty range.
            if stop is None or (start is None and stop < 0):
                raise unresolved
            if start is None:
                start = 0
        else:
            # Walking backwards: an open start needs the sequence length, and an
            # open stop only means "down to index 0" for a non-negative start.
            if start is None or (stop is None and start < 0):
                raise unresolved
            if stop is None:
                stop = -1
        return list(range(start, stop, step))

    if isinstance(region_of_interest, list):
        # Return a copy so the caller never shares the list object it passed in.
        return list(region_of_interest)

    raise ValueError(f"Unexpected type for region_of_interest: {type(region_of_interest)}")
    
def constraints_to_entries(metric_constraint):
    """Flatten a metric constraint into four plain, dataset-friendly values.

    Args:
        metric_constraint: A ``ListConstraint`` (its ``constraints`` members are
            inspected one by one), a single ``MinConstraint``, ``MaxConstraint``
            or ``VariableMaxConstraint``, or anything else (e.g. ``None``), in
            which case the defaults are returned.

    Returns:
        A 4-tuple ``(min, max, variable_max_index, variable_max_values)``:
        the ``threshold`` of the min constraint (default ``-inf``), the
        ``threshold`` of the max constraint (default ``+inf``), and the
        ``indices`` / ``thresholds`` of the variable max constraint converted
        with ``.tolist()`` (default: two empty lists).

    Note:
        Members of any other type are ignored, and if several members of the
        same type are present the last one wins.
    """
    lower_bound = float("-inf")
    upper_bound = float("inf")
    variable_index = []
    variable_values = []

    # Treat a lone constraint as a one-element collection so that a single loop
    # covers both the composite and the standalone case.
    if isinstance(metric_constraint, ListConstraint):
        members = metric_constraint.constraints
    else:
        members = [metric_constraint]

    for member in members:
        if isinstance(member, MinConstraint):
            lower_bound = member.threshold
        elif isinstance(member, MaxConstraint):
            upper_bound = member.threshold
        elif isinstance(member, VariableMaxConstraint):
            # .tolist() is called on both attributes, so they are presumably
            # numpy arrays - TODO confirm against VariableMaxConstraint.
            variable_index = member.indices.tolist()
            variable_values = member.thresholds.tolist()

    return lower_bound, upper_bound, variable_index, variable_values

In [None]:
# Seeds for which every task is instantiated; each (task, seed) pair becomes one dataset row.
SEEDS = range(1, 6)

# Schema of the exported dataset. It is the single source of truth for the column
# names and their order: the column buffers below are derived from it.
features = Features(
    name=Value(dtype='string', id=None),
    seed=Value(dtype='int64', id=None),
    weight=Value(dtype='string', id=None),
    context_sources=Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
    skills=Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
    background=Value(dtype='string', id=None),
    scenario=Value(dtype='string', id=None),
    constraints=Value(dtype='string', id=None),
    seasonal_period=Value(dtype='int64', id=None),
    past_time=Value(dtype='string', id=None),
    future_time=Value(dtype='string', id=None),
    metric_scaling=Value(dtype='float64', id=None),
    region_of_interest=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
    constraint_min=Value(dtype='float64', id=None),
    constraint_max=Value(dtype='float64', id=None),
    constraint_variable_max_index=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
    constraint_variable_max_values=Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None),
)


def last_column_to_json(frame):
    """Serialise only the last column of ``frame`` to JSON with ISO-formatted dates."""
    return frame[[frame.columns[-1]]].to_json(date_format="iso")


# One list per column, filled row by row in the loop below.
instances_dict = {column: [] for column in features}

for task in ALL_TASKS:
    # The scaling lookup takes the task class rather than an instance, so it does
    # not depend on the seed and is fetched once per task.
    metric_scaling = DefaultScalingCache(task)

    for seed in SEEDS:
        instance = task(seed=seed)

        (
            min_constraint,
            max_constraint,
            variable_max_constraint_index,
            variable_max_constraint_values,
        ) = constraints_to_entries(instance.metric_constraint)

        row = {
            "name": instance.name,
            "seed": seed,
            # Stored as its string representation to match the string-typed column.
            "weight": str(TASK_NAME_TO_WEIGHT[instance.name]),
            "context_sources": instance._context_sources,
            "skills": instance._skills,
            # Missing text fields are exported as empty strings rather than nulls.
            "background": instance.background or "",
            "scenario": instance.scenario or "",
            "constraints": instance.constraints or "",
            "seasonal_period": instance.seasonal_period,
            "past_time": last_column_to_json(instance.past_time),
            "future_time": last_column_to_json(instance.future_time),
            "metric_scaling": metric_scaling,
            "region_of_interest": roi_to_list(instance.region_of_interest),
            "constraint_min": min_constraint,
            "constraint_max": max_constraint,
            "constraint_variable_max_index": variable_max_constraint_index,
            "constraint_variable_max_values": variable_max_constraint_values,
        }

        # Fail loudly if the row and the schema ever drift apart, instead of
        # producing columns of unequal length.
        if row.keys() != features.keys():
            raise ValueError(
                f"Row columns do not match the dataset schema: {sorted(row.keys() ^ features.keys())}"
            )

        for column, value in row.items():
            instances_dict[column].append(value)

ds = Dataset.from_dict(instances_dict, features=features)

In [None]:
# Publish the dataset as the single "test" split of a private Hub repository.
hub_repo_id = "ServiceNow/context-is-key"
ds_dict = DatasetDict({"test": ds})
ds_dict.push_to_hub(hub_repo_id, private=True)

Also save a JSON version of the tasks to the repository (`../results/all_tasks.json`).

In [None]:
# Export the same rows to a JSON file, at a path relative to this notebook.
# NOTE(review): `Dataset.to_json` presumably writes JSON Lines by default - confirm
# that consumers of this file expect that layout.
json_export_path = "../results/all_tasks.json"
ds.to_json(json_export_path)