In [10]:
from datasets import (
    get_dataset_config_names,
    concatenate_datasets,
    Dataset,
    load_dataset,
    Image,
)
import PIL
import io
from huggingface_hub import login
import os
import pandas as pd


# Build the flat "raw" metric-evaluation dataset from every configuration of
# the evaluation repository and publish it to the Hub.
SOURCE_REPO = "CharlyR/varbench-evaluation"
TARGET_REPO = "CharlyR/varbench-metric-evaluation"

all_configs = get_dataset_config_names(SOURCE_REPO)

all_datasets: list[Dataset] = []

for config in all_configs:
    conf_ds = load_dataset(SOURCE_REPO, config, split="tikz")
    # Tag every row with the configuration it was loaded from, so the origin
    # is still known once all configurations are merged into one dataset.
    all_datasets.append(conf_ds.add_column("config", [config] * len(conf_ds)))

# Works because all the configs currently share the same metric columns;
# might eventually need to be adapted when more metrics are computed.
concat_datasets = concatenate_datasets(all_datasets)

concat_df: pd.DataFrame = concat_datasets.to_pandas()

# Columns that are exploded together: every "Metric" column that is not a
# "best" aggregate, plus the prediction/result columns listed explicitly.
# NOTE(review): presumably each of these holds one list element per generated
# prediction, all with the same length within a row -- confirm upstream.
columns_to_explode = [
    col_name
    for col_name in concat_df.columns
    if "Metric" in col_name and "best" not in col_name
] + ["images_result", "original_predictions", "predictions", "predictions_patches"]

concat_df = concat_df.explode(columns_to_explode)

# Drops every row that has a missing value in ANY column.
# NOTE(review): looks intended to discard predictions without a result/score;
# confirm that no other column is legitimately nullable.
concat_df = concat_df.dropna()

# After explode/dropna the index is no longer a RangeIndex. With the default
# `preserve_index=None`, `from_pandas` would store it as an extra
# `__index_level_0__` column and that column would be pushed to the Hub.
concat_datasets: Dataset = Dataset.from_pandas(concat_df, preserve_index=False)

# Declare the image columns as Image features so they are decoded on access.
concat_datasets = (
    concat_datasets.cast_column("image_solution", Image(decode=True))
    .cast_column("images_result", Image(decode=True))
    .cast_column("image_input", Image(decode=True))
)

# Last expression of the cell: the upload result is shown as the cell output.
concat_datasets.push_to_hub(TARGET_REPO, config_name="raw")

Map: 100%|██████████| 221/221 [00:00<00:00, 11478.01 examples/s]/s]
Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 212.27ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:01<00:00,  1.99s/it]


CommitInfo(commit_url='https://huggingface.co/datasets/CharlyR/varbench-metric-evaluation/commit/aedf93ec1f46fc48cfd592a774daf407df8b2bfa', commit_message='Upload dataset', commit_description='', oid='aedf93ec1f46fc48cfd592a774daf407df8b2bfa', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CharlyR/varbench-metric-evaluation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CharlyR/varbench-metric-evaluation'), pr_revision=None, pr_num=None)

In [14]:
# WARNING: Removes the existing "treated" config by pushing an empty one.

# Zero-row frame with the same columns as `concat_df`; kept in case later
# cells refer to it.
treated_df = concat_df.iloc[:0]  # Keeps the columns but removes all rows

# Build the empty dataset by selecting no rows from `concat_datasets` instead
# of converting the empty frame: an empty pandas frame contains no values from
# which Arrow could infer the types of its object columns, whereas `select`
# keeps the typed features of `concat_datasets` (including its Image columns).
treated_dataset = concat_datasets.select([])
treated_dataset.push_to_hub("CharlyR/varbench-metric-evaluation", config_name="treated")

Creating parquet from Arrow format: 0ba [00:00, ?ba/s]:00<?, ?it/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  3.83it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/CharlyR/varbench-metric-evaluation/commit/4ce38520d688f618112921197159973b742d4038', commit_message='Upload dataset', commit_description='', oid='4ce38520d688f618112921197159973b742d4038', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/CharlyR/varbench-metric-evaluation', endpoint='https://huggingface.co', repo_type='dataset', repo_id='CharlyR/varbench-metric-evaluation'), pr_revision=None, pr_num=None)

In [6]:
# Display the feature schema (column name -> feature type) of `concat_datasets`.
concat_datasets.features

{'id': Value(dtype='string', id=None),
 'code': Value(dtype='string', id=None),
 'instruction': Value(dtype='string', id=None),
 'result_description': Value(dtype='string', id=None),
 'difficulty': Value(dtype='string', id=None),
 'patch': Value(dtype='string', id=None),
 'code_solution': Value(dtype='string', id=None),
 'image_solution': {'bytes': Value(dtype='binary', id=None),
  'path': Value(dtype='null', id=None)},
 'image_input': {'bytes': Value(dtype='binary', id=None),
  'path': Value(dtype='null', id=None)},
 'original_predictions': Value(dtype='string', id=None),
 'predictions': Value(dtype='string', id=None),
 'image_result_indexes': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'images_result': {'bytes': Value(dtype='binary', id=None),
  'path': Value(dtype='null', id=None)},
 'parsing_score': Value(dtype='float64', id=None),
 'compiling_score': Value(dtype='float64', id=None),
 'predictions_patches': Value(dtype='string', id=None),
 'LPIPSMetric': V

In [12]:
# Show which configurations still contain the sample "donkey_higher_mane".
concat_df.loc[concat_df["id"] == "donkey_higher_mane", "config"]


0      simpleLLM_benchmark_deepseekr1distillllama70b_...
50      simpleLLM_benchmark_llama3.18binstant_pk_1_t_0.7
100    simpleLLM_benchmark_llama3.370bversatile_pk_1_...
150         simpleLLM_benchmark_llama370b8192_pk_1_t_0.7
200      simpleLLM_benchmark_mixtral8x7b32768_pk_1_t_0.7
Name: config, dtype: object