In [18]:
from datasets import (
    get_dataset_config_names,
    concatenate_datasets,
    Dataset,
    load_dataset,
    Image,
)
import PIL
import io
from huggingface_hub import login
import os
import pandas as pd

# Load one evaluation configuration ("tikz" split), keep only the listed
# columns, then narrow the dataset down to the single example at position 5.
conf_ds = (
    load_dataset(
        "CharlyR/varbench-evaluation",
        "simpleLLM_benchmark_deepseekr1distillllama70b_pk_1_t_0.7",
        split="tikz",
    )
    .select_columns(
        [
            "code_solution",
            "id",
            "images_result",
            "image_result_indexes",
            "predictions_patches",
            "predictions",
            "ImageEqualityMetric",
            "TemplateMetric",
        ]
    )
    .select([5])  # a one-row dataset containing only example index 5
)

In [19]:

from datasets import (
    get_dataset_config_names,
    concatenate_datasets,
    Dataset,
    load_dataset,
    Image,
)
import PIL
import io
from huggingface_hub import login
import os
import pandas as pd

def _extend_metric_computations(dataset: Dataset) -> Dataset:
    """Pad image-dependent columns with ``None`` where no image was produced.

    Image-based metrics are only computed for the generated codes that could
    be rendered into an image; ``image_result_indexes`` records, per row, the
    positions of the codes that did render. This function re-aligns the
    image-dependent columns with the full list of generated codes by placing
    each computed value at its recorded position and ``None`` elsewhere.

    Columns touched:
      * ``images_result`` -- always remapped to ``len(predictions_patches)``.
      * every column whose name contains ``"Metric"`` and that has at least
        one per-solution list shorter than ``predictions_patches`` -- each
        per-solution list is remapped to ``len(predictions)``.

    Args:
        dataset: dataset exposing ``column_names``, column access by name and
            ``map``; it must contain ``predictions_patches``, ``predictions``,
            ``code_solution``, ``image_result_indexes`` and ``images_result``.

    Returns:
        The dataset returned by the successive ``map`` calls.

    Note:
        ``images_result`` is remapped unconditionally, so calling this twice
        on the same data re-applies the index mapping to already padded lists
        and misplaces values. Call it once per freshly loaded dataset.
    """

    metrics_names = [name for name in dataset.column_names if "Metric" in name]
    # Read once instead of once per metric column.
    patches_column = dataset["predictions_patches"]
    # Named "potential" because when every image compiled without error no
    # metric list is shorter than its predictions and nothing needs padding.
    potential_image_metrics_names = [
        name
        for name in metrics_names
        if any(
            len(val) < len(parsed)
            for row, parsed in zip(dataset[name], patches_column)
            for val in row
        )
    ]

    def _ext_none(row, col_name: str):
        "Extends the flat list in `col_name` with Nones at unreferenced indexes"
        initial = [None] * len(row["predictions_patches"])
        for index, ar_value in zip(row["image_result_indexes"], row[col_name]):
            initial[index] = ar_value
        row[col_name] = initial
        return row

    def _ext_none_metric(row, col_name: str):
        "Extends each per-solution list in `col_name` with Nones at unreferenced indexes"
        # NOTE(review): the width comes from "predictions" while detection above
        # and `_ext_none` use "predictions_patches" -- presumably both have the
        # same length; confirm.
        width = len(row["predictions"])
        # One independent inner list per code solution. `[[None] * w] * h`
        # would alias a single inner list h times, so every write below would
        # show up in all solutions at once.
        initial = [[None] * width for _ in range(len(row["code_solution"]))]
        for ind, sub_eval in enumerate(row[col_name]):
            for index, ar_value in zip(row["image_result_indexes"], sub_eval):
                initial[ind][index] = ar_value
        row[col_name] = initial
        return row

    dataset = dataset.map(_ext_none, fn_kwargs={"col_name": "images_result"})
    for metric_name in potential_image_metrics_names:
        dataset = dataset.map(_ext_none_metric, fn_kwargs={"col_name": metric_name})
    return dataset

# Pad images_result and the image-based metric columns with None placeholders.
# NOTE: not idempotent -- images_result is remapped unconditionally, so re-running
# this cell on the already padded conf_ds misplaces values. Reload conf_ds first.
conf_ds = _extend_metric_computations(conf_ds)

Map: 100%|██████████| 1/1 [00:00<00:00, 129.00 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 127.11 examples/s]


In [20]:
# Display the first row of conf_ds to inspect its columns.
conf_ds[0]

{'code_solution': ['\\documentclass[tikz,border=5]{standalone}\n\\usepackage[prefix=]{xcolor-material}\n\\tikzset{\nhalf clip/.code={\n\\clip (0, -256) rectangle (256, 256);\n},\ncolor alias/.code args={#1 as #2}{\\colorlet{#1}{#2}},\ncolors alias/.style={color alias/.list/.expanded={#1}},\nexecute/.code={#1},\non left/.style={.. on left/.style={#1}},\non right/.style={.. on right/.style={#1}},\n}\n\\newcommand\\reflect[2][]{\n\\begin{scope}[#1]\\foreach \\side in {-1, 1}{\\begin{scope}\n\\ifnum\\side=-1 \\tikzset{.. on left/.try}\\else\\tikzset{.. on right/.try}\\fi\n\\begin{scope}[xscale=\\side]#2\\end{scope}\n\\end{scope}}\\end{scope}}\n\\tikzset{\nbee/.pic={\n\\begin{scope}[x=3cm/480,y=3cm/480, rotate=-45, shift=(270:48)]\n\\reflect[\non left= {colors alias={body as BlueGrey800, stripes as Amber500}},\non right={colors alias={body as BlueGrey900, stripes as Amber700}, half clip},\nlower wing/.style={fill=BlueGrey200}, upper wing/.style={fill=BlueGrey50}, middle wing/.style={fill=Bl

In [None]:
from collections import Counter
# Sanity check: in every row, each list-valued column below must have as many
# entries as that row's `predictions_patches`.
# NOTE(review): `concat_df` and `computed_metrics_names` are not defined in the
# cells shown here -- they must already exist in the kernel before this runs.

# The set of columns to check does not depend on the row, so build it once
# instead of slicing the DataFrame on every iteration.
checked_columns = computed_metrics_names + [
    "images_result",
    "predictions",
    "predictions_patches",
    "passk_index",
]

for idx, row in concat_df.iterrows():
    # Compute lengths for each checked column in the row
    lengths = {col: len(row[col]) for col in checked_columns}
    # The expected length is the number of prediction patches in this row
    common_length = len(row["predictions_patches"])
    # Report columns where the array length deviates
    for col, arr_length in lengths.items():
        if arr_length != common_length:
            print(f"Row {idx}: Column '{col}' length {arr_length} (expected {common_length}).")