In [1]:
import sys
sys.path.append("..")

import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Analysis

In [2]:
dataset_path = "../data/phd/dataset.json"
save_path = "../data/phd/phd_sampled.json"
sample_size = None

with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = json.load(f)
if sample_size is not None and len(dataset) > sample_size:
    dataset = dataset[:sample_size]

df = pd.DataFrame(dataset)

# Drop css_description 
df = df[df["ccs_description"].isna()]
df = df.drop(columns=["ccs_description"])

print(f"Successfully load the Hallusion Bench dataset with: {len(df)} samples.")
df.head()

Successfully load the Hallusion Bench dataset with: 16844 samples.


Unnamed: 0,task,yes_question,no_question,context,image_id,hitem,subject,gt
0,counting,Are there three couches in the image?,Are there 2 couches in the image?,"{'icc': 'In a cozy living room setting, two co...",262207,2,couches,three
1,sentiment,Is the cat looking sad in the image?,Is the cat content in the image?,{'icc': 'The cat in the image appears to be co...,354493,content,cat,sad
2,counting,Are there four people in the image?,Are there exactly three people in the image?,"{'icc': 'In the image, there are exactly three...",102532,3,people,4
3,sentiment,Is the boy appearing happy in the image?,Is the boy displaying a mischievous expression...,"{'icc': 'In a delightful scene, the boy is dis...",99810,mischievous,boy,happy
4,positional,Is there an object located two hundred and thi...,Is there anything in front of the bike in the ...,{'icc': 'In the thrilling moment captured in t...,49683,0,front of the bike,two hundred and thirteen


In [3]:
task_counts = df["task"].value_counts()
task_counts

task
object        5736
attribute     3997
counting      2844
positional    2492
sentiment     1775
Name: count, dtype: int64

In [4]:
yes_answer_nunique = df["yes_question"].nunique()
no_answer_nunique = df["no_question"].nunique()

print("Yes answer unique count:", yes_answer_nunique)
print("No answer unique count:", no_answer_nunique)

Yes answer unique count: 6200
No answer unique count: 8326


## Dataset Construction

In [5]:
def sample_and_balance(group, sample_size=1500):
    n_sample = min(sample_size, len(group))
    
    # Create unique identifier excluding unhashable columns
    hashable_cols = ["yes_question", "no_question"]
    group["unique_id"] = range(len(group))  # Temporary unique ID
    
    # Sample maximizing unique yes/no question pairs
    unique_pairs = group.drop_duplicates(subset=hashable_cols)
    n_unique = min(n_sample, len(unique_pairs))
    group_sampled = unique_pairs.sample(n=n_unique, random_state=42)
    
    # Fill remaining if needed (no drop_duplicates to avoid dict error)
    remaining_needed = n_sample - len(group_sampled)
    if remaining_needed > 0:
        additional = group.sample(n=remaining_needed, random_state=42)
        group_sampled = pd.concat([group_sampled, additional]).reset_index(drop=True)
    group_sampled = group_sampled.drop(columns=["unique_id"], errors="ignore")
    
    # Balance yes/no labels
    n = len(group_sampled)
    yes_indices = np.random.choice(group_sampled.index, size=n//2, replace=False)
    no_indices = np.setdiff1d(group_sampled.index, yes_indices)
    
    group_sampled.loc[yes_indices, "question"] = group_sampled.loc[yes_indices, "yes_question"]
    group_sampled.loc[no_indices, "question"] = group_sampled.loc[no_indices, "no_question"]
    group_sampled["label"] = 0
    group_sampled.loc[yes_indices, "label"] = 1
    
    return group_sampled

In [6]:
df_sample = (df
             .groupby("task", group_keys=False)
             .apply(sample_and_balance)
             .reset_index(drop=True))

print(f"Shape: {df_sample.shape}")
print(f"Number of unique questions: {df_sample["question"].nunique()}")
print(f"Number of unique yes questions: {df_sample["yes_question"].nunique()}")
print(f"Number of unique no questoins: {df_sample["no_question"].nunique()}")
print("\n", df_sample["label"].value_counts(normalize=True).round(3))
print("\n", df_sample["task"].value_counts(normalize=True).round(3))
print("\n", df_sample.groupby("task")["label"].value_counts(normalize=True).round(3).unstack(fill_value=0))
df_sample.head()

Shape: (7500, 10)
Number of unique questions: 5163
Number of unique yes questions: 4154
Number of unique no questoins: 5440

 label
1    0.5
0    0.5
Name: proportion, dtype: float64

 task
attribute     0.2
counting      0.2
object        0.2
positional    0.2
sentiment     0.2
Name: proportion, dtype: float64

 label         0    1
task                
attribute   0.5  0.5
counting    0.5  0.5
object      0.5  0.5
positional  0.5  0.5
sentiment   0.5  0.5


  .apply(sample_and_balance)


Unnamed: 0,task,yes_question,no_question,context,image_id,hitem,subject,gt,question,label
0,attribute,Are the shirts in the image white?,Is any of the shirts in the image gray?,"{'icc': 'In a lively living room setting, two ...",452966,gray,shirts,white,Are the shirts in the image white?,1
1,attribute,Is the monitor screen rectangular in shape?,Is the monitor screen circular in shape?,{'icc': 'The monitor screen is circular in sha...,198641,circular,monitor screen,rectangle,Is the monitor screen rectangular in shape?,1
2,attribute,Is the vase made of the same material as the f...,Is the floor made of acrylic in the image?,{'icc': 'The floor in the image is made of acr...,494991,acrylic,floor,vase,Is the vase made of the same material as the f...,1
3,attribute,Is the woman wearing black in the image?,Is the woman wearing white in the image?,{'icc': 'The woman in the image is wearing whi...,10966,white,woman,black,Is the woman wearing white in the image?,0
4,attribute,Is the back wall made of brick in the image?,Is the back wall made of concrete in the image?,"{'icc': 'In a recent event, two men took the s...",50148,Concrete,back wall,brick,Is the back wall made of concrete in the image?,0


In [7]:
df_sample = df_sample.reset_index().rename(columns={"index": "id"})
output = df_sample.to_dict("records")

with open(save_path, "w") as f:
    json.dump(output, f, indent=4)