In [12]:
import os
import ijson
import glob
import re 

def natural_key(s):
    return [int(t) if t.isdigit() else t.lower() for t in re.split(r'(\d+)', s)]

root_dir = ["/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random", "/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary"]

simulation_roots = root_dir
list_simulations = []

for simulation_root in simulation_roots:
    pattern = os.path.join(simulation_root, '**', 'simulation.json')    

    print("Searching for simulation files with pattern:", pattern)        
    for sim_file in glob.glob(pattern, recursive=True):
        list_simulations.append(sim_file)

list_simulations.sort(key=natural_key)

print(f"Found {len(list_simulations)} simulation files.")


# a fare bene dovrei controllare se nell'oggetto della domanda ci sono oggetti nella simulazione con lo stesso nome

simulation_cache = {}
total_duplicates = 0
for sim_file in list_simulations:
    with open(sim_file, 'r') as f:
        # print(f"Processing simulation file: {sim_file}")
        # iterate over each key-value pair in "objects"
        duplicated_names = {}
        for key, obj in ijson.kvitems(f, 'objects'):
            name = obj.get('model', {})
            if name:
                if name not in duplicated_names:
                    duplicated_names[name] = 1
                else:
                    duplicated_names[name] += 1

        # Store the duplicated names for this simulation
        simulation_cache[sim_file] = duplicated_names
        for name, count in duplicated_names.items():
            if count > 1:
                total_duplicates += 1
                print(f"Simulation file {sim_file} has multiple objects with the same name '{name}': {count} instances.")

print('number of simulations with duplicated object names:')
print(total_duplicates)

Searching for simulation files with pattern: /data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random/**/simulation.json
Searching for simulation files with pattern: /data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/**/simulation.json
Found 1282 simulation files.
Simulation file /data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/2/c-1_no-2_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-71_20251103_115621/simulation.json has multiple objects with the same name 'JUNGLE_HEIGHT': 2 instances.
Simulation file /data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/3/c-1_no-3_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-71_20251103_122827/simulation.json has multiple objects with the same name 'JUNGLE_HEIGHT': 2 instances.
Simulation file /data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/3/c-1_no-3_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-91_20251103_1243

In [2]:
print(simulation_cache)

{'/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/1/c-1_no-1_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-0_20251102_163324/simulation.json': {'Pepsi_Max_Cola_Zero_Calorie_12_12_fl_oz_355_ml_cans_144_fl_oz_426_lt': 1}, '/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/1/c-1_no-1_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-1_20251102_163634/simulation.json': {'Dino_3': 1}, '/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/1/c-1_no-1_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-2_20251102_163727/simulation.json': {'Schleich_Therizinosaurus_ln9cruulPqc': 1}, '/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/1/c-1_no-1_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-3_20251102_163747/simulation.json': {'Mens_ASV_Billfish_Boat_Shoe_in_Tan_Leather_wmUJ5PbwANc': 1}, '/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationar

In [13]:
import json
questions_path = "/data0/sebastian.cavada/compositional-physics/tiny_vqa_deterministic/output/run_08_general/test_run_08_general_10K.json"

with open(questions_path, 'r') as f:
    questions = json.load(f)

In [14]:
gso_mapping['JUNGLE_HEIGHT']

{'name': 'foldable activity scroll for baby'}

In [15]:
names = [gso_mapping[key]['name'] for key, value in simulation_cache["/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random/2/c-1_no-2_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-88_20251103_010246/simulation.json"].items()]
print(names)

['boxed Trivial Pursuit board game', 'brown Timberland shoe']


In [22]:
count = 0
count_per_question_id = {}

for question in questions:
    simulation_id = question['simulation_id']
    if simulation_id in simulation_cache:
        # print(f"Checking simulation ID: {simulation_id}")

        # print(simulation_cache[simulation_id])

        names = [gso_mapping[key]['name'] for key, value in simulation_cache[simulation_id].items() if value > 1]
        if len(names) > 0:
            print(names)
            print(simulation_id)            

            for name in names:
                print(name)

                if name in question['question']:
                    print(f"Found matching name in question: {name}")
                    if question['question_id'] not in count_per_question_id:
                        count_per_question_id[question['question_id']] = 0
                    count_per_question_id[question['question_id']] += 1
                    count += 1
                    break

    else:
        print(f"Simulation ID {simulation_id} not found in simulation cache.")

print(f"Total count of matching names: {count}")
print("Percentage:", (count / len(questions)) * 100)
print("Count per question ID:", count_per_question_id)
print(len(count_per_question_id))

['light gray file organizer rack']
/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random/6/c-1_no-6_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-22_20251102_090457/simulation.json
light gray file organizer rack
['256-piece LEGO Fusion boxed set']
/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/3/c-1_no-3_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-91_20251103_124335/simulation.json
256-piece LEGO Fusion boxed set
['256-piece LEGO Fusion boxed set']
/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/3/c-1_no-3_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-91_20251103_124335/simulation.json
256-piece LEGO Fusion boxed set
['foldable activity scroll for baby']
/data0/sebastian.cavada/datasets/simulations_v3/dl3dv/random-cam-stationary/5/c-1_no-5_d-4_s-dl3dv-all_models-hf-gso_MLP-10_smooth_h-10-40_seed-71_20251103_135800/simulation.json
foldable activity scroll for baby
['foldable activity scrol

In [23]:
# thus the the check for duplicate objects might not be working  as intended
print(len(questions))

10000
