In [1]:
import os
import json
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Root folder of the GQA benchmark data; every other location below hangs off it.
data_dir = '/SSL_NAS/benchmark_data/GQA/'

# Scene-graph working folder, plus the two sub-folders that preprocess()
# fills with one CSV per image (node table / edge table).
path = os.path.join(data_dir, 'sceneGraphs')
path_nodes = path + '/nodes'
path_edges = path + '/edges'

# Question table (one row per question, keyed by image_id); the bare name on
# the last line makes the notebook render it.
data = pd.read_csv(
    os.path.join(data_dir, "questions.csv")
)
data

Unnamed: 0,q_id,image_id,question,answer,full_answer
0,14145991,369,Are the white blinds to the right of a clock?,yes,"Yes, the blinds are to the right of a clock."
1,16906415,2379051,Which kind of furniture is to the left of the ...,nightstand,The piece of furniture is a nightstand.
2,294499,1592605,Are there any bikes or cars?,yes,"Yes, there is a bike."
3,7125126,2414837,What is the man guiding?,bicycle,The man is guiding the bicycle.
4,13561987,2362341,What is the avocado on?,counter,The avocado is on the counter.
...,...,...,...,...,...
99995,8317994,2320400,Are there either life vests or cages?,no,"No, there are no cages or life vests."
99996,15436430,2357817,Do you see trains or windows?,yes,"Yes, there is a window."
99997,11475273,2410206,What kind of device is the man with the scarf ...,cell phone,The man is holding the cell phone.
99998,1879495,2315406,"Where is it, the bathroom or the dining room?",bathroom,It is the bathroom.


In [None]:
def textualize_graph(data):
    """Flatten one scene graph into node and edge records.

    Args:
        data: Scene-graph dict holding an ``'objects'`` mapping of
            object id -> object dict. Each object dict must provide
            ``'name'``, ``'x'``, ``'y'``, ``'w'`` and ``'h'``. It may provide
            ``'attributes'`` (iterable of str) and ``'relations'`` (iterable
            of dicts with ``'name'`` and ``'object'``, the latter being the
            id of the target object). A missing or ``None`` value for either
            optional key is treated as "none".

    Returns:
        A ``(nodes, edges)`` tuple:
          * ``nodes``: list of ``{'node_id': int, 'node_attr': str}`` where
            ``node_attr`` looks like
            ``'name: <name>; attribute: <a, b>; (x,y,w,h): (x, y, w, h)'``
            (the ``attribute`` part is omitted when there are no attributes).
          * ``edges``: list of ``{'src': int, 'edge_attr': str, 'dst': int}``.
        Node ids are the positions of the objects in ``data['objects']``.

    Raises:
        KeyError: if a required object field is missing or a relation points
            at an object id that is not present in ``data['objects']``.
    """
    objects = data['objects']
    # mapping from object id to index
    objectid2nodeid = {object_id: idx for idx, object_id in enumerate(objects)}
    nodes = []
    edges = []
    # `obj` rather than `object`, so the builtin is not shadowed.
    for object_id, obj in objects.items():
        node_id = objectid2nodeid[object_id]

        # nodes
        node_attr = f'name: {obj["name"]}'
        attributes = obj.get('attributes') or []
        if attributes:
            node_attr += '; attribute: ' + ', '.join(attributes)
        node_attr += '; (x,y,w,h): ' + str((obj['x'], obj['y'], obj['w'], obj['h']))
        nodes.append({'node_id': node_id, 'node_attr': node_attr})

        # edges
        for rel in obj.get('relations') or []:
            edges.append({
                'src': node_id,
                'edge_attr': rel['name'],
                'dst': objectid2nodeid[rel['object']],
            })

    return nodes, edges


def preprocess():
    """Export every training scene graph as a node CSV and an edge CSV.

    Reads ``train_sceneGraphs.json`` from the scene-graph folder ``path``,
    converts each image's graph with ``textualize_graph`` and writes
    ``<path_nodes>/<image_id>.csv`` (columns ``node_id, node_attr``) and
    ``<path_edges>/<image_id>.csv`` (columns ``src, edge_attr, dst``).
    Both output folders are created if missing; existing CSVs are overwritten.
    """
    # Derive the input location from `path` (same value as the previously
    # hard-coded string) and use a context manager so the handle is closed
    # as soon as the JSON has been parsed.
    with open(os.path.join(path, 'train_sceneGraphs.json')) as f:
        dataset = json.load(f)

    os.makedirs(path_nodes, exist_ok=True)
    os.makedirs(path_edges, exist_ok=True)

    for image_id, scene_graph in tqdm(dataset.items(), total=len(dataset)):
        nodes, edges = textualize_graph(scene_graph)
        # Explicit `columns` keeps the header row even when the list is empty,
        # so every image gets a readable CSV.
        pd.DataFrame(nodes, columns=['node_id', 'node_attr']).to_csv(f'{path_nodes}/{image_id}.csv', index=False)
        pd.DataFrame(edges, columns=['src', 'edge_attr', 'dst']).to_csv(f'{path_edges}/{image_id}.csv', index=False)

preprocess()

  0%|          | 0/74942 [00:00<?, ?it/s]

  0%|          | 0/46514 [00:00<?, ?it/s]

In [8]:
# scene_graphs = {}
# for image_id in tqdm(data['image_id'].unique()):
#     node_vocab = {}
#     node_df = pd.read_csv(f'{path_nodes}/{image_id}.csv')
#     triple_df = pd.read_csv(f'{path_edges}/{image_id}.csv')
#     scene_graph = ""
#     for idx, row in node_df.iterrows():
#         node_vocab[row['node_id']] = row['node_attr']
#     for idx, row in triple_df.iterrows():
#         src = node_vocab[row['src']].split(";")[0].split(":")[-1].strip()
#         dst = node_vocab[row['dst']].split(";")[0].split(":")[-1].strip()
#         scene_graph += f"{src} {row['edge_attr']} {dst}\n"
#     scene_graphs[str(image_id)] = scene_graph

def _scene_graph_to_text(node_df, triple_df):
    """Render one image's node and edge tables as a compact text block.

    The result is a ``node_id, node_attr`` section followed by a
    ``src, edge_attr, dst`` section, each terminated by an empty line.
    Only the object name is kept per node: the text of ``node_attr`` before
    the first ';' and, within that, after the last ':' (whitespace stripped).
    """
    parts = ["node_id, node_attr\n"]
    # Iterate over the columns directly instead of DataFrame.iterrows():
    # same values, but no per-row Series is built, and the pieces are joined
    # once at the end instead of growing a string with `+=`.
    for node_id, node_attr in zip(node_df['node_id'], node_df['node_attr']):
        node_name = node_attr.split(";")[0].split(":")[-1].strip()
        parts.append(f"{node_id}, {node_name}\n")
    parts.append("\n")

    parts.append("src, edge_attr, dst\n")
    for src, edge_attr, dst in zip(triple_df['src'], triple_df['edge_attr'], triple_df['dst']):
        parts.append(f"{src}, {edge_attr}, {dst}\n")
    parts.append("\n")
    return "".join(parts)


# Textual scene graph for every image referenced by the question table,
# keyed by the image id as a string (JSON object keys must be strings).
scene_graphs = {}
for image_id in tqdm(data['image_id'].unique()):
    node_df = pd.read_csv(f'{path_nodes}/{image_id}.csv')
    triple_df = pd.read_csv(f'{path_edges}/{image_id}.csv')
    scene_graphs[str(image_id)] = _scene_graph_to_text(node_df, triple_df)

# Same location as before, now derived from the shared `path` setting.
file_path = os.path.join(path, "scene_graphs_name.json")
with open(file_path, "w") as f:
    json.dump(scene_graphs, f)

  0%|          | 0/46514 [00:00<?, ?it/s]

In [9]:
# Read the serialized scene graphs back from `file_path`, replacing the
# in-memory dict with the JSON content (image id string -> graph text).
with open(file_path) as fh:
    scene_graphs = json.load(fh)

In [4]:
from sklearn.model_selection import train_test_split
import numpy as np

def generate_split():
    """Write a 60/20/20 train/val/test split of the question table.

    The split is made over unique ``image_id`` values, so all questions about
    one image end up in the same subset. For each subset, the row labels of
    the matching questions (index of ``questions.csv`` as loaded here) are
    written one per line to ``<path>/split/{train,val,test}_indices.txt``.

    Side effects: seeds NumPy's global RNG with 42 and creates
    ``<path>/split`` if it does not exist.
    """
    questions = pd.read_csv(f"{data_dir}/questions.csv")

    # Shuffle the distinct image ids with a fixed global seed (reproducible).
    image_ids = questions['image_id'].unique()
    np.random.seed(42)
    image_ids = np.random.permutation(image_ids)

    # 60% train; the remaining 40% is halved into val and test (20% each).
    train_ids, holdout_ids = train_test_split(image_ids, test_size=0.4, random_state=42)
    val_ids, test_ids = train_test_split(holdout_ids, test_size=0.5, random_state=42)

    # Label every image id with its subset, then tag each question row.
    id_to_set = {}
    for label, ids in (('train', train_ids), ('val', val_ids), ('test', test_ids)):
        for image_id in ids:
            id_to_set[image_id] = label
    questions['set'] = questions['image_id'].map(id_to_set)

    split_dir = f'{path}/split'
    os.makedirs(split_dir, exist_ok=True)

    # One headerless file of row indices per subset.
    for label in ('train', 'val', 'test'):
        subset = questions[questions['set'] == label]
        subset.index.to_series().to_csv(f'{split_dir}/{label}_indices.txt', index=False, header=False)

generate_split()