In [2]:
import os 
# Export the dataset roots and the Hugging Face cache location in one step.
# NOTE(review): these are absolute, machine-specific paths and they overwrite
# any value already present in the environment -- confirm that is intended.
os.environ.update({
    'COCO_DIR': '/usr1/data/mingqia2/datasets/coco/',
    'AOKVQA_DIR': '/usr1/data/mingqia2/aokvqa/',
    'HF_HOME': '/usr1/data/models_cache',
})

import json
from collections import Counter

In [7]:
def get_coco_path(split, image_id, coco_dir):
    """Build the file path of a COCO 2017 image.

    Args:
        split: Split name (e.g. ``"train"``); the image is placed under the
            ``<split>2017`` sub-directory of ``coco_dir``.
        image_id: Numeric image id, given as an ``int`` or as a digit string.
        coco_dir: Root directory of the COCO dataset.

    Returns:
        ``<coco_dir>/<split>2017/<image_id zero-padded to 12 digits>.jpg``.

    Raises:
        TypeError, ValueError: If ``image_id`` cannot be converted to an int.
    """
    # The '0' width flag only left-zero-pads numeric values, so ids that were
    # loaded as strings are coerced to int before formatting.
    file_name = f"{int(image_id):012d}.jpg"
    return os.path.join(coco_dir, f"{split}2017", file_name)

In [8]:
def _make_llava_sample(sample_id, image_path, prompt, answer):
    """Wrap one prompt/answer pair in a single-turn conversation record."""
    return {
        "id": sample_id,
        "image": image_path,
        "conversations": [
            {"from": "human", "value": prompt},
            {"from": "gpt", "value": answer},
        ],
    }


def convert_to_llava_format(input_path, output_path, coco_dir, split='train'):
    """Convert a ViperGPT-augmented A-OKVQA JSON file into conversation records.

    The input file must contain a JSON list of entries. For every entry two
    records are appended to the output, in this order:

    1. a direct-answer record whose target is the most frequent string in
       ``direct_answers`` (empty string when there are none), and
    2. a multiple-choice record whose target is the letter (A, B, ...) of
       ``correct_choice_idx``.

    Both records share the entry's ``question_id`` as ``"id"`` and point to
    the same image path, built by ``get_coco_path``.

    Args:
        input_path: Path of the JSON file to read.
        output_path: Path of the JSON file to write (overwritten).
        coco_dir: COCO root directory, forwarded to ``get_coco_path``.
        split: COCO split name, forwarded to ``get_coco_path``.

    Raises:
        KeyError: If an entry has no ``"choices"`` or ``"correct_choice_idx"``.
    """
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Shared closing instruction of both prompts. "\\n" is an escaped
    # backslash, so the prompt text contains a literal backslash-n, matching
    # the separator used in the target answers below.
    # NOTE(review): the direct-answer prompt also asks for "the letter"
    # although it lists no choices, and the targets start with "Rationales:"
    # while the requested format says "Rationale:" -- left unchanged here;
    # confirm against the inference-time prompt and parser before changing.
    answer_format = (
        "Please provide a rationale and then return the letter of the correct answer in the format: "
        "'Rationale: [your explanation] \\n Answer: [your answer]'."
    )

    converted = []
    for entry in data:
        question_id = entry.get("question_id")
        question = entry.get("question", "")
        # "or" fallbacks also cover keys that are present but null in the JSON.
        rationales = " ".join(entry.get("rationales") or [])
        viper_response = (entry.get("viper_gpt") or {}).get("viper_response", "")
        image_path = get_coco_path(split, entry.get("image_id"), coco_dir)

        # Direct-answer target: most frequent answer (first seen wins ties).
        direct_answers = entry.get("direct_answers") or []
        if direct_answers:
            most_common_answer = Counter(direct_answers).most_common(1)[0][0]
        else:
            most_common_answer = ""

        da_prompt = (
            f"<image>\nQuestion: {question}\n"
            f"\nVisual Clues: {viper_response}\n"
            + answer_format
        )
        da_answer = f"Rationales: {rationales} \\n Answer: {most_common_answer}"

        # Label the choices A, B, C, ... (chr(65) is "A").
        formatted_choices = ', '.join(
            f"{chr(65 + i)}: {choice}" for i, choice in enumerate(entry['choices'])
        )
        correct_choice = chr(65 + entry["correct_choice_idx"])

        # Every record carries an "image", so the multiple-choice prompt gets
        # the same "<image>" placeholder as the direct-answer prompt.
        mc_prompt = (
            f"<image>\nQuestion: {question} \n Visual Clues: {viper_response} \n Choices: {formatted_choices}. "
            + answer_format
        )
        mc_answer = f"Rationales: {rationales} \\n Answer: {correct_choice}"

        converted.append(_make_llava_sample(question_id, image_path, da_prompt, da_answer))
        converted.append(_make_llava_sample(question_id, image_path, mc_prompt, mc_answer))

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(converted, f, indent=2)

In [5]:
# Input (A-OKVQA + ViperGPT augmentation) and output files for the train split.
original_train = "../results/viper_augmentations/aokvqa_plus_viper_train.json"
converted_train = "../results/viper_augmentations/llava_train.json"

# Dataset roots are read back from the environment; each is None if unset.
aokvqa_dir = os.environ.get('AOKVQA_DIR')
coco_dir = os.environ.get('COCO_DIR')

In [9]:
# Convert the train split; the result is written to `converted_train`.
convert_to_llava_format(
    input_path=original_train,
    output_path=converted_train,
    coco_dir=coco_dir,
    split='train',
)