In [1]:
import os
os.chdir("../")
print(os.getcwd())

/data1/xhuan192/codes/med-vlrm


First fix the key in one json file:
```bash
python scripts/fix_key_in_chest_ct_scan.py
```

In [35]:
import json
from pathlib import Path
from pprint import pformat
from collections import Counter, defaultdict
import numpy as np
import pandas as pd

In [3]:
vqa_json_dir = Path("data/OmniMedVQA/QA_information/Open-access")
# vqa_json_dir = Path("data/OmniMedVQA/QA_information/Restricted-access")

In [4]:
vqa_json_list = list(vqa_json_dir.glob("*.json"))
print(f"Number of VQA JSON files: {len(vqa_json_list)}")

Number of VQA JSON files: 42


In [14]:
one_vqa_json = vqa_json_list[0]
print(f"Example VQA JSON file: {one_vqa_json}")

with open(one_vqa_json, "r") as f:
    data = json.load(f)
print(f"Number of samples: {len(data)}")
print(f"One sample: {pformat(data[0])}")

Example VQA JSON file: data/OmniMedVQA/QA_information/Open-access/Chest CT Scan.json
Number of samples: 871
One sample: {'dataset': 'Chest CT Scan',
 'gt_answer': 'CT',
 'image_path': 'Images/Chest CT '
               'Scan/test/adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib/000143 '
               '(6).png',
 'modality_type': 'CT(Computed Tomography)',
 'option_A': 'Angiography',
 'option_B': 'Electrocardiogram (ECG)',
 'option_C': 'Mammogram',
 'option_D': 'CT',
 'question': 'What imaging technique was employed for capturing this image?',
 'question_id': 'Chest CT Scan_0000',
 'question_type': 'Modality Recognition'}


In [15]:
# validate if all json are list of dictionaries
# validate all items have the same keys
template_keys = None
for i, vqa_json in enumerate(vqa_json_list):
    with open(vqa_json, "r") as f:
        data = json.load(f)
        if not isinstance(data, list):
            raise ValueError(f"JSON file {vqa_json} is not a list.")
        if not all(isinstance(item, dict) for item in data):
            raise ValueError(f"JSON file {vqa_json} does not contain all dictionaries.")

        if template_keys is None:
            template_keys = set(data[0].keys())
        keys = set(data[0].keys())
        if keys != template_keys:
            mismatched_keys = keys - template_keys
            missing_keys = template_keys - keys
            raise ValueError(f"JSON file {vqa_json} has mismatched keys: {mismatched_keys} or missing keys: {missing_keys}")
        # print(f"{vqa_json.stem}:\t{sorted(keys)}")

    print(f"Validated JSON file {i+1}/{len(vqa_json_list)}: {vqa_json}")

Validated JSON file 1/42: data/OmniMedVQA/QA_information/Open-access/Chest CT Scan.json
Validated JSON file 2/42: data/OmniMedVQA/QA_information/Open-access/SARS-CoV-2 CT-scan.json
Validated JSON file 3/42: data/OmniMedVQA/QA_information/Open-access/ISIC2019.json
Validated JSON file 4/42: data/OmniMedVQA/QA_information/Open-access/Adam Challenge.json
Validated JSON file 5/42: data/OmniMedVQA/QA_information/Open-access/OLIVES.json
Validated JSON file 6/42: data/OmniMedVQA/QA_information/Open-access/Mura.json
Validated JSON file 7/42: data/OmniMedVQA/QA_information/Open-access/ISBI2016.json
Validated JSON file 8/42: data/OmniMedVQA/QA_information/Open-access/Retinal OCT-C8.json
Validated JSON file 9/42: data/OmniMedVQA/QA_information/Open-access/ISIC2018.json
Validated JSON file 10/42: data/OmniMedVQA/QA_information/Open-access/Fitzpatrick 17k.json
Validated JSON file 11/42: data/OmniMedVQA/QA_information/Open-access/DeepDRiD.json
Validated JSON file 12/42: data/OmniMedVQA/QA_information

In [32]:
all_modality_types = set()
all_question_types = set()
for i, vqa_json in enumerate(vqa_json_list):
    with open(vqa_json, "r") as f:
        data = json.load(f)
        for item in data:
            all_modality_types.add(item["modality_type"])
            all_question_types.add(item["question_type"])
print(f"All modality types ({len(all_modality_types)}):\n{pformat(sorted(all_modality_types))}")
print(f"All question types ({len(all_question_types)}):\n{pformat(sorted(all_question_types))}")

All modality types (8):
['CT(Computed Tomography)',
 'Dermoscopy',
 'Fundus Photography',
 'MR (Mag-netic Resonance Imaging)',
 'Microscopy Images',
 'OCT (Optical Coherence Tomography',
 'X-Ray',
 'ultrasound']
All question types (5):
['Anatomy Identification',
 'Disease Diagnosis',
 'Lesion Grading',
 'Modality Recognition',
 'Other Biological Attributes']


In [33]:
def stat_types(data, all_modality_types, all_question_types):
    modality_type_count = {k: 0 for k in all_modality_types}
    question_type_count = {k: 0 for k in all_question_types}
    for sample in data:
        modality_type_count[sample["modality_type"]] += 1
        question_type_count[sample["question_type"]] += 1
    return modality_type_count, question_type_count

def gather_stat_types(vqa_json_list, all_modality_types, all_question_types):
    vqa_json_stats = defaultdict(dict)
    for i, vqa_json in enumerate(vqa_json_list):
        with open(vqa_json, "r") as f:
            data = json.load(f)
            modality_type_count, question_type_count = stat_types(data, all_modality_types, all_question_types)
            vqa_json_stats[vqa_json.stem]["modality_type_count"] = modality_type_count
            vqa_json_stats[vqa_json.stem]["question_type_count"] = question_type_count
    return vqa_json_stats
vqa_json_stats = gather_stat_types(vqa_json_list, all_modality_types, all_question_types)

In [36]:
modality_type_df = pd.DataFrame.from_dict(
    {k: v["modality_type_count"] for k, v in vqa_json_stats.items()},
    orient="index",
)
question_type_df = pd.DataFrame.from_dict(
    {k: v["question_type_count"] for k, v in vqa_json_stats.items()},
    orient="index",
)

In [38]:
display(modality_type_df)
display(question_type_df)

Unnamed: 0,OCT (Optical Coherence Tomography,Fundus Photography,MR (Mag-netic Resonance Imaging),CT(Computed Tomography),Microscopy Images,ultrasound,X-Ray,Dermoscopy
Chest CT Scan,0,0,0,871,0,0,0,0
SARS-CoV-2 CT-scan,0,0,0,910,0,0,0,0
ISIC2019,0,0,0,0,0,0,0,1952
Adam Challenge,0,87,0,0,0,0,0,0
OLIVES,0,593,0,0,0,0,0,0
Mura,0,0,0,0,0,0,1464,0
ISBI2016,0,0,0,0,0,0,0,681
Retinal OCT-C8,4016,0,0,0,0,0,0,0
ISIC2018,0,0,0,0,0,0,0,272
Fitzpatrick 17k,0,0,0,0,0,0,0,1552


Unnamed: 0,Modality Recognition,Other Biological Attributes,Anatomy Identification,Lesion Grading,Disease Diagnosis
Chest CT Scan,10,268,0,268,325
SARS-CoV-2 CT-scan,15,0,298,0,597
ISIC2019,315,0,0,0,1637
Adam Challenge,10,0,0,0,77
OLIVES,154,0,439,0,0
Mura,252,0,424,0,788
ISBI2016,69,0,136,0,476
Retinal OCT-C8,349,1697,0,0,1970
ISIC2018,15,0,84,0,173
Fitzpatrick 17k,265,0,0,0,1287
