In [1]:
import pathlib
import re
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# == Base ==
DATA_DIR = pathlib.Path("/data") / "radiology_datas"

# == Dataset ==
ADNI1 = DATA_DIR / "ADNI1"
ADNI2 = DATA_DIR / "JHU-radiology" / "20170509"
ADNI2_2 = DATA_DIR / "JHU-radiology" / "MNI_skull_stripped" / "output"
PPMI = DATA_DIR / "JHU-radiology" / "PPMI"
FourRTNI = DATA_DIR / "JHU-radiology" / "4RTNI"

BLACKLIST_DIR = DATA_DIR / "util" / "lists"

DATA_CSV = {
    "ADNI": DATA_DIR / "JHU-radiology" / "ADNIMERGE.csv",
    "PPMI": DATA_DIR / "JHU-radiology" / "PPMI.csv",
    "4RTNI": FourRTNI / "csv" / "4RTNI_DATA.csv",
}

DATA_DIRS_DICT = {
    "ADNI1": ADNI1,
    "ADNI2": ADNI2,
    "ADNI2-2": ADNI2_2,
    "PPMI": PPMI,
    "4RTNI": FourRTNI / "SkullStripped",
}


In [2]:
def get_uid(path):
    """
    pathを受け取ってuidを返すだけ
    Args
    ----------
    path : pathlib
        pklファイルへのパス
    Return
    ----------
    uid : int
        uid
    """
    uid = path.name
    for key, value in DATA_DIRS_DICT.items():
        if str(value) in str(path):

            if key == "ADNI2":
                uid = path.name.split("_")[-2]
                uid = int(uid[1:])

            elif key == "ADNI2-2":
                uid = path.name.split("_")[-4]
                uid = int(uid[1:])

            elif key == "PPMI":
                uid = path.name.split("_")[-4]
                uid = int(uid)

            elif key == "4RTNI":
                uid = path.name.split("_")[-4]
                uid = int(uid)

            return uid


def get_blacklist(dir):
    """
    brain/util/listsの中にいるblacklistたちをuidのリストで返す
    Args
    ----------
    Return
    ----------
    uid : list of int
        uids
    """
    key = "**/uids.txt"
    excluded_uid_paths = dir.glob(key)
    excluded_uids = []
    for path in excluded_uid_paths:
        with open(path, "r") as rf:
            [excluded_uids.append(int(uid.rstrip("\n"))) for uid in rf]
    return excluded_uids

black_list = get_blacklist(BLACKLIST_DIR)


In [3]:
contents = []

PTID = {"ADNI": "PTID", "PPMI": "Subject", "4RTNI": "SUBID"}
PTCLASS = {"ADNI": "DX_bl", "PPMI": "Group", "4RTNI": "DX"}

for csv_label, csv in DATA_CSV.items():

  df = pd.read_csv(csv)
  if csv_label == "4RTNI":
    df["DX"] = df["DX"].map(
        {"CBS": "CBD", "PSP": "PSP", "Oth": "Oth"}, na_action=None)

  for data in tqdm(df.to_dict(orient="records")):
    ptclass = data[PTCLASS[csv_label]]
    ptid = str(data[PTID[csv_label]])
    for label, dir in DATA_DIRS_DICT.items():
      try:
        path = pathlib.Path(dir / ptclass / ptid)
      except TypeError:
        path = pathlib.Path(dir / "Nan" / ptid)
        print(label, dir, ptclass, ptid, flush=True)

      files = [p for p in path.glob('**/*')
               if re.search('/*\.(pkl|npy)', str(p))]
      # files = list(path.glob("*.[p,n][k,p][l,y]"))
      if len(files) != 0:
        dataset = label
        break
    # if len(files) == 0:
      # print(f"該当ファイルなし, {label=}, {ptclass=}, {ptid=}")

    add_list = defaultdict(lambda :defaultdict(list))
    for f in files:
      f_temp = str(f).split("/")[-1]

      uid = get_uid(f)
      if f_temp.startswith("fullsize"):
        add_list[ptid][uid].append(("full", str(f)))
      if f_temp.startswith("half"):
        add_list[ptid][uid].append(("half", str(f)))

    for ptid, add_uid in add_list.items():
      images = []
      for uid, ls in add_uid.items():
        if uid not in black_list:
          blacklisted = False
        else:
          blacklisted = True
        
        for l in ls:
          if l[0] == "full":
            full = l[1]
          elif l[0] == "half":
            half = l[1]

        images.append(
          {
            "uid": uid,
            "blacklisted": blacklisted,
            "fullsize_img_path": full,
            "halfsize_img_path": half
          }
        )

      content = {
          "id": ptid,
          "class": ptclass,
          "images": images,
          "dataset": dataset
      }
      contents.append(content)

with open('./all_subject.json', 'w') as f:
  json.dump(contents, f, ensure_ascii=False, indent=2)

print(len(contents))


100%|██████████| 12729/12729 [00:09<00:00, 1377.77it/s]
100%|██████████| 500/500 [00:00<00:00, 1066.73it/s]
  0%|          | 0/124 [00:00<?, ?it/s]

ADNI1 /data/radiology_datas/ADNI1 nan 4_S_5010
ADNI2 /data/radiology_datas/JHU-radiology/20170509 nan 4_S_5010
ADNI2-2 /data/radiology_datas/JHU-radiology/MNI_skull_stripped/output nan 4_S_5010
PPMI /data/radiology_datas/JHU-radiology/PPMI nan 4_S_5010
4RTNI /data/radiology_datas/JHU-radiology/4RTNI/SkullStripped nan 4_S_5010


100%|██████████| 124/124 [00:00<00:00, 1130.44it/s]


8902


In [15]:
from load_images import load_images
images = load_images(datasets={"ADNI2", "ADNI2-2", "PPMI"},
                     classes={"CN", "AD", "Control"}, size="half", unique=False, blacklist=True, dryrun=True)
print(len(images))
images[0]

13528


{'uid': 35475,
 'blacklisted': False,
 'fullsize_img_path': '/data/radiology_datas/JHU-radiology/MNI_skull_stripped/output/CN/011_S_0002/fullsize011_S_0002_2005-08-26_S9107_I35475_flipped_MNI_SS.pkl',
 'halfsize_img_path': '/data/radiology_datas/JHU-radiology/MNI_skull_stripped/output/CN/011_S_0002/half_011_S_0002_2005-08-26_S9107_I35475_flipped_MNI_SS.pkl',
 'subject_id': '011_S_0002',
 'class': 'CN',
 'dataset': 'ADNI2-2'}

In [8]:
json_open = open("./all_subject.json", "r")
json_load = json.load(json_open)
print(len(json_load))
class_set = set([j['class'] for j in json_load])
print(class_set)
json_load[0]

8902
{'CBD', 'AD', 'PSP', 'PD', 'LMCI', 'Prodromal', 'CN', 'Control', 'EMCI', 'SWEDD', 'Oth', 'SMC'}


{'id': '011_S_0002',
 'class': 'CN',
 'images': [{'uid': 35475,
   'blacklisted': False,
   'fullsize_img_path': '/data/radiology_datas/JHU-radiology/MNI_skull_stripped/output/CN/011_S_0002/fullsize011_S_0002_2005-08-26_S9107_I35475_flipped_MNI_SS.pkl',
   'halfsize_img_path': '/data/radiology_datas/JHU-radiology/MNI_skull_stripped/output/CN/011_S_0002/half_011_S_0002_2005-08-26_S9107_I35475_flipped_MNI_SS.pkl'}],
 'dataset': 'ADNI2-2'}

In [5]:
images = []
images.append(
    {
        "uid": "01",
        "blacklisted": True,
        "fullsize_img_path": "full",
        "halfsize_img_path": "half"
    }
)
images.append(
    {
        "uid": "02",
        "blacklisted": False,
        "fullsize_img_path": "full",
        "halfsize_img_path": "half"
    }
)
images

[{'uid': '01',
  'blacklisted': True,
  'fullsize_img_path': 'full',
  'halfsize_img_path': 'half'},
 {'uid': '02',
  'blacklisted': False,
  'fullsize_img_path': 'full',
  'halfsize_img_path': 'half'}]