In [3]:
import pathlib
import re
import json
import pandas as pd
from collections import defaultdict
from tqdm import tqdm

# == Base ==
# Root directory holding the image trees, the CSV sheets and the exclusion lists.
DATA_DIR = pathlib.Path("/data2", "radiology_datas")

# == Dataset ==
# Image root of each dataset (4RTNI is the only one under "stripped_skull").
ADNI1 = DATA_DIR.joinpath("adni1", "stripped_cloud")
ADNI2 = DATA_DIR.joinpath("adni2", "stripped_cloud")
ADNI2_2 = DATA_DIR.joinpath("adni2_2", "stripped_cloud")
PPMI = DATA_DIR.joinpath("PPMI", "stripped_cloud")
FourRTNI = DATA_DIR.joinpath("4RTNI", "stripped_skull")

# Directories searched recursively for uids.txt / pids.txt listings.
BLACKLIST_DIR = DATA_DIR.joinpath("black_lists")
NOT_CSV_DIR = DATA_DIR.joinpath("not_csv")

# One row per metadata sheet: (label, file name, subject-id column, class column).
# Row order is significant: it is the order in which the sheets are read.
_CSV_SHEETS = (
    ("ADNI", "ADNIMERGE.csv", "PTID", "DX_bl"),
    ("PPMI", "PPMI.csv", "Subject", "Group"),
    ("4RTNI", "4RTNI_DATA.csv", "SUBID", "DX"),
    ("ADNI_s", "ADNI_summary_sheet.csv", "subjectIdentifier", "subjectInfo: DX Group"),
)

# CSV label -> path of the sheet.
DATA_CSV = {
    label: DATA_DIR.joinpath("csv", sheet)
    for label, sheet, _id_col, _class_col in _CSV_SHEETS
}

# Dataset label -> image root; insertion order is the lookup order.
DATA_DIRS_DICT = dict(
    zip(
        ("ADNI1", "ADNI2", "ADNI2-2", "PPMI", "4RTNI"),
        (ADNI1, ADNI2, ADNI2_2, PPMI, FourRTNI),
    )
)

# CSV label -> name of the column holding the subject id / the class label.
PTID = {label: id_col for label, _sheet, id_col, _class_col in _CSV_SHEETS}
PTCLASS = {label: class_col for label, _sheet, _id_col, class_col in _CSV_SHEETS}

In [4]:
def get_uid(path):
    """Extract the image uid encoded in the file name of ``path``.

    The dataset is the first entry of ``DATA_DIRS_DICT`` whose root (as a
    string) occurs inside ``str(path)``. The uid is then one underscore-separated
    field of the file name:

    * ``ADNI2``   -- second-to-last field, first character dropped, as ``int``
    * ``ADNI2-2`` -- fourth-to-last field, first character dropped, as ``int``
    * ``PPMI``    -- fourth-to-last field, as ``int``
    * ``4RTNI``   -- fourth-to-last field, as ``int``

    Any other matching dataset (``ADNI1``) yields the file name itself as a
    string. ``None`` is returned when no dataset root matches.

    Raises:
        IndexError: the file name has fewer fields than the rule needs.
        ValueError: the selected field is not an integer.
    """
    file_name = path.name
    path_text = str(path)

    # dataset label -> (index of the uid field, number of leading characters to drop)
    field_rules = {
        "ADNI2": (-2, 1),
        "ADNI2-2": (-4, 1),
        "PPMI": (-4, 0),
        "4RTNI": (-4, 0),
    }

    for label, root in DATA_DIRS_DICT.items():
        if str(root) not in path_text:
            continue
        rule = field_rules.get(label)
        if rule is None:
            # No parsing rule for this dataset: the whole file name is the uid.
            return file_name
        field_index, skip = rule
        token = file_name.split("_")[field_index]
        return int(token[skip:])

    return None

def get_blacklist(dir):
    """Collect the integer uids listed in every ``uids.txt`` below ``dir``.

    Args:
        dir: ``pathlib.Path`` that is searched recursively for files named
            ``uids.txt``. Each file holds one integer uid per line.

    Returns:
        list[int]: every uid found, file by file in sorted path order.
        Empty when no ``uids.txt`` exists.

    Raises:
        ValueError: a non-blank line is not an integer.
    """
    excluded_uids = []
    # Sorted so the result does not depend on directory traversal order.
    for uid_file in sorted(dir.glob("**/uids.txt")):
        with open(uid_file, "r") as rf:
            # Blank lines (typically a trailing one) carry no uid and would
            # make int() raise, so they are skipped. int() itself tolerates
            # the surrounding whitespace/newline of a regular line.
            excluded_uids.extend(int(line) for line in rf if line.strip())
    return excluded_uids

def get_not_csv_list(dir):
    """Collect the ids listed in every ``pids.txt`` below ``dir``.

    Args:
        dir: ``pathlib.Path`` that is searched recursively for files named
            ``pids.txt``.

    Returns:
        list[str]: one entry per line of every file found, with the trailing
        newline removed. Lines are not filtered, so a blank line yields an
        empty string.
    """
    collected = []
    for listing in dir.glob("**/pids.txt"):
        with open(listing, "r") as handle:
            collected.extend(line.rstrip("\n") for line in handle)
    return collected
    

# Integer image uids read from every uids.txt below BLACKLIST_DIR; used by the
# following cells to set the "blacklisted" flag of an image.
black_list = get_blacklist(BLACKLIST_DIR)
# Id strings read from every pids.txt below NOT_CSV_DIR.
not_csv_list = get_not_csv_list(NOT_CSV_DIR)

In [7]:
# Build one record per subject listed in a pids.txt below NOT_CSV_DIR.
# The class and the dataset of those subjects are taken from the two directory
# names directly above the listing file: .../<dataset>/<class>/pids.txt.
contents = []

# Raw string: the former non-raw '\.' is an invalid escape sequence.
# The matching itself is unchanged (search anywhere in the path).
image_file_pattern = re.compile(r'/*\.(pkl|npy)')

for pid_file in NOT_CSV_DIR.glob("**/pids.txt"):
    ptclass = pid_file.parent.name
    dataset = pid_file.parent.parent.name

    # Read the ids first so the listing is closed before the image tree is walked.
    # Blank lines are skipped: an empty id would resolve to the class directory
    # itself and sweep in the files of every subject below it.
    with open(pid_file, "r") as rf:
        listed_ptids = [line.rstrip("\n") for line in rf if line.strip()]

    # ptid -> uid -> [("full" | "half", file path), ...]
    add_list = defaultdict(lambda: defaultdict(list))
    for ptid in listed_ptids:
        subject_dir = DATA_DIRS_DICT[dataset] / ptclass / ptid
        for f in subject_dir.glob('**/*'):
            if not image_file_pattern.search(str(f)):
                continue
            uid = get_uid(f)

            if f.name.startswith("fullsize"):
                add_list[ptid][uid].append(("full", str(f)))

            if f.name.startswith("half"):
                add_list[ptid][uid].append(("half", str(f)))

    for ptid, add_uid in add_list.items():
        images = []
        for uid, ls in add_uid.items():
            # Reset for every uid: without this a uid that lacks one variant
            # would reuse the path of the previous uid (or raise NameError on
            # the very first one). A missing variant is stored as None.
            full = None
            half = None
            for size, file_path in ls:
                if size == "full":
                    full = file_path
                elif size == "half":
                    half = file_path

            images.append(
                {
                    "uid": uid,
                    "blacklisted": uid in black_list,
                    "fullsize_img_path": full,
                    "halfsize_img_path": half,
                }
            )

        contents.append(
            {
                "id": ptid,
                "class": ptclass,
                "images": images,
                "dataset": dataset,
                "not_csv": True,
            }
        )

In [8]:
# Add one record per (subject id, class) pair found in the metadata sheets.
# `contents` is NOT reset here: the records are appended to the list created by
# the previous cell, and the combined list is written to all_subjects.json.

# Raw string: the former non-raw '\.' is an invalid escape sequence.
# The matching itself is unchanged (search anywhere in the path).
image_file_pattern = re.compile(r'/*\.(pkl|npy)')

# (ptid, class) pairs already handled, so repeated CSV rows yield one record.
ptid_list = set()
for csv_label, csv_path in DATA_CSV.items():
    df = pd.read_csv(csv_path)
    if csv_label == "4RTNI":
        # Rename CBS to CBD; labels missing from the mapping become NaN.
        df["DX"] = df["DX"].map({"CBS": "CBD", "PSP": "PSP", "Oth": "Oth"}, na_action=None)

    for data in tqdm(df.to_dict(orient="records")):
        ptclass = data[PTCLASS[csv_label]]
        ptid = str(data[PTID[csv_label]])

        if (ptid, ptclass) in ptid_list:
            continue
        ptid_list.add((ptid, ptclass))

        # A class that is not a string (NaN for an empty or unmapped cell)
        # cannot be joined into a path; such subjects are looked up under
        # "Nan" and reported, exactly as the former TypeError fallback did.
        has_class = isinstance(ptclass, str)
        class_dir = ptclass if has_class else "Nan"

        files = []
        dataset = None  # label of the last dataset in which files were found
        for label, data_dir in DATA_DIRS_DICT.items():
            if not has_class:
                print(label, data_dir, ptclass, ptid, flush=True)
            subject_dir = data_dir / class_dir / ptid
            found = [p for p in subject_dir.glob('**/*') if image_file_pattern.search(str(p))]
            if found:
                files += found
                dataset = label

        # uid -> [("full" | "half", file path), ...]
        by_uid = defaultdict(list)
        for f in files:
            uid = get_uid(f)

            if f.name.startswith("fullsize"):
                by_uid[uid].append(("full", str(f)))

            if f.name.startswith("half"):
                by_uid[uid].append(("half", str(f)))

        if not by_uid:
            # No fullsize/half image on disk for this subject: no record.
            continue

        images = []
        for uid, ls in by_uid.items():
            # Reset for every uid: without this a uid that lacks one variant
            # would reuse the path of the previous uid (or raise NameError on
            # the very first one). A missing variant is stored as None.
            full = None
            half = None
            for size, file_path in ls:
                if size == "full":
                    full = file_path
                elif size == "half":
                    half = file_path

            images.append(
                {
                    "uid": uid,
                    "blacklisted": uid in black_list,
                    "fullsize_img_path": full,
                    "halfsize_img_path": half,
                }
            )

        contents.append(
            {
                "id": ptid,
                "class": ptclass,
                "images": images,
                "dataset": dataset,
                # NOTE(review): this cell used to compute `ptid in not_csv_list`
                # and then ignore it; the flag has always been written as False
                # for CSV subjects. Kept as False -- confirm that is intended.
                "not_csv": False,
            }
        )

with open('./all_subjects.json', 'w') as f:
    json.dump(contents, f, ensure_ascii=False, indent=2)

print(len(contents))

100%|██████████| 12729/12729 [00:01<00:00, 10783.19it/s]
100%|██████████| 500/500 [00:00<00:00, 2279.36it/s]
  0%|          | 0/124 [00:00<?, ?it/s]

ADNI1 /data2/radiology_datas/adni1/stripped_cloud nan 4_S_5010
ADNI2 /data2/radiology_datas/adni2/stripped_cloud nan 4_S_5010
ADNI2-2 /data2/radiology_datas/adni2_2/stripped_cloud nan 4_S_5010
PPMI /data2/radiology_datas/PPMI/stripped_cloud nan 4_S_5010
4RTNI /data2/radiology_datas/4RTNI/stripped_skull nan 4_S_5010


100%|██████████| 124/124 [00:00<00:00, 1191.95it/s]
100%|██████████| 3600/3600 [00:00<00:00, 9918.78it/s] 


1772


In [9]:
# Smoke check: count the images the loader selects for one dataset/class choice.
# load_images is a project-local module that is not shown in this notebook.
# NOTE(review): presumably it reads the all_subjects.json written above -- confirm.
from load_images import load_images
# Dataset labels (keys of DATA_DIRS_DICT) passed as `datasets`.
D = {'ADNI2','ADNI2-2'}
# Class labels passed as `classes`.
C = {'AD'}
# Passed as `unique`.
U = True
# Passed as `blacklist`.
B = True
# Passed as `add_csv`.
N = True

# NOTE(review): the exact meaning of unique / blacklist / add_csv / dryrun is
# defined by load_images and cannot be verified from this notebook.
images = load_images(datasets=D, classes=C, size="half", unique=U, blacklist=B, add_csv=N, dryrun=True)
print(len(images))

294
