# Goal

This notebook saves a .txt and .pkl file:
- .txt: a list of MR paths (filtered between 100-300 slices)
- .pkl: dataframe w/ file metadata (filename, size, spacing, direction, etc)

Available MR data from:
- ABIDE.txt (1160 files)
- ABVIB.txt (778 files)
- ACRIN-FMISO-Brain.txt (1403 files)
- ADNI.txt (2640 files)
- PPMI.txt (1524 files)

# Imports

In [1]:
# imports

import os
import time
import pickle
from pathlib import Path

import SimpleITK as sitk

import numpy as np
from pandas import DataFrame as DF

from helpers_general import sitk2np, print_sitk_info, round_tuple, lrange, lmap, get_roi_range, numbers2groups

# Path to data

In [15]:
# wsl: /home/rgologorsky/DeepPit
hd_path = "../" * 5 + "/media/labcomputer/e33f6fe0-5ede-4be4-b1f2-5168b7903c7a" + "/home/rachel/PitMRdata/samir_labels"

folders = os.listdir(hd_path)
print(f"#folders = {len(folders)}", folders, sep="\n")

#folders = 5
['50155-50212', '50373-50453', '50002-50153', '50213-50312', '50313-50372']


# Get list of terminal folders

In [81]:
# modified os.walk to stop at last subdir
def walk_to_series(top):
    names = os.listdir(top)
    subdirs = [name for name in names if os.path.isdir(os.path.join(top, name))]

    mr_paths = []
    
    # terminal folder
    if len(subdirs) == 0:
        mr_paths.append(top)

    # recurse on subdirs
    for subdir in subdirs:
        newpath = os.path.join(top, subdir)
        mr_paths += walk_to_series(newpath)
        
    return mr_paths

def get_terminal_folders(top):    
        
    # start timer
    start = time.time() 
    
    # get series paths
    mr_paths = walk_to_series(f"{hd_path}/{top}")
        
    # end timer
    elapsed = time.time() - start
    print(f"Elapsed: {elapsed} s for {len(mr_paths)} files.")
    
    # save results
    with open(f"{top}.txt", "wb") as fp:   #Pickling
        pickle.dump(mr_paths, fp)
        
    return mr_paths

In [102]:
for folder in folders: 
    print(folder, end=': ')
    mr_paths = get_terminal_folders(folder)
    #print(*mr_paths[:2], sep="\n")

50155-50212: Elapsed: 0.009264469146728516 s for 48 files.
50373-50453: Elapsed: 0.011237859725952148 s for 66 files.
50002-50153: Elapsed: 0.012895584106445312 s for 101 files.
50213-50312: Elapsed: 0.005379915237426758 s for 60 files.
50313-50372: Elapsed: 0.004918336868286133 s for 60 files.


# Metadata

folder	imputedSeq	fn	sz	px	sp	dir

In [87]:
def get_folder_name(s):
    s = s[len("../../../../..//media/labcomputer/e33f6fe0-5ede-4be4-b1f2-5168b7903c7a/home/rachel/PitMRdata/samir_labels/50155-50212/"):]
    return s[0:s.index("/")]

In [103]:
get_folder_name(mr_paths[0])

'50368'

In [88]:
def get_imputed_seq(fn):
    for seq in ("MPR", "RAGE", "T1", "T2", "FLAIR", "WOW"):
        if seq.lower() in fn or seq.upper() in fn:
            if seq == "RAGE": return "MPR"
            else: return seq
    return "UNKNOWN"

In [89]:
# get extension
# assume all files in dir have same extension (eg .dcm)
def get_ext(dir_path):
    # assume all files in dir have same extension
    file = next(os.walk(dir_path))[2][0]
    
    # in case file is bytes not str
    try:
        file = file.decode()
    except:
        pass
    
    return Path(file).suffix.lower()

In [138]:
def read_metadata(top):

    # get list of files
    with open(f"{top}.txt", "rb") as fp:
        mr_paths = pickle.load(fp)
    
    d = []
    for path in mr_paths:

        # get file ext (nii, dcm, etc)
        ext    = get_ext(path)
        seq    = get_imputed_seq(path)
        
        # get folder name = data src
        folder = get_folder_name(path)

        # get 1st child file in terminal folder
        file = os.listdir(str(path))[0]
        file = f"{path}/{file}"

        # ASSUMES only 1 nii in folder

        if ext == ".nii" or ext == ".img":

            # read meta data
            reader = sitk.ImageFileReader()
            reader.SetImageIO("NiftiImageIO")
            reader.SetFileName(file)
            reader.ReadImageInformation()

            # get num slices
            sz = reader.GetSize()
            n  = min(sz)

        elif ext == ".dcm":
            n = len(os.listdir(str(path)))

            # read meta data
            reader = sitk.ImageFileReader()
            reader.SetFileName(file)
            reader.ReadImageInformation()

            # add n_slices to size
            sz = reader.GetSize()
            if sz[2] == 1:
                sz = (sz[0], sz[1], n)

        else:
            print(f"Weird ext - {ext}.")

        # save
        if n > 100 and n < 300:
            d.append({
                "folder": folder,
                "fn": file,
                "imputedSeq": seq,
                "sz": sz,
                "px": sitk.GetPixelIDValueAsString(reader.GetPixelID()),
                "sp": tuple(round(x,2) for x in reader.GetSpacing()),
                "dir": tuple(int(round(x,1)) for x in reader.GetDirection())
            })
        
    # save dataframe
    d = DF(d)
    
    #print(f"./{top}-test.pkl")
    #d.to_pickle(f"./{top}-test.pkl")
        
    return d

In [139]:
frames = [read_metadata(folder) for folder in folders]
result = pd.concat(frames)
    
# save dataframe
result.to_pickle(f"./samir_labels.pkl")

#print(f"./{top}-test.pkl")
#d.to_pickle(f"./{top}-test.pkl")

In [141]:
result.fn.values[0]

'../../../../..//media/labcomputer/e33f6fe0-5ede-4be4-b1f2-5168b7903c7a/home/rachel/PitMRdata/samir_labels/50155-50212/50198/MP-RAGE/2000-01-01_00_00_00.0/S164832/ABIDE_50198_MRI_MP-RAGE_br_raw_20120830184954222_S164832_I328842.nii'

In [134]:
import pandas as pd

folder = folders[1]
read_metadata(folder)

./50373-50453-test.pkl


Unnamed: 0,folder,fn,imputedSeq,sz,px,sp,dir
0,50422,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(106, 256, 256)",16-bit signed integer,"(1.4, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
1,50380,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(256, 256, 124)",16-bit signed integer,"(1.02, 1.02, 1.2)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
2,50411,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(120, 256, 256)",16-bit signed integer,"(1.4, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
3,50433,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 480, 512)",16-bit signed integer,"(1.2, 0.5, 0.5)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
4,50404,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(106, 256, 256)",16-bit signed integer,"(1.4, 0.86, 0.86)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
...,...,...,...,...,...,...,...
61,50436,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 480, 512)",16-bit signed integer,"(1.2, 0.5, 0.5)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
62,50385,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(256, 256, 124)",16-bit signed integer,"(1.02, 1.02, 1.2)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
63,50424,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(106, 256, 256)",16-bit signed integer,"(1.4, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
64,50449,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 480, 512)",16-bit signed integer,"(1.2, 0.5, 0.5)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"


In [135]:
unpickled_df = pd.read_pickle(f"./{folder}.pkl")
unpickled_df

Unnamed: 0,folder,fn,imputedSeq,sz,px,sp,dir
0,50373-50453,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(106, 256, 256)",16-bit signed integer,"(1.4, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"


In [136]:
print(f"Folder {folder}.")

Folder 50373-50453.


In [137]:
unpickled_df = pd.read_pickle(f"./{folder}-test.pkl")
display(unpickled_df)

Unnamed: 0,folder,fn,imputedSeq,sz,px,sp,dir
0,50422,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(106, 256, 256)",16-bit signed integer,"(1.4, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
1,50380,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(256, 256, 124)",16-bit signed integer,"(1.02, 1.02, 1.2)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
2,50411,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(120, 256, 256)",16-bit signed integer,"(1.4, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
3,50433,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 480, 512)",16-bit signed integer,"(1.2, 0.5, 0.5)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
4,50404,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(106, 256, 256)",16-bit signed integer,"(1.4, 0.86, 0.86)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
...,...,...,...,...,...,...,...
61,50436,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 480, 512)",16-bit signed integer,"(1.2, 0.5, 0.5)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
62,50385,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(256, 256, 124)",16-bit signed integer,"(1.02, 1.02, 1.2)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
63,50424,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(106, 256, 256)",16-bit signed integer,"(1.4, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
64,50449,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 480, 512)",16-bit signed integer,"(1.2, 0.5, 0.5)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"


In [120]:
# save results
with open(f"{path}.txt", "wb") as fp:   #Pickling
    pickle.dump(mr_paths, fp)

In [29]:
# data frame w/ meta data info
d = []

for folder in folders[1:]:
    # get list of files
    with open(f"{folder}.txt", "wb") as fp:   #Pickling
        mr_paths = pickle.load(fp)
        
    for path in mr_paths:

        # get folder name = data src
        folder = get_folder_name(path)

        # get file ext (nii, dcm, etc)
        ext    = get_ext(path)
        seq    = get_imputed_seq(path)

        # get 1st child file in terminal folder
        file = os.listdir(str(path))[0]
        file = f"{path}/{file}"

        # ASSUMES only 1 nii in folder

        if ext == ".nii" or ext == ".img":

            # read meta data
            reader = sitk.ImageFileReader()
            reader.SetImageIO("NiftiImageIO")
            reader.SetFileName(file)
            reader.ReadImageInformation()

            # get num slices
            sz = reader.GetSize()
            n  = min(sz)

        elif ext == ".dcm":
            n = len(os.listdir(str(path)))

            # read meta data
            reader = sitk.ImageFileReader()
            reader.SetFileName(file)
            reader.ReadImageInformation()

            # add n_slices to size
            sz = reader.GetSize()
            sz = (sz[0], sz[1], n)

        else:
            print(f"Weird ext - {ext}.")

        # save
        if n > 100 and n < 300:
            d.append({
                "folder": folder,
                "fn": file,
                "imputedSeq": seq,
                "sz": sz,
                "px": sitk.GetPixelIDValueAsString(reader.GetPixelID()),
                "sp": tuple(round(x,2) for x in reader.GetSpacing()),
                "dir": tuple(int(round(x,1)) for x in reader.GetDirection())
            })

    # save dataframe
    d = DF(d)
    d.to_pickle(f"./{folder}.pkl")
    
    d

Unnamed: 0,folder,fn,imputedSeq,sz,px,sp,dir
0,50198,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
1,50171,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 239, 200)",16-bit signed integer,"(1.1, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
2,50190,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
3,50188,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
4,50212,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
5,50200,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
6,50157,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 239, 200)",16-bit signed integer,"(1.1, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
7,50196,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
8,50167,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 239, 200)",16-bit signed integer,"(1.1, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
9,50193,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"


In [31]:
path

'../../../../..//media/labcomputer/e33f6fe0-5ede-4be4-b1f2-5168b7903c7a/home/rachel/PitMRdata/samir_labels/50155-50212/50169/MP-RAGE/2000-01-01_00_00_00.0/S165727'

In [32]:
d.to_pickle("./50155-50212.pkl")

In [35]:
import pandas as pd

In [36]:
import
unpickled_df = pd.read_pickle("./50155-50212.pkl")
unpicled_df

In [37]:
unpickled_df

Unnamed: 0,folder,fn,imputedSeq,sz,px,sp,dir
0,50198,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
1,50171,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 239, 200)",16-bit signed integer,"(1.1, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
2,50190,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
3,50188,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
4,50212,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
5,50200,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
6,50157,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 239, 200)",16-bit signed integer,"(1.1, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
7,50196,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
8,50167,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(160, 239, 200)",16-bit signed integer,"(1.1, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
9,50193,../../../../..//media/labcomputer/e33f6fe0-5ed...,MPR,"(172, 256, 256)",16-bit signed integer,"(1.0, 1.0, 1.0)","(1, 0, 0, 0, -1, 0, 0, 0, 1)"
