In [None]:
import os
from glob import glob
from multiprocessing.pool import ThreadPool

import numpy as np
import pandas as pd
import pydicom
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

In [None]:
def get_dicoms_from_folder(folder_name, jobs=30):
    """Read every DICOM file matched by a glob pattern using a thread pool.

    Args:
        folder_name: Pattern passed verbatim to ``glob`` (despite the name,
            callers pass a pattern such as ``<dir>/*.dcm``).
        jobs: Number of worker threads used to read the files.

    Returns:
        A list with the result of ``pydicom.dcmread`` for every matched file.
        The order is the completion order of the workers, not the file order.
        An empty list is returned when the pattern matches nothing.

    Raises:
        Exception: if globbing or reading any file fails; the original error
            is chained as ``__cause__``.
    """
    try:
        list_url = glob(folder_name)
        if not list_url:
            # Nothing to read: avoid spinning up worker threads for no work.
            return []
        # The context manager tears the pool down on exit, so the worker
        # threads do not pile up when this is called once per scan.
        with ThreadPool(jobs) as pool:
            dcms = list(pool.imap_unordered(pydicom.dcmread, list_url))
    except Exception as e:
        raise Exception('error al paralelizar la descarga ' + str(e)) from e
    return dcms

def get_vol_from_dcms(dcms):
    """Stack DICOM slices into one int16 volume with the rescale applied.

    Args:
        dcms: List of slice datasets. Each must expose ``ImagePositionPatient``,
            ``PixelSpacing``, ``RescaleSlope``, ``RescaleIntercept`` and
            ``pixel_array``. The list is sorted IN PLACE by
            ``ImagePositionPatient[2]``.

    Returns:
        ``(vol, spacing)`` where ``vol`` is an ``int16`` array holding one
        rescaled image per slice along axis 0 (ascending z position), and
        ``spacing`` is a float64 array ``[slice distance, PixelSpacing[0],
        PixelSpacing[1]]``.

    Raises:
        Exception: on any failure while building the volume (including an
            empty ``dcms``); the original error is chained as ``__cause__``.
    """
    dcms.sort(key=lambda x: float(x.ImagePositionPatient[2]))
    try:
        if not dcms:
            raise ValueError('no DICOM slices were provided')

        # np.float was removed from NumPy; float64 is the same dtype it aliased.
        spacing = np.array(dcms[0].PixelSpacing).astype(np.float64)
        if len(dcms) > 1:
            thickness = float(abs(dcms[0].ImagePositionPatient[2] - dcms[1].ImagePositionPatient[2]))
        else:
            # A single slice has no neighbour to measure against; fall back to
            # the dataset's SliceThickness attribute (raises if it is absent).
            thickness = float(dcms[0].SliceThickness)
        spacing = np.concatenate([[thickness], spacing])

        int16_range = np.iinfo(np.int16)
        vol = []
        for dcm in dcms:
            # Keep slope/intercept as floats: truncating them to int turns a
            # fractional slope such as 0.5 into 0 and wipes the image.
            intercept = float(dcm.RescaleIntercept)
            slope = float(dcm.RescaleSlope)
            img_2d = dcm.pixel_array.astype(np.int16)
            if slope != 1 or intercept != 0:
                # Rescale in float64, round to nearest and saturate so the
                # final cast to int16 cannot wrap around.
                rescaled = img_2d.astype(np.float64) * slope + intercept
                img_2d = np.clip(np.rint(rescaled), int16_range.min, int16_range.max).astype(np.int16)
            vol.append(img_2d)

        vol = np.array(vol, dtype=np.int16)
    except Exception as e:
        raise Exception('error extraer el volumen del dicom ' + str(e)) from e
    return vol, spacing

In [None]:
df = pd.read_csv("./csvs/final_data.csv")

# Columns that are aggregated with 'max' per final_id and then divided by
# their own column-wise maximum below.
rating_columns = ['calcification', 'internalStructure', 'lobulation', 'malignancy', 'margin',
                  'sphericity', 'spiculation', 'subtlety', 'texture']

# Per-column aggregation used to collapse all rows sharing a final_id into one.
# (Previously only the first line of this dict was commented out, which left
# the cell unparseable and agg_func undefined.)
agg_func = {'calcification': 'max', 'internalStructure': 'max', 'lobulation': 'max', 'malignancy': 'max',
            'margin': 'max', 'path': 'first', 'sphericity': 'max', 'spiculation': 'max', 'subtlety': 'max', 'texture': 'max',
            'xf': 'first', 'xi': 'first', 'yf': 'first', 'yi': 'first', 'zf': 'first', 'zi': 'first'}

# as_index=False keeps final_id as a regular column; the following cells
# select and read it by name.
df = df.groupby("final_id", as_index=False).agg(agg_func)

# Divide every rating column by its maximum over the whole table.
df[rating_columns] = df[rating_columns] / df[rating_columns].max()

df.head()

In [None]:
# Keep only the scan path, the nodule id and the bounding-box columns that the
# patch extraction below reads.
df = df[[
    'path', 'final_id',
    'xf', 'xi',
    'yf', 'yi',
    'zf', 'zi',
]]

In [None]:
# Number of rows (one final_id value per row) currently in the table.
len(df['final_id'])

6860

In [None]:
# Distinct scan folders, in order of first appearance; each is loaded once below.
paths = df['path'].unique()

In [None]:
# Rebind instead of mutating in place: df comes from a column selection, and an
# in-place drop on such a frame can emit SettingWithCopyWarning. Rebinding also
# keeps the cell idempotent on re-run.
df = df.drop_duplicates()

In [None]:
# Make sure the output folder exists before the long extraction loop starts;
# np.savez_compressed does not create missing directories.
os.makedirs('./nodules', exist_ok=True)

for path in tqdm(paths):
    # Load each scan once and reuse the volume for every nodule it contains.
    dcms = get_dicoms_from_folder(path+'/*.dcm')
    vol, spacing = get_vol_from_dcms(dcms)
    nodules_in_scan = df[df.path == path]
    for nodule in nodules_in_scan.itertuples(index=False):
        # Read fields by name so the crop does not depend on column order, and
        # cast to int so float-typed coordinates are still valid slice bounds.
        zi, zf = int(nodule.zi), int(nodule.zf)
        yi, yf = int(nodule.yi), int(nodule.yf)
        xi, xf = int(nodule.xi), int(nodule.xf)
        path_new = './nodules/{}.npz'.format(nodule.final_id)

        # The volume is indexed (z, y, x).
        patch = vol[zi:zf, yi:yf, xi:xf]
        np.savez_compressed(path_new, patch = patch)

HBox(children=(IntProgress(value=0, max=883), HTML(value='')))


