In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import math
from glob import glob
import re

import pydicom
from multiprocessing.pool import Pool

In [83]:
from scipy.ndimage import rotate

In [2]:
from tqdm import tqdm_notebook
from multiprocessing.pool import ThreadPool

In [3]:
# Load the nodule table. Later cells rely on its 'path' and 'final_id' columns and on
# the per-axis lower/upper bounds 'xi'/'xf', 'yi'/'yf', 'zi'/'zf'.
df = pd.read_csv("./csvs/final_data.csv")

In [4]:
# Template for the per-case mask archive (.npz); the '{}' placeholder is filled with the
# case id extracted from each scan path via root_mask_path.format(case).
# NOTE(review): hard-coded absolute drive path - consider a configurable base directory.
root_mask_path = "Y:/LIDC-IDRI_final_masks/LIDC-IDRI-{}.npz"

In [5]:
# Keep only the scan path, the id column and the per-axis bounds that transform() reads.
df = df[['path', 'final_id', 'xf', 'xi', 'yf', 'yi', 'zf', 'zi']]

In [6]:
# Drop exact duplicate rows. The result is re-bound instead of using inplace=True:
# df comes from a column selection, and mutating such a frame in place is the usual
# source of pandas' chained-assignment warnings; re-binding also keeps the cell
# safe to re-run.
df = df.drop_duplicates()

In [7]:
def transform(row):
    """Add the box centre and radius derived from a row's per-axis bounds.

    Reads the lower bounds ``xi``/``yi``/``zi`` and the upper bounds
    ``xf``/``yf``/``zf`` and writes four new entries into ``row``:

    * ``coordX``, ``coordY``, ``coordZ`` - lower bound plus the half extent
      (rounded up) along each axis;
    * ``radio_mm`` - the largest of the three half extents, as a Python int.
      NOTE(review): despite the name, the value is in whatever unit the bound
      columns use - confirm whether that is millimetres or voxels.

    The row is modified in place and also returned, so the function can be
    used with ``DataFrame.apply(..., axis=1)``.
    """
    starts = (row["xi"], row["yi"], row["zi"])
    ends = (row["xf"], row["yf"], row["zf"])

    # Half extent per axis, rounded up so the radius always covers the box.
    radios = np.array([math.ceil((hi - lo) / 2) for lo, hi in zip(starts, ends)])

    row["coordX"] = radios[0] + starts[0]
    row["coordY"] = radios[1] + starts[1]
    row["coordZ"] = radios[2] + starts[2]
    row["radio_mm"] = int(radios.max())
    return row

In [8]:
# Row-wise apply of transform(): adds coordX/coordY/coordZ and radio_mm to every row.
df = df.apply(transform, axis=1)

In [9]:
def get_dicoms_from_folder(folder_name, jobs=30):
    """Read every file matching a glob pattern with ``pydicom.dcmread``, in parallel.

    Parameters
    ----------
    folder_name : str
        Glob pattern selecting the files to read (callers pass ``<dir>/*.dcm``).
    jobs : int, default 30
        Number of worker threads used for reading.

    Returns
    -------
    list
        One object per matched file, as returned by ``pydicom.dcmread``. The
        order is the completion order of the workers, not the file order.
        Empty when the pattern matches nothing.

    Raises
    ------
    Exception
        Wraps any error raised while listing or reading the files; the
        original exception is chained as ``__cause__``.
    """
    try:
        list_url = glob(folder_name)
        # The pool is used as a context manager so its worker threads are shut
        # down on exit; creating a fresh pool per call without closing it leaks
        # `jobs` threads for every scan processed.
        with ThreadPool(jobs) as pool:
            dcms = list(pool.imap_unordered(pydicom.dcmread, list_url))
    except Exception as e:
        raise Exception('error al paralelizar la descarga ' + str(e)) from e
    return dcms

def get_vol_from_dcms(dcms):
    """Stack DICOM slices into a rescaled int16 volume and compute its spacing.

    Parameters
    ----------
    dcms : list
        Slice objects exposing ``ImagePositionPatient``, ``PixelSpacing``,
        ``RescaleSlope``, ``RescaleIntercept`` and ``pixel_array``. At least
        two slices are needed to derive the slice distance. The list is
        sorted **in place** by ``ImagePositionPatient[2]``.

    Returns
    -------
    vol : numpy.ndarray
        int16 array of shape ``(n_slices, rows, cols)`` holding
        ``pixel_array * RescaleSlope + RescaleIntercept`` for every slice,
        ordered by increasing ``ImagePositionPatient[2]``.
    spacing : numpy.ndarray
        float array ``[slice_distance, PixelSpacing[0], PixelSpacing[1]]``,
        where ``slice_distance`` is the distance between the first two sorted
        slices.

    Raises
    ------
    Exception
        Wraps any error raised while building the volume (for example fewer
        than two slices); the original exception is chained as ``__cause__``.
    """
    dcms.sort(key=lambda x: float(x.ImagePositionPatient[2]))
    try:
        vol = []
        # np.float64 instead of the np.float alias, which was removed in NumPy 1.24.
        spacing = np.array(dcms[0].PixelSpacing).astype(np.float64)
        thickness = float(abs(dcms[0].ImagePositionPatient[2] - dcms[1].ImagePositionPatient[2]))
        spacing = np.concatenate([[thickness], spacing])

        for dcm in dcms:
            slope = float(dcm.RescaleSlope)
            intercept = float(dcm.RescaleIntercept)
            # astype() already returns a new array, so the stored pixel data is untouched.
            img_2d = dcm.pixel_array.astype(np.int16)

            if slope.is_integer() and intercept.is_integer():
                # Common case: stay in int16 arithmetic, exactly as before.
                if slope != 1:
                    img_2d *= int(slope)
                img_2d += int(intercept)
            else:
                # Fractional rescale parameters must not be truncated with int()
                # (a slope of 0.5 would otherwise zero the whole slice).
                img_2d = np.rint(img_2d.astype(np.float64) * slope + intercept).astype(np.int16)

            vol.append(img_2d)

        vol = np.array(vol, dtype=np.int16)
    except Exception as e:
        print(str(e))
        raise Exception('error extraer el volumen del dicom ' + str(e)) from e
    return vol, spacing

In [10]:
# 'splited.npy' unpacks into two collections of scan paths: the train and the test split.
# NOTE(review): allow_pickle=True lets the file execute arbitrary code while it is
# being loaded - only load a file from a trusted source.
train, test = np.load('splited.npy', allow_pickle=True)

In [16]:
# Every scan path to process: the train split followed by the test split.
paths = list(train) + list(test)

In [12]:
n = 1  # number of random crops saved per nodule
PATCH_SIZE = 64  # edge length, in voxels, of every saved cube


def _random_crop_start(center, radius, dim, size=PATCH_SIZE):
    """Choose where a ``size``-long crop window starts along one axis.

    The start is drawn uniformly from the positions that keep the span
    ``[center - radius, center + radius]`` inside the window and the window
    inside ``[0, dim)`` (upper bound of the draw exclusive). When no such
    range exists - the span is wider than the window, or it touches a border -
    the window is anchored at the highest admissible start, clamped to 0 so
    that the slice never receives a negative start index (NumPy would read a
    negative start as "count from the end" and return the wrong region).
    """
    lowest = max(0, center + radius - size)
    highest = min(center - radius, dim - size)
    if highest <= lowest:
        return max(0, highest)
    return np.random.randint(lowest, highest)


for path in tqdm_notebook(paths):
    match = re.search('LIDC-IDRI-(.+?)/', path)
    if match is None:
        raise ValueError('cannot extract the LIDC-IDRI case id from path: ' + str(path))
    case = match.group(1)

    # The volume and the mask depend only on the scan, so they are loaded once
    # per path rather than once per crop repetition.
    dcms = get_dicoms_from_folder(path + '/*.dcm')
    vol, spacing = get_vol_from_dcms(dcms)

    mask_path = root_mask_path.format(case)
    # Context manager: the archive is closed even if the 'mask' entry is missing.
    with np.load(mask_path) as file:
        mask = file['mask']

    # Nodule centres and radii of this scan, as integers, ordered (z, y, x, radius)
    # to match the axis order of the volume.
    nodules = df.loc[df["path"] == path, ['coordZ', 'coordY', 'coordX', 'radio_mm']]
    nodules = nodules.round().astype(int)

    subset = 'train' if np.isin(path, train) else 'test'

    for nn in range(n):
        # One randomly placed PATCH_SIZE cube per nodule.
        for i, (cz, cy, cx, radius) in enumerate(nodules.values):
            z_i = _random_crop_start(cz, radius, vol.shape[0])
            y_i = _random_crop_start(cy, radius, vol.shape[1])
            x_i = _random_crop_start(cx, radius, vol.shape[2])

            patch = vol[z_i:z_i + PATCH_SIZE, y_i:y_i + PATCH_SIZE, x_i:x_i + PATCH_SIZE]
            patch_m = mask[z_i:z_i + PATCH_SIZE, y_i:y_i + PATCH_SIZE, x_i:x_i + PATCH_SIZE]

            # Stored as a compressed numpy archive.
            # NOTE(review): a volume shorter than PATCH_SIZE along an axis still
            # yields a smaller cube - pad here if fixed-size patches are required.
            np.savez_compressed(
                './sin_aumentar/{}/{}_{}_{}.npz'.format(subset, nn, case, i),
                vol=patch,
                mask=patch_m.astype(np.int8),
            )

HBox(children=(IntProgress(value=0, max=883), HTML(value='')))




In [None]:
####################################################################################################
####################################################################################################
####################################################################################################
####################################################################################################

In [108]:
# Scans used for the extra random crops: up to 100 from train and up to 20 from test.
# replace=False so that no scan is drawn twice - a repeated path would be processed
# again and overwrite its own output file, since the file name depends only on the case.
paths_sample = [
    *np.random.choice(train, size=min(100, len(train)), replace=False),
    *np.random.choice(test, size=min(20, len(test)), replace=False),
]

In [109]:
# Candidate crop origins, one per row; the three entries of a row are the start
# indices along volume axes 0, 1 and 2 of a 64-voxel cube.
values = np.array([[0, 0, 0], [0, 64, 64], [0, 0, 64], [0, 0, 128]])

In [110]:
# Rotation angles, in degrees, from which one is picked at random for each extra crop.
angles = [0, 90, 180, 270]

In [111]:
n = 1  # number of random crops saved per sampled scan
for path in tqdm_notebook(paths_sample):
    match = re.search('LIDC-IDRI-(.+?)/', path)
    if match is None:
        raise ValueError('cannot extract the LIDC-IDRI case id from path: ' + str(path))
    case = match.group(1)

    # The volume and the mask depend only on the scan, so they are loaded once
    # per path rather than once per crop repetition.
    dcms = get_dicoms_from_folder(path + '/*.dcm')
    vol, spacing = get_vol_from_dcms(dcms)

    mask_path = root_mask_path.format(case)
    # Context manager: the archive is closed even if the 'mask' entry is missing.
    with np.load(mask_path) as file:
        mask = file['mask']

    subset = 'train' if np.isin(path, train) else 'test'

    for nn in range(n):
        # Pick one of the predefined crop origins at random (kept as a 1-row array
        # so the file-name index i below stays 0, as before).
        c = values[np.random.randint(0, len(values), 1)]

        # Cut the 64x64x64 cube at each chosen origin and store it.
        for i, cc in enumerate(c):
            patch = vol[cc[0]:cc[0] + 64, cc[1]:cc[1] + 64, cc[2]:cc[2] + 64]
            patch_m = mask[cc[0]:cc[0] + 64, cc[1]:cc[1] + 64, cc[2]:cc[2] + 64]

            # All angles are multiples of 90 degrees, so the rotation is done with
            # np.rot90: it only permutes voxels (exact, no spline interpolation of
            # intensities or mask labels) and is much cheaper than a generic
            # interpolating rotation. Volume and mask get the same number of
            # quarter turns in the (1, 2) plane, so they stay aligned.
            angle = np.random.choice(angles)
            quarter_turns = int(angle) // 90
            patch = np.rot90(patch, quarter_turns, axes=(1, 2))
            patch_m = np.rot90(patch_m, quarter_turns, axes=(1, 2))

            # Stored as a compressed numpy archive.
            # NOTE(review): nothing here checks that the crop is free of nodules -
            # confirm whether the 'n_' files are meant to be nodule-free samples.
            np.savez_compressed(
                './sin_aumentar/{}/n_{}_{}_{}.npz'.format(subset, nn, case, i),
                vol=patch,
                mask=patch_m.astype(np.int8),
            )

HBox(children=(IntProgress(value=0, max=120), HTML(value='')))