In [None]:
# Colab-only: mount Google Drive so files under /content/drive (e.g. the
# annotation CSV read further down) become accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

import os
from glob import glob
from tqdm import tqdm_notebook

from scipy.ndimage import rotate, zoom

from mpl_toolkits.axes_grid1 import ImageGrid
from matplotlib import pyplot as plt

In [None]:
def images_grid(images, grid_size):
    """Display a collection of images on a regular grid.

    Parameters
    ----------
    images : iterable
        Images in any form accepted by ``Axes.imshow``. They are paired with
        the grid cells in order; pairing stops at whichever runs out first.
    grid_size : tuple
        Passed to ``ImageGrid`` as ``nrows_ncols``, i.e. ``(rows, columns)``.
    """
    figure = plt.figure(figsize=(10, 10))
    cells = ImageGrid(figure, 111, nrows_ncols=grid_size, axes_pad=0.1)

    for cell, image in zip(cells, images):
        cell.imshow(image)

    plt.show()

In [None]:
# Target shape (voxels per axis) every patch is resampled to before saving.
vol_size = np.full(3, 24)

In [None]:
# Load the LIDC nodule annotation table (CSV) from the mounted Drive.
df =pd.read_csv("/content/drive/MyDrive/LIDC/final_data.csv")

In [None]:
# Preview the raw table; several rows can share one final_id (they are merged
# by the groupby on final_id further down).
df

Unnamed: 0,calcification,internalStructure,lobulation,malignancy,margin,path,sphericity,spiculation,subtlety,texture,final_id,xf,xi,yf,yi,zf,zi
0,6,1,1,3,5,y:LIDC-IDRI/LIDC-IDRI-0039/01-01-2000-49300/30...,4,1,4,5,566.0,157,145,308,296,122,117
1,3,1,1,1,5,y:LIDC-IDRI/LIDC-IDRI-0039/01-01-2000-49300/30...,5,1,4,5,566.0,157,145,308,296,122,117
2,3,1,1,1,5,y:LIDC-IDRI/LIDC-IDRI-0039/01-01-2000-49300/30...,4,1,4,5,566.0,157,145,308,296,122,117
3,3,1,1,1,5,y:LIDC-IDRI/LIDC-IDRI-0039/01-01-2000-49300/30...,5,1,5,5,566.0,157,145,308,296,122,117
4,5,1,1,1,5,y:LIDC-IDRI/LIDC-IDRI-0039/01-01-2000-49300/30...,5,1,4,5,567.0,150,141,272,263,126,121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6855,6,1,4,3,4,y:LIDC-IDRI/LIDC-IDRI-0204/01-01-2000-CT THORA...,2,4,4,5,2040.0,375,354,376,354,35,30
6856,6,1,3,5,3,y:LIDC-IDRI/LIDC-IDRI-0204/01-01-2000-CT THORA...,2,5,5,5,2040.0,375,354,376,354,35,30
6857,6,1,2,4,4,y:LIDC-IDRI/LIDC-IDRI-0204/01-01-2000-CT THORA...,3,1,5,5,2040.0,375,354,376,354,35,30
6858,6,1,2,5,4,y:LIDC-IDRI/LIDC-IDRI-0204/01-01-2000-CT THORA...,2,2,4,5,2038.0,381,342,380,347,36,22


In [None]:
# Per-column aggregation used when collapsing rows that share a final_id:
# rating columns are averaged, while the path and the bounding-box
# coordinates are taken from the first row of each group.
agg_func = dict(
    calcification='mean',
    internalStructure='mean',
    lobulation='mean',
    malignancy='mean',
    margin='mean',
    path='first',
    sphericity='mean',
    spiculation='mean',
    subtlety='mean',
    texture='mean',
    xf='first',
    xi='first',
    yf='first',
    yi='first',
    zf='first',
    zi='first',
)

In [None]:
# Collapse all rows sharing a final_id into one row, using agg_func per column.
# The result is indexed by final_id.
df = df.groupby(by="final_id").aggregate(agg_func)

In [None]:
# Divide each rating column by its own maximum, so the largest value in every
# column becomes 1.
rating_cols = [
    'calcification', 'internalStructure', 'lobulation', 'malignancy', 'margin',
    'sphericity', 'spiculation', 'subtlety', 'texture',
]
df[rating_cols] = df[rating_cols] / df[rating_cols].max()

In [None]:
# Keep only the file path and the rating columns; the bounding-box columns are
# dropped. Column order matters: 'malignancy' ends up at position 4.
df = df.loc[:, [
    'path',
    'calcification',
    'internalStructure',
    'lobulation',
    'malignancy',
    'margin',
    'sphericity',
    'spiculation',
    'subtlety',
    'texture',
]]

In [None]:
# Number of rows left after aggregation (one per final_id).
df.shape[0]

2686

In [None]:
# Hold out 20% of the nodules, stratified on the normalised malignancy score.
# The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.index.tolist(),
    df,
    test_size=0.2,
    random_state=100,
    stratify=df.malignancy,
)

# The id lists (final_id values) decide which subset each nodule goes to.
ids_train, ids_test = X_train, X_test

In [None]:
# Binary labels per subset: 1 when the normalised mean malignancy exceeds 0.5,
# otherwise 0.
# The builtin `int` is used because the `np.int` alias was deprecated in
# NumPy 1.20 and removed in 1.24 (it was only ever an alias for `int`).
tags_train = (df.loc[ids_train].malignancy.values > 0.5).astype(int)
tags_test = (df.loc[ids_test].malignancy.values > 0.5).astype(int)

In [None]:
# Positives (label 1) in the training split.
pos_train = np.sum(tags_train)
pos_train

1246

In [None]:
# Positives (label 1) in the test split.
pos_test = np.sum(tags_test)
pos_test

311

In [None]:
# Negatives (label 0) in the training split: everything that is not positive.
neg_train = tags_train.size - pos_train
neg_train

902

In [None]:
# Negatives (label 0) in the test split: everything that is not positive.
neg_test = tags_test.size - pos_test
neg_test

227

In [None]:
# Sanity check: the four class counts should add up to the number of nodules.
sum([pos_train, pos_test, neg_train, neg_test])

2686

In [None]:
# NOTE(review): exact duplicate of the sanity check in the cell above; the
# total should equal the number of rows in df.
pos_train + pos_test + neg_train + neg_test

2686

In [None]:
# Extra patches needed for the training split: the positive/negative gap times
# 36, the number of patches the augmentation loop writes per nodule
# (1 original + 8 rotations + 3 flips, each flip with 8 further rotations).
needed_train = 36 * (pos_train - neg_train)
needed_train

12384

In [None]:
# Extra patches needed for the test split: the positive/negative gap times 36,
# the number of patches the augmentation loop writes per nodule.
needed_test = 36 * (pos_test - neg_test)
needed_test

3024

In [None]:
# Extra patches to collect across both splits.
total_needed = needed_test + needed_train
total_needed

15408

In [None]:
# Number of training patches after preprocessing: 36 per training nodule plus
# the extra patches added to close the class gap.
36 * (pos_train + neg_train) + needed_train

89712

In [None]:
# Paths of the candidate files under ./binary/, largest file first
# (entries of equal size keep their directory-listing order).
entries = sorted(os.scandir('./binary/'), key=lambda entry: entry.stat().st_size, reverse=True)
files = [entry.path for entry in entries]

FileNotFoundError: ignored

In [None]:
# Output template for the extra candidate patches; the loop below fills it
# with (subset, 0, 0, running count).
save_path = './nodules_augmented/{}/c_{}_{}_{}'

In [None]:
# Collect `total_needed` extra patches from `files`: the first `needed_train`
# go to the train subset, the remainder to test. Each patch is resampled to
# `vol_size` and stored as a compressed .npz.

# np.savez_compressed does not create missing folders, so make sure both
# subset directories exist before writing.
for subset_name in ("train", "test"):
    os.makedirs(os.path.dirname(save_path.format(subset_name, 0, 0, 0)), exist_ok=True)

count = 0

for ii, file_path in enumerate(files):
    if count >= total_needed:
        break

    # The context manager closes the archive even if reading a key fails.
    with np.load(file_path) as file:
        patch = file['patch']
        tag = file['tag'][0]

    # Files tagged 1 are skipped (presumably positives; only patches saved
    # with label 0 are wanted here -- confirm against how ./binary/ was built).
    if tag == 1:
        continue

    subset = "train" if count < needed_train else "test"

    # Nearest-neighbour resample (order=0) to the common volume size.
    current_shape = patch.shape
    patch = zoom(patch, vol_size/current_shape, order=0)

    path = save_path.format(subset, 0, 0, count)
    np.savez_compressed(path, patch=patch)

    count += 1



In [None]:
# Index of the last file visited by the loop above / number of candidate files.
print("{}/{}".format(ii, len(files)))

15979/36989


In [None]:
# Rebinds save_path for the augmentation loop below, which fills it with
# (subset, "o" or "a", nodule index, binary label, patch counter).
# NOTE(review): this overwrites the template used by the earlier candidate
# loop, so re-running that loop after this cell would use the wrong template.
save_path = './nodules_augmented/{}/{}_{}_{}_{}'

In [None]:
# Rotation angles used for augmentation: 10, 20, ..., 80.
angles = 10 * np.arange(1, 9)

In [None]:
# Inspect the rotation angles passed to scipy.ndimage.rotate (degrees).
angles

array([10, 20, 30, 40, 50, 60, 70, 80])

In [None]:
# Options forwarded to scipy.ndimage.rotate in the augmentation loop:
#   reshape=False keeps the rotated patch at the input shape;
#   mode="mirror" fills points outside the input by reflection.
reshape, mode = False, "mirror"

In [None]:
# Augment every nodule. 36 patches are written per nodule:
#   counter 0      -> the resampled original (kind "o")
#   counters 1-8   -> rotations of the original (kind "a")
#   then, for each of the three flips: the flipped patch followed by its
#   8 rotations (kind "a").
#
# Every rotation starts from its un-rotated base patch. Chaining rotations
# (rotating an already rotated patch) would accumulate the angles
# (10, 30, 60, ...) and compound the interpolation error.

# Set membership is O(1); testing against the id list would scan it per row.
train_ids = set(ids_train)

# np.savez_compressed does not create missing folders.
for subset_name in ("train", "test"):
    out_dir = os.path.dirname(save_path.format(subset_name, "", "", "", ""))
    os.makedirs(out_dir, exist_ok=True)

for index, row in tqdm_notebook(df.iterrows(), total=len(df)):
    # Read the score by column name rather than by position, so the lookup
    # does not depend on the column order of df.
    score = row["malignancy"]
    label = int(score > 0.5)

    # The context manager closes the archive even if the key is missing.
    with np.load('./nodules/{}.npz'.format(index)) as file:
        original_patch = file['patch']

    # Nearest-neighbour resample (order=0) to the common volume size.
    current_shape = original_patch.shape
    original_patch = zoom(original_patch, vol_size/current_shape, order=0)

    subset = "train" if index in train_ids else "test"

    # (kind, base patch) pairs: the original first, then one flip per axis.
    bases = [("o", original_patch)]
    bases += [("a", np.flip(original_patch, axis=axis)) for axis in [0, 1, 2]]

    i = 0
    for kind, base_patch in bases:
        path = save_path.format(subset, kind, index, label, i)
        np.savez_compressed(path, patch=base_patch)
        i += 1

        for angle in angles:
            # Always rotate the base patch, never a previously rotated one.
            patch = rotate(base_patch, angle=angle, axes=(1, 2), reshape=reshape, mode=mode)
            path = save_path.format(subset, "a", index, label, i)
            np.savez_compressed(path, patch=patch)
            i += 1

HBox(children=(IntProgress(value=0, max=2686), HTML(value='')))




