In [10]:
import os
import re
import glob
import shutil
import numpy as np
import pandas as pd
from tqdm import tqdm

import pydicom
from pydicom.pixel_data_handlers import apply_windowing #
import cv2


1) convert without resizing
2) cut off empty space
3) windowing
4) resize to 1024x768 - HxW

id_patient 59479 delete since it is of black shots only

In [11]:
path_dicom = '../train_images/'
path_png = '../png_roi/'
images_dicom = glob.glob(path_dicom+ '*/*.dcm')

### Step 1: converting dicom to grayscale png and apply windowing

In [12]:
def convert_dcm_to_png_windowing(path_image: str, 
                                 dir_save_to: str) -> None:
    
    id_patient, id_img = re.findall('(\d+)', path_image)[-2:]
    path_save = os.path.join(dir_save_to, id_patient) 
    
    if os.path.isfile(os.path.join(path_save, id_img + '.png')):
        pass
    else:
            
        dicom = pydicom.dcmread(path_image)
        
        img = dicom.pixel_array
        img = apply_windowing(img, dicom) #windowing itself
        
        img = (img - img.min()) / (img.max() - img.min()) # to [0;1] scale
        
        if dicom.PhotometricInterpretation == 'MONOCHROME1':
            img = 1 - img
        
        img = (img * 255).astype(np.uint8)
        
        os.makedirs(path_save, exist_ok=True)
        cv2.imwrite(os.path.join(path_save, id_img + '.png') , img)  

In [13]:
# images_dicom[80]/[81]/[82] corrupted? start from [-26100:]

for image in tqdm(images_dicom): 
    convert_dcm_to_png_windowing(image, path_png)

  img = (img - img.min()) / (img.max() - img.min()) # to [0;1] scale
  img = (img * 255).astype(np.uint8)
 78%|█████████████████████████▋       | 42684/54706 [3:37:40<1:01:18,  3.27it/s]


KeyboardInterrupt: 

### Cutting off empty regions

In [None]:
def crop_img(image: str, 
             dir_save_to: str) -> None:
    
    X = cv2.imread(image)
    
    # Some images have narrow exterior "frames" that complicate selection of the main data. Cutting off the frame
    X = X[5:-5, 5:-5]
    
    # regions of non-empty pixels
    output= cv2.connectedComponentsWithStats((X > 20).astype(np.uint8)[:, :, 0], 8, cv2.CV_32S) # connectivity 4 insted of 8 may give us more regions which we don't want

    # output[0] is a number of labels
    # output[1] is matrix of labels
    # output[2] is stat matrix
    # output[4] is centroid matrix
    
    # stats.shape == (N, 5), where N is the number of regions, 5 dimensions correspond to:
    # left, top, width, height, area_size
    stats = output[2]
    
    # finding max area which always corresponds to the breast data. 
    idx = stats[1:, 4].argmax() + 1 # starting with 1 because largest region will be the whole picture
    x1, y1, w, h = stats[idx][:4]
    x2 = x1 + w
    y2 = y1 + h
    
    # cutting out the breast data
    X_roi = X[y1: y2, x1: x2]
    
    patient_id, id_img = re.findall('(\d+)', image)[-2:]
    cv2.imwrite(f'{dir_save_to}/{patient_id}/{id_img}.png', X_roi[:, :, 0])


In [None]:
images_png = glob.glob(path_png+'*/*.png')

for image in tqdm(images_png):
    crop_img(image, path_png)

### Resize

In [None]:
def resize_image(image: str, 
                 dir_save_to: str, 
                 size: tuple[int,int]=(768, 1024)) -> None:

    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, dsize=size)
    
    patient_id, id_img = re.findall('(\d+)', image)[-2:]
    cv2.imwrite(f'{path_png}/{patient_id}/{id_img}.png', img)

In [None]:
for image in tqdm(images_png):
    resize_image(image, path_png)

### Train test split

Later, we may use this DataFrame for getting auxiliary predictions, but there are some columns that will not be available in test DataFrame. I don't know if we will use tabular data at all, nor do I know if we will make use of train only columns yet. It would be best to try all 3 methods: img only; auxiliary predictions on tabular data without train only columns; auxiliary predictions with train only columns, - but hardware and time are limiting factors for me now. Anyway, I will save list of train only columns. 

In [None]:
df = pd.read_csv('train.csv')
train_only_cols = ['density', 'biopsy', 'invasive', 'BIRADS', 'difficult_negative_case']

Not like we need this df for now, but it won't hurt to take a look at data anyway.

In [None]:
df

For now, we only need image and patient ids and respective target values.

In [None]:
y = df['cancer']
X = df.drop(columns=['cancer'])

This split will be applied on images only and not on patients, so images from one patient folder may be assigned to different splits. I don't think that there is a leakage: sure thing is that if patient has cancer on one breast, chances are higher for them to have it on another breast too, but we won't be giving our ANN information on what patient is particular image assigned to.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, stratify=y, random_state=42)

In [None]:
def split_images(X: pd.DataFrame, 
                 y: pd.Series, 
                 dir_source: str, 
                 dir_destination: str) -> None:
    
    '''This function is used to sort images to folders with name of respective class, so later we can use ImageFolder to create train and test datasets.
    Inputs:
        X: train or test dataset.
        y: train or test series of same length as X, containing target variables, i.e. "cancer" column of original DataFrame.
        dir_source: directory containing all folders with patient id.
        dir_destination: directory to move train or test data to, e.g. train_images_folder.
    Output: None
        Function will create "dir_destination/class/patient_id" path and move respective pictures there'''
    

    assert len(X) == len(y), "X and y must have same amount of elements. Check shapes of X and y."

    # Moving images from DataFrame to their class folders.
    for i in range(len(X)):
        
        id_patient = str(X.iloc[i]['patient_id'])
        id_image = str(X.iloc[i]['image_id']) + '.png'
        cancer = str(y.iloc[i])
    
        path_source = os.path.join(dir_source, id_patient, id_image)
        path_destination = os.path.join(dir_destination, cancer, id_patient)
        
        if os.path.isfile(path_source):
            os.makedirs(path_destination, exist_ok=True)
            shutil.move(path_source, os.path.join(path_destination, id_image))
        else:
            continue

    #Remove empty folders
    for folder in os.listdir(dir_source): 
        try:
            os.removedirs(os.path.join(dir_source, folder))
        except:
            pass

In [None]:
dir_source = '../png_roi' 
path_train_png = '../png_roi/train_images' #train_images_png
path_test_png = '../png_roi/test_images' #test_images_png

split_images(X_train, y_train, dir_source=dir_source, dir_destination=path_train_png)
split_images(X_test, y_test, dir_source=dir_source, dir_destination=path_test_png)