In [3]:
import os
import re
import glob
import pydicom
from pydicom.pixel_data_handlers import apply_windowing #
import cv2
import numpy as np
from tqdm import tqdm

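Preprocessing pipeline:

1) convert DICOM to PNG without resizing
2) cut off empty space
3) windowing (applied during the conversion in step 1, before the PNG is written)
4) resize to 1024x768 (H x W)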


In [4]:
# Root folders. The glob below expects DICOM files one directory level deep
# (<path_dicom>/<sub-directory>/<file>.dcm); PNG output goes under path_png.
path_dicom = '../train_images/'
path_png = '../png_roi/'

# glob returns matches in arbitrary, filesystem-dependent order. Sort them so
# that slices such as images_dicom[:10] select the same files on every run.
images_dicom = sorted(glob.glob(path_dicom + '*/*.dcm'))

### Step 1: Convert DICOM to grayscale PNG and apply windowing

In [6]:
def convert_dcm_to_png_windowing(path_image: str, dir_save_to: str) -> None:
    """Convert one DICOM file to an 8-bit grayscale PNG with windowing applied.

    The pixel data is windowed with ``apply_windowing``, min-max scaled to
    [0, 1], inverted for MONOCHROME1 images, and stored as uint8 in
    ``<dir_save_to>/<patient id>/<image id>.png``. The two ids are the last
    two runs of digits found in ``path_image``.

    Args:
        path_image: Path to the source ``.dcm`` file.
        dir_save_to: Root output directory; the per-patient sub-directory is
            created if it does not exist.

    Raises:
        ValueError: If ``path_image`` contains fewer than two runs of digits.
        OSError: If the PNG cannot be written.
    """
    # Raw string: '\d' in a normal string literal is an invalid escape sequence.
    id_patient, id_img = re.findall(r'(\d+)', path_image)[-2:]
    path_save = os.path.join(dir_save_to, id_patient)
    os.makedirs(path_save, exist_ok=True)

    dicom = pydicom.dcmread(path_image)
    img = apply_windowing(dicom.pixel_array, dicom)  # windowing itself

    # Min-max scale to [0, 1]. A constant image has zero range; dividing by it
    # would produce NaNs and an undefined uint8 cast, so divide by 1 instead
    # (the result is then all zeros).
    lo, hi = img.min(), img.max()
    span = hi - lo
    img = (img - lo) / (span if span > 0 else 1)

    # MONOCHROME1 means low stored values are displayed bright; flip so that
    # every output PNG uses the same polarity.
    if dicom.PhotometricInterpretation == 'MONOCHROME1':
        img = 1 - img

    img = (img * 255).astype(np.uint8)

    path_out = os.path.join(path_save, id_img + '.png')
    # cv2.imwrite reports failure through its return value, not an exception.
    if not cv2.imwrite(path_out, img):
        raise OSError(f'Could not write image: {path_out}')

In [7]:
# Convert only the first ten DICOM files; drop the slice to process them all.
for dicom_path in tqdm(images_dicom[:10]):
    convert_dcm_to_png_windowing(dicom_path, path_png)

100%|███████████████████████████████████████████| 10/10 [00:07<00:00,  1.35it/s]


### Cutting off empty regions

In [7]:
def crop_img(image: str, dir_save_to: str) -> None:
    """Crop a PNG down to the bounding box of its largest bright region.

    A 5-pixel border is trimmed first, pixels brighter than 20 are grouped
    into 8-connected components, and the image is cropped to the bounding box
    of the component with the largest area. If no pixel exceeds the threshold,
    the border-trimmed image is kept uncropped. The result is written as a
    single-channel PNG to ``<dir_save_to>/<patient id>/<image id>.png``, where
    the ids are the last two runs of digits in ``image``.

    Note: when ``dir_save_to`` is the tree ``image`` was read from, the source
    file is overwritten, and every further call trims another 5-pixel border.

    Args:
        image: Path to the source PNG.
        dir_save_to: Root output directory; the per-patient sub-directory is
            created if it does not exist.

    Raises:
        OSError: If the image cannot be read or the result cannot be written.
        ValueError: If ``image`` contains fewer than two runs of digits.
    """
    X = cv2.imread(image)
    if X is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f'Could not read image: {image}')

    # Some images have narrow exterior "frames" that complicate selection of
    # the main data, so cut the frame off. Only the first channel is used from
    # here on; copy() makes it a contiguous single-channel array.
    X = X[5:-5, 5:-5, 0].copy()

    # Regions of non-empty pixels. Connectivity 4 instead of 8 may give us
    # more regions, which we don't want.
    mask = (X > 20).astype(np.uint8)
    output = cv2.connectedComponentsWithStats(mask, 8, cv2.CV_32S)

    # output[0] is the number of labels (label 0 is the background)
    # output[1] is the matrix of labels
    # output[2] is the stat matrix
    # output[3] is the centroid matrix
    n_labels = output[0]

    # stats.shape == (N, 5), where N is the number of labels and the 5 columns
    # are: left, top, width, height, area_size
    stats = output[2]

    if n_labels > 1:
        # Largest foreground component; row 0 is skipped because it describes
        # the background, hence the +1 to get back to a label index.
        idx = stats[1:, cv2.CC_STAT_AREA].argmax() + 1
        x1, y1, w, h = stats[idx][:4]
        x2 = x1 + w
        y2 = y1 + h
        X_roi = X[y1: y2, x1: x2]
    else:
        # Nothing above the threshold: argmax over zero components would
        # raise, so keep the border-trimmed image as is.
        X_roi = X

    patient_id, id_img = re.findall(r'(\d+)', image)[-2:]
    path_save = os.path.join(dir_save_to, patient_id)
    # Without this, writing to a fresh output root fails silently.
    os.makedirs(path_save, exist_ok=True)

    path_out = os.path.join(path_save, id_img + '.png')
    if not cv2.imwrite(path_out, X_roi):
        raise OSError(f'Could not write image: {path_out}')


In [18]:
# Collect every PNG under path_png and crop each one; since the output root is
# also path_png, the files are overwritten in place.
images_png = glob.glob(path_png + '*/*.png')

for png_path in tqdm(images_png):
    crop_img(png_path, path_png)

100%|███████████████████████████████████████████| 10/10 [00:01<00:00,  8.09it/s]


### Resize

In [19]:
def resize_image(image: str, dir_save_to: str, size: tuple[int, int] = (768, 1024)) -> None:
    """Resize a grayscale PNG and save it under ``dir_save_to``.

    The output path is ``<dir_save_to>/<patient id>/<image id>.png``, where
    the ids are the last two runs of digits in ``image``. When ``dir_save_to``
    is the tree ``image`` was read from, the source file is overwritten.

    Args:
        image: Path to the source PNG.
        dir_save_to: Root output directory; the per-patient sub-directory is
            created if it does not exist.
        size: Target size handed to ``cv2.resize`` as ``dsize``, i.e.
            ``(width, height)``. The default ``(768, 1024)`` therefore yields
            images 1024 pixels high and 768 pixels wide.

    Raises:
        OSError: If the image cannot be read or the result cannot be written.
        ValueError: If ``image`` contains fewer than two runs of digits.
    """
    img = cv2.imread(image, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f'Could not read image: {image}')

    img = cv2.resize(img, dsize=size)

    patient_id, id_img = re.findall(r'(\d+)', image)[-2:]
    # Honour the dir_save_to argument rather than the notebook-level path_png.
    path_save = os.path.join(dir_save_to, patient_id)
    os.makedirs(path_save, exist_ok=True)

    path_out = os.path.join(path_save, id_img + '.png')
    if not cv2.imwrite(path_out, img):
        raise OSError(f'Could not write image: {path_out}')

In [20]:
# Resize every PNG collected in images_png, overwriting it under path_png.
for png_path in tqdm(images_png):
    resize_image(png_path, path_png)

100%|███████████████████████████████████████████| 10/10 [00:00<00:00, 33.30it/s]
