In [1]:
import os
from tqdm import tqdm

import numpy as np
import pandas as pd
import dicomsdl
import cv2
import matplotlib.pyplot as plt

In [2]:
def get_file_paths(data_path):
    """
    Recursively list every file found under ``data_path``.

    The tree is traversed with ``os.walk()``; each returned entry is the
    containing directory joined with the file name, in traversal order.
    """
    return [
        os.path.join(directory, file_name)
        for directory, _, file_names in os.walk(data_path)
        for file_name in file_names
    ]


def get_id(path):
    """Return the integer id encoded in the file name of ``path``.

    The last ``len(".dcm")`` characters of the base name are dropped and the
    remainder is parsed with ``int``.
    """
    file_name = os.path.basename(path)
    suffix_length = len(".dcm")
    return int(file_name[:-suffix_length])


In [3]:
data_path = "/media/ckelid/8CAA3D31AA3D1962/train_images/"


# Left-join train.csv with a (file_path, image_id) table built from every file
# found under data_path, so each metadata row carries the path of its image.
metadata = pd.merge(
    pd.read_csv("train.csv"),
    pd.DataFrame(
        [(path, get_id(path)) for path in get_file_paths(data_path)],
        columns=["file_path", "image_id"],
    ),
    on="image_id",
    how="left",
)
# Number of metadata rows left without a matching file; the run stops unless it is zero.
print(metadata["file_path"].isna().sum())
assert not metadata["file_path"].isna().sum()

0


In [4]:
# Keep every row with cancer == 1 and draw five times as many rows with cancer == 0.
class_1_n_samples = len(metadata[metadata.cancer == 1])
class_0_n_samples = 5 * class_1_n_samples

n_samples = class_0_n_samples + class_1_n_samples
# Rows with cancer == 1 come first, followed by the sampled cancer == 0 rows;
# both draws use the same fixed seed.
metadata = pd.concat(
    [
        metadata.loc[metadata.cancer == label].sample(n=count, random_state=42)
        for label, count in ((1, class_1_n_samples), (0, class_0_n_samples))
    ]
)


In [5]:
def borders_std(image, border_width):
    """Return the standard deviation of the left and right vertical border strips.

    Parameters
    ----------
    image : array indexed as ``image[rows, columns]``.
    border_width : number of columns taken from each side.

    Returns
    -------
    tuple ``(left_std, right_std)``.
    """
    left_border = image[:, :border_width]
    # Use an explicit start index for the right strip: the former
    # ``-border_width:-1`` slice silently dropped the last column (and was
    # empty for a width of 1), while ``-0:`` would select the whole image.
    right_start = max(image.shape[1] - border_width, 0)
    right_border = image[:, right_start:]
    return left_border.std(), right_border.std()


def get_side(left_border_std, right_border_std):
    """Return "L" when the left border deviation is strictly larger, otherwise "R"."""
    return "L" if left_border_std > right_border_std else "R"


def process_background(image, image_border_sum, background_border_sum):
    """Invert ``image`` (``255 - image``) when the image-side border sum is
    smaller than the background-side border sum; otherwise return it as is."""
    needs_inversion = image_border_sum < background_border_sum
    return 255 - image if needs_inversion else image



def get_image(dicom_path):
    """Open the DICOM file at ``dicom_path`` with dicomsdl and return its pixel data."""
    dataset = dicomsdl.open(dicom_path)
    return dataset.pixelData()


def resize_image(image, target_size):
    """Resize ``image`` to ``target_size`` using bicubic interpolation.

    ``target_size`` is forwarded unchanged as the size argument of
    ``cv2.resize``.
    """
    resized = cv2.resize(image, target_size, interpolation=cv2.INTER_CUBIC)
    return resized


def normalize_to_0_255(image):
    """Min-max normalize ``image`` into the 0-255 range with an 8-bit output type."""
    return cv2.normalize(
        image,
        None,
        alpha=0,
        beta=255,
        norm_type=cv2.NORM_MINMAX,
        dtype=cv2.CV_8UC1,
    )


def remove_empty_background(image, threshold=1):
    """Trim trailing columns whose standard deviation does not exceed ``threshold``.

    Columns are scanned from right to left. The first column whose standard
    deviation is above ``threshold`` becomes the last column of the result.
    If no column qualifies, the image is returned unchanged.

    Parameters
    ----------
    image : array indexed as ``image[rows, columns]``.
    threshold : a column counts as non-empty when its standard deviation is
        strictly greater than this value.
    """
    for index in reversed(range(image.shape[1])):
        if image[:, index].std() > threshold:
            # Keep the matching column itself: slicing with ``:index`` threw
            # away one column of content and produced a zero-width image when
            # the only non-empty column was column 0.
            return image[:, : index + 1]
    return image
    


def save_image(image, image_id, label):
    """Write ``image`` as ``<image_id>.jpg`` into the folder matching ``label``.

    Label 1 goes to ``dataset/label_1`` and label 0 to ``dataset/label_0``;
    any other label is skipped without writing anything.

    Raises
    ------
    OSError
        If ``cv2.imwrite`` reports that the file could not be written.
    """
    if label == 1:
        output_dir = "dataset/label_1"
    elif label == 0:
        output_dir = "dataset/label_0"
    else:
        # Unknown labels were silently ignored before; keep that behaviour.
        return

    # cv2.imwrite does not create missing directories, so make sure the
    # target folder exists before writing.
    os.makedirs(output_dir, exist_ok=True)
    output_path = f"{output_dir}/{image_id}.jpg"
    # imwrite signals failure through its boolean return value; surface it
    # instead of silently losing the image.
    if not cv2.imwrite(output_path, image):
        raise OSError(f"cv2.imwrite failed to write {output_path}")


def process_image(image):
    """Normalize, orient, optionally invert and crop a raw image.

    Steps:
      1. Rescale intensities with ``normalize_to_0_255``.
      2. Compare the standard deviation of the left and right border strips
         (10% of the width each) and mirror the image horizontally when
         ``get_side`` reports "R".
         NOTE(review): presumably this puts the imaged tissue on the left -
         confirm against the data.
      3. Compare the pixel sums of the left and right border strips and let
         ``process_background`` invert the image when the left sum is smaller.
      4. Crop trailing columns with ``remove_empty_background``.

    Returns the processed image.
    """
    image = normalize_to_0_255(image)
    # Use at least one column so neither border strip is empty on narrow images.
    border_width = max(1, int(image.shape[1] * 0.1))
    left_border_std, right_border_std = borders_std(image, border_width)
    if get_side(left_border_std, right_border_std) == "R":
        image = np.fliplr(image)
    image_border_sum = image[:, :border_width].sum()
    # ``-border_width:`` includes the last column; the previous
    # ``-border_width:-1`` slice left it out of the background sum.
    background_border_sum = image[:, -border_width:].sum()
    image = process_background(image, image_border_sum, background_border_sum)
    return remove_empty_background(image)


In [6]:
# Target image size, defined by the ratio of height to width of 1000 images
# after preprocessing.
target_image_size = (512, 2048)
# Load, preprocess, resize and save every selected image, with a progress bar
# sized to the number of sampled rows.
for image_id, file_path, label in tqdm(
    zip(metadata.image_id, metadata.file_path, metadata.cancer),
    total=n_samples,
):
    image = process_image(get_image(file_path))
    image = resize_image(image, target_image_size)
    save_image(image, image_id, label)


100%|██████████| 6948/6948 [47:56<00:00,  2.42it/s]  


In [7]:
# Completion marker printed once this final cell is reached.
print("Done")

Done
