In [None]:
# Copy the prebuilt nvjpeg2k shared object next to the notebook; presumably this
# is what lets the later `import nvjpeg2k` resolve from the working directory.
!cp /kaggle/input/nvjpeg2k/nvjpeg2k.so ./
# Install the DICOM decoding dependencies from wheel files under /kaggle/input
# (local paths, so no package-index lookup is involved).
!pip install --disable-pip-version-check /kaggle/input/rsna-2022-whl/pylibjpeg-1.4.0-py3-none-any.whl
!pip install --disable-pip-version-check /kaggle/input/rsna-2022-whl/python_gdcm-3.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install /kaggle/input/nvidia-dali-wheel/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl

In [None]:
import os

import cv2
import dicomsdl
import numpy as np
import nvjpeg2k
import pandas as pd
import pydicom
from tqdm.auto import tqdm
from joblib import Parallel, delayed

In [None]:
# Single nvjpeg2k decoder instance, created once here and reused by
# `process_j2k` for every JPEG 2000 file.
j2k_decoder = nvjpeg2k.Decoder()

# Competition inputs: the metadata table and the root of the per-patient
# DICOM folders (files are addressed as {dcm_dir}/{patient_id}/{image_id}.dcm).
csv_file = "/kaggle/input/rsna-breast-cancer-detection/train.csv"
dcm_dir = "/kaggle/input/rsna-breast-cancer-detection/train_images"

# Destination for the converted PNGs; created up front so the writers below
# never hit a missing directory. `exist_ok=True` keeps the cell re-runnable.
image_dir = "output/"
os.makedirs(image_dir, exist_ok=True)

In [None]:
def make_transfer_syntax_uid(df):
    """Map each machine ID to the transfer syntax of one representative file.

    For every distinct ``machine_id`` in ``df`` the first row carrying that ID
    is used as a sample: its DICOM file is opened and the ``TransferSyntaxUID``
    from the file meta header is recorded.

    Args:
        df: DataFrame with ``machine_id``, ``patient_id`` and ``image_id``
            columns. Files are located as
            ``{dcm_dir}/{patient_id}/{image_id}.dcm`` using the module-level
            ``dcm_dir``.

    Returns:
        dict mapping each machine ID to the ``TransferSyntaxUID`` of its
        sampled file, in order of first appearance in ``df``.
    """
    # NOTE(review): this assumes every image produced by a given machine uses
    # the same transfer syntax, since only one file per machine is inspected.
    representatives = df.drop_duplicates(subset="machine_id", keep="first")

    machine_id_to_transfer = {}
    for machine, patient, image in zip(
        representatives.machine_id,
        representatives.patient_id,
        representatives.image_id,
    ):
        path = f"{dcm_dir}/{patient}/{image}.dcm"
        # Only the file meta header is needed, so stop before the (large)
        # pixel data instead of loading the whole image into memory.
        dicom = pydicom.dcmread(path, stop_before_pixels=True)
        machine_id_to_transfer[machine] = dicom.file_meta.TransferSyntaxUID
    return machine_id_to_transfer

In [None]:
# Load the training metadata and look up one transfer syntax per machine.
df = pd.read_csv(csv_file)
machine_id_to_transfer = make_transfer_syntax_uid(df)

# Attach a running row number and the per-machine transfer syntax to each row.
df["i"] = np.arange(len(df))
df["TransferSyntaxUID"] = df.machine_id.map(machine_id_to_transfer)

# 1.2.840.10008.1.2.4.90 is the DICOM UID for JPEG 2000 (lossless only);
# those rows go to the nvjpeg2k path, everything else to dicomsdl.
is_j2k = df.TransferSyntaxUID == "1.2.840.10008.1.2.4.90"
df_j2k = df[is_j2k].reset_index(drop=True)
df_dicomsdl = df[~is_j2k].reset_index(drop=True)

In [None]:
def normalised_to_8bit(image, photometric_interpretation):
    """Convert ``image`` to ``uint8`` using its own min/max as the input range.

    Args:
        image: NumPy array of pixel values.
        photometric_interpretation: DICOM photometric interpretation string;
            only ``"MONOCHROME1"`` triggers special handling.

    Returns:
        A ``uint8`` array shaped like ``image``. For ``"MONOCHROME1"`` the
        result is inverted (``255 - value``).
    """
    lo, hi = image.min(), image.max()
    scaled = np.empty_like(image, dtype=np.uint8)
    # dicomsdl fills `scaled` in place; presumably a linear map of
    # [lo, hi] onto the 8-bit range -- confirm against dicomsdl.util.
    dicomsdl.util.convert_to_uint8(image, scaled, lo, hi)

    if photometric_interpretation != "MONOCHROME1":
        return scaled
    # MONOCHROME1 stores the grayscale inverted, so flip it to match the rest.
    return 255 - scaled


def process_j2k(df, dcm_dir, image_dir, scale=0.25):
    """Decode JPEG 2000 DICOMs with nvjpeg2k and save them as downscaled PNGs.

    Each row's file ``{dcm_dir}/{patient_id}/{image_id}.dcm`` is read with
    pydicom, the embedded stream is decoded by the module-level
    ``j2k_decoder``, converted to 8-bit, resized and written to
    ``{image_dir}/{patient_id}_{image_id}.png``.

    Args:
        df: DataFrame with ``patient_id`` and ``image_id`` columns.
        dcm_dir: Root directory holding the per-patient DICOM folders.
        image_dir: Existing directory that receives the PNG files.
        scale: Resize factor applied to both axes (default 0.25).

    Raises:
        ValueError: If the expected marker bytes are not found in a file's
            ``PixelData``.
        OSError: If OpenCV reports that a PNG could not be written.
    """
    # itertuples keeps each column's own dtype (iterrows can upcast integer
    # IDs to float on all-numeric frames, which would corrupt the file names).
    for d in tqdm(df.itertuples(index=False), total=len(df), dynamic_ncols=True):
        dcm_file = f"{dcm_dir}/{d.patient_id}/{d.image_id}.dcm"
        ds = pydicom.dcmread(dcm_file)

        # Skip whatever precedes the marker in PixelData; these four bytes are
        # presumably the start of the JPEG 2000 signature -- TODO confirm.
        pixel_data = ds.PixelData
        offset = pixel_data.find(b"\x00\x00\x00\x0C")
        if offset < 0:
            # find() returns -1 on a miss; slicing with it would hand the
            # decoder a meaningless 1-byte stream.
            raise ValueError(f"No JPEG 2000 marker found in PixelData of {dcm_file}")
        jpeg_stream = bytearray(pixel_data[offset:])
        m = j2k_decoder.decode(jpeg_stream)

        # resize and save as png
        m = normalised_to_8bit(m, ds.PhotometricInterpretation)
        m = cv2.resize(m, dsize=(0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
        out_file = os.path.join(image_dir, f"{d.patient_id}_{d.image_id}.png")
        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(out_file, m):
            raise OSError(f"Failed to write {out_file}")


def dicomsdl_parallel_process_fn(d, dcm_dir, image_dir, scale=0.25):
    """Decode one DICOM with dicomsdl and save it as a downscaled PNG.

    Intended as the per-row worker for a parallel map: it reads
    ``{dcm_dir}/{patient_id}/{image_id}.dcm``, copies frame 0 into a NumPy
    buffer, converts it to 8-bit, resizes it and writes
    ``{image_dir}/{patient_id}_{image_id}.png``.

    Args:
        d: Row-like object exposing ``patient_id`` and ``image_id`` attributes.
        dcm_dir: Root directory holding the per-patient DICOM folders.
        image_dir: Existing directory that receives the PNG file.
        scale: Resize factor applied to both axes (default 0.25).

    Raises:
        OSError: If OpenCV reports that the PNG could not be written.
    """
    dcm_file = f"{dcm_dir}/{d.patient_id}/{d.image_id}.dcm"
    ds = dicomsdl.open(dcm_file)

    # Allocate the destination buffer from the file's own geometry and dtype,
    # then let dicomsdl decode the first frame straight into it.
    info = ds.getPixelDataInfo()
    m = np.empty(shape=[info["Rows"], info["Cols"]], dtype=info["dtype"])
    ds.copyFrameData(0, m)

    # resize and save as png
    m = normalised_to_8bit(m, ds.PhotometricInterpretation)
    m = cv2.resize(m, dsize=(0, 0), fx=scale, fy=scale, interpolation=cv2.INTER_LINEAR)
    out_file = os.path.join(image_dir, f"{d.patient_id}_{d.image_id}.png")
    # cv2.imwrite signals failure through its return value; without this check
    # a failed write inside a worker process would go unnoticed.
    if not cv2.imwrite(out_file, m):
        raise OSError(f"Failed to write {out_file}")

In [None]:
# Convert the JPEG 2000 subset sequentially in this process; it relies on the
# single module-level nvjpeg2k decoder.
process_j2k(df_j2k, dcm_dir, image_dir)

In [None]:
# Convert the remaining files with dicomsdl, fanned out over two worker
# processes; the tqdm bar advances as rows are handed to the pool.
dicomsdl_rows = tqdm(df_dicomsdl.iterrows(), total=len(df_dicomsdl), dynamic_ncols=True)
dicomsdl_jobs = (
    delayed(dicomsdl_parallel_process_fn)(row, dcm_dir, image_dir)
    for _row_label, row in dicomsdl_rows
)
# Assign the (all-None) results to `_` so the cell shows no output.
_ = Parallel(n_jobs=2, backend='multiprocessing')(dicomsdl_jobs)