In [None]:
# !pip install -qU python-gdcm pydicom pylibjpeg

In [None]:
import os
import cv2
import glob
import gdcm
import pydicom
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from collections import Counter
from tqdm.notebook import tqdm
from joblib import Parallel, delayed
# from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
# Root folder of the training images; every entry directly inside it is
# counted as one patient by the report below.
TRAIN_PATH = "../input/train_images/"

print("Number of training patients :", len(os.listdir(TRAIN_PATH)))

In [None]:
def auto_windowing(img):
    """
    Applies automated intensity windowing and rescales the result to 8 bits.

    The window is bounded by the 2nd and 99.8th percentiles of the pixels that
    are strictly positive and strictly below the image maximum. When no pixel
    satisfies both conditions (all-zero or constant image), the full intensity
    range of the image is used as the window instead.

    Args:
        img (np array): Image.

    Returns:
        tuple: ``(windowed, (start, end))`` where ``windowed`` is a np uint8
            array with the same shape as ``img`` and ``(start, end)`` are the
            window bounds that were applied.
    """
    img = np.asarray(img)
    if img.size == 0:
        # Nothing to window; keep the (empty) shape and the uint8 contract.
        return img.astype(np.uint8), (0.0, 0.0)

    pixels = img.flatten()
    pixels = pixels[pixels > 0]
    if pixels.size:
        # Guarded: calling .max() on an empty selection raises ValueError.
        pixels = pixels[pixels < pixels.max()]

    if pixels.size:
        start = np.percentile(pixels, 2)
        end = np.percentile(pixels, 99.8)
    else:
        # No usable pixel statistics: fall back to the raw intensity range.
        start, end = float(img.min()), float(img.max())

    img = np.clip(img, start, end).astype(float)
    img = img - np.min(img)

    peak = np.max(img)
    if peak > 0:
        # A flat window (peak == 0) stays all zeros instead of dividing by zero.
        img = img / peak

    img = (img * 255).astype(np.uint8)

    return img, (start, end)

In [None]:
# Series metadata table. The two derived columns hold the first and second
# whitespace-separated tokens of each series description.
df = pd.read_csv("../input/train_series_descriptions.csv")

df = df.assign(
    orient=df["series_description"].map(lambda desc: desc.split()[0]),
    weighting=df["series_description"].map(lambda desc: desc.split()[1]),
)

In [None]:
# dfg = df[["study_id", "orient", "weighting"]].groupby('study_id').agg(list)
# dfg

In [None]:
# (dfg['weighting'].apply(lambda x: len(x))).all()
# (dfg['orient'].apply(lambda x: len(x))).value_counts()

In [None]:
# Switches read by the conversion loop: SAVE writes the stacked arrays to
# SAVE_FOLDER, PLOT displays frames with matplotlib.
SAVE, PLOT = True, False

# Output folder for the .npy stacks, created up front if it does not exist.
SAVE_FOLDER = "../input/npy/"
os.makedirs(SAVE_FOLDER, exist_ok=True)

In [None]:
# Convert every DICOM series into one stacked array ordered by patient
# position, and record for each series the original file numbers in that
# order (collected in `offsets`).
offsets = []
for study in tqdm(sorted(os.listdir(TRAIN_PATH))):
    df_study = df[df["study_id"] == int(study)]
    for series in os.listdir(TRAIN_PATH + study):
        df_series = df_study[df_study["series_id"] == int(series)]
        # Only the orientation (first word of the description) is needed here.
        orient = df_series["series_description"].values[0].split(" ")[0]

        folder = TRAIN_PATH + f"{study}/{series}/"
        files = os.listdir(folder)
        files.sort(key=lambda x: int(x[:-4]))

        # One (position, file index, pixels) entry per file. Keeping a list
        # rather than a dict keyed by position means frames that share a
        # position can never overwrite each other.
        frames = []
        seen_positions = set()
        for idx, file in enumerate(files):
            dicom = pydicom.dcmread(folder + file)

            # Tag (0020,0032) is ImagePositionPatient (x, y, z): axial series
            # are ordered by -z, the others (sagittal) by x.
            if orient == "Axial":
                pos = -float(dicom[(0x20, 0x32)].value[-1])
            else:  # Sagittal
                pos = float(dicom[(0x20, 0x32)].value[0])

            img = dicom.pixel_array

            if dicom.PhotometricInterpretation == "MONOCHROME1":
                print("inv")
                # Invert around the frame maximum. `1 - img` wraps around on
                # unsigned integer pixel data instead of inverting it.
                img = img.max() - img

            if pos in seen_positions:
                print(f"Pos {pos} is already in keys")
            seen_positions.add(pos)
            frames.append((pos, idx, img))

        # Sort by position; the file index breaks ties so duplicated positions
        # stay adjacent and keep their file order.
        frames.sort(key=lambda entry: entry[:2])

        offsets.append(
            {
                "study_id": study,
                "series_id": series,
                "frames": [int(files[idx][:-4]) for _, idx, _ in frames],
            }
        )

        imgs = [img for _, _, img in frames]

        # Check shapes explicitly rather than relying on np.array() raising.
        shapes = Counter(img.shape for img in imgs)
        if len(shapes) > 1:
            shape = shapes.most_common()[0][0]
            print("Different shapes:", shapes, f"resize to {shape}")

            # cv2.resize takes dsize as (width, height), which is the reverse
            # of the numpy (rows, cols) shape.
            dsize = shape[1::-1]
            imgs = [
                cv2.resize(img, dsize) if img.shape != shape else img
                for img in imgs
            ]
        imgs = np.array(imgs)

        if SAVE:
            np.save(SAVE_FOLDER + f"{study}_{series}.npy", imgs)

        if PLOT:
            for i, img in enumerate(imgs):
                # Only frames after the 41st are displayed.
                if i > 40:
                    plt.figure(figsize=(5, 5))
                    plt.imshow(img, cmap="gray")
                    plt.title(
                        f"Study {study} - Series {series} - Frame {i}/{len(imgs)} - Shape {img.shape}"
                    )
                    plt.show()

    # In plotting mode only the first study is processed.
    if PLOT:
        break

In [None]:
# One row per series (study_id, series_id, frames), built from the records
# accumulated in `offsets`, then written to CSV without the index column.
df_frames = pd.DataFrame(data=offsets)
df_frames.to_csv(path_or_buf="../input/df_frames.csv", index=False)

In [None]:
# folder = "../input/train_images/1644528034/2456924377/"
# folder = "../input/train_images/2794192602/2916494105/"
# folder = "../input/train_images/2794192602/1829533928/"
# folder = "../input/train_images/2794192602/1843512620/"
# folder = "../input/train_images/3303545110/304087230/"
# folder = "../input/train_images/642715533/3541751675/"

# files = os.listdir(folder)
# files.sort(key=lambda x:int(x[:-4]))
# files

Done!