In [1]:
# %%capture
# Source: https://www.kaggle.com/code/remekkinas/fast-dicom-processing-1-6-2x-faster?scriptVersionId=113360473
!pip install /kaggle/input/rsnamodules/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl 

try:
    import pylibjpeg
except:
   !pip install /kaggle/input/rsna-2022-whl/{pylibjpeg-1.4.0-py3-none-any.whl,python_gdcm-3.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl}

Processing /kaggle/input/rsnamodules/dicomsdl-0.109.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: dicomsdl
Successfully installed dicomsdl-0.109.1
[0mProcessing /kaggle/input/rsna-2022-whl/pylibjpeg-1.4.0-py3-none-any.whl
Processing /kaggle/input/rsna-2022-whl/python_gdcm-3.0.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: python-gdcm, pylibjpeg
Successfully installed pylibjpeg-1.4.0 python-gdcm-3.0.15
[0m

In [2]:
# Install Tensorflow Keras EfficientNetV2 from a local wheel file.
# --no-deps: install only this wheel, without installing its dependencies.
!pip install --no-deps  /kaggle/input/kerasefficientnetv2/keras_efficientnet_v2-1.2.2-py3-none-any.whl

Processing /kaggle/input/kerasefficientnetv2/keras_efficientnet_v2-1.2.2-py3-none-any.whl
Installing collected packages: keras-efficientnet-v2
Successfully installed keras-efficientnet-v2-1.2.2
[0m

In [3]:
import numpy as np
import pandas as pd
import pylibjpeg
import pydicom
import matplotlib as mpl
import matplotlib.pyplot as plt
import tensorflow as tf
import dicomsdl as dicoml
import pydicom

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
from multiprocessing import cpu_count

import keras_efficientnet_v2
import cv2
import glob
import importlib
import os
import joblib
import time

# Limit TensorFlow inter-op parallelism and OpenCV to one thread each; intended as a
# speedup when functions are mapped across parallel workers.
# Note: only the TensorFlow inter-op pool is limited here; intra-op threads are not touched.
tf.config.threading.set_inter_op_parallelism_threads(num_threads=1)
cv2.setNumThreads(1)

# Pandas DataFrame display options: show up to 99 characters per column value
pd.options.display.max_colwidth = 99

In [4]:
# True only when the runtime reports an 'Interactive' kernel run type.
# .get() is used so a missing variable yields False instead of raising KeyError.
IS_INTERACTIVE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE') == 'Interactive'

# Model input geometry: height, width and channel count of the image fed to the network
TARGET_HEIGHT = 1344
TARGET_WIDTH = 768
N_CHANNELS = 1
INPUT_SHAPE = (TARGET_HEIGHT, TARGET_WIDTH, N_CHANNELS)
# Height/width ratio that process() pads images to before resizing
TARGET_HEIGHT_WIDTH_RATIO = TARGET_HEIGHT / TARGET_WIDTH
# Decision threshold; only referenced by a commented-out line in the submission loop
THRESHOLD_BEST = 0.5
# Shared CLAHE operator, applied by process() when apply_clahe=True
CLAHE = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(32, 32))

In [5]:
def smooth(l):
    """Smooth a 1-D sequence with a moving-average (box) filter.

    The window length is 1% of the input length, clamped to at least one
    sample so that inputs shorter than 100 elements do not produce an empty
    kernel (which makes ``np.convolve`` raise). With a window of one sample
    the values are returned unchanged (as floats).

    Args:
        l: 1-D array-like of numbers; must not be empty.

    Returns:
        np.ndarray with the same length as ``l`` (``mode='same'``).

    Raises:
        ValueError: from ``np.convolve`` when ``l`` is empty.
    """
    kernel_size = max(1, int(len(l) * 0.01))
    kernel = np.ones(kernel_size) / kernel_size
    return np.convolve(l, kernel, mode='same')

def get_x_offset(image, max_col_sum_ratio_threshold=0.05):
    """Find the column at which the image content ends, scanning left to right.

    Column sums are smoothed with smooth(). The scan looks for the first
    column whose smoothed sum falls below ``max_col_sum_ratio_threshold``
    times the maximum smoothed sum, but only after a column above that
    threshold has been seen to the right of the peak column (the peak is
    searched in the left 75% of the image).

    Args:
        image: 2-D array indexed as [row, column].
        max_col_sum_ratio_threshold: fraction of the maximum smoothed column
            sum used as the cut-off.

    Returns:
        Index of the first below-threshold column found by the scan. When no
        such column exists, the index of the last column is returned (not the
        image width).
    """
    margin = 0
    width = image.shape[1]
    column_profile = smooth(image.sum(axis=0).squeeze())
    peak_column = column_profile[:int(width * 0.75)].argmax()
    cutoff = column_profile.max() * max_col_sum_ratio_threshold
    signal_seen_after_peak = False

    for column, value in enumerate(column_profile):
        # Once signal has been seen past the peak, the first dark column ends the content
        if signal_seen_after_peak and value < cutoff:
            return min(width, column + margin)
        if value > cutoff and column > peak_column:
            signal_seen_after_peak = True

    # No dark column was found: fall back to the last index visited
    return column

def get_y_offsets(image, max_row_sum_ratio_threshold=0.10):
    """Find the row range that contains the image content.

    Row sums are smoothed with smooth() and compared against
    ``max_row_sum_ratio_threshold`` times the maximum smoothed sum.

    Args:
        image: 2-D array indexed as [row, column].
        max_row_sum_ratio_threshold: fraction of the maximum smoothed row sum
            used as the cut-off.

    Returns:
        Tuple ``(start, stop)`` meant to be used as ``image[start:stop]``:
        ``start`` is the number of leading rows below the threshold, ``stop``
        is the index of the last row above the threshold (or the image height
        when no row is above it).
    """
    margin = 0
    height = image.shape[0]
    sums = smooth(image.sum(axis=1).squeeze())
    sum_threshold = sums.max() * max_row_sum_ratio_threshold

    # Count the leading rows that stay below the threshold
    offset_bottom = 0
    for s in sums:
        if s < sum_threshold:
            offset_bottom += 1
        else:
            break

    # Walk backwards to the last row above the threshold.
    # NOTE(review): this index is later used as an exclusive slice end, so that
    # row itself is cut off; kept as-is to leave the crop geometry unchanged.
    offset_top = height
    for offset, s in enumerate(reversed(sums)):
        if s > sum_threshold:
            offset_top = height - (offset + 1)
            break

    return max(0, offset_bottom - margin), min(height, offset_top + margin)

def crop(image, debug=False):
    """Crop an image to the region reported by get_x_offset() and get_y_offsets().

    Args:
        image: 2-D array indexed as [row, column].
        debug: accepted for interface compatibility; not used here.

    Returns:
        The cropped slice ``image[start:stop, :right]``.
    """
    right = get_x_offset(image)
    # Row bounds are measured on the image after its columns have been trimmed
    row_start, row_stop = get_y_offsets(image[:, :right])
    return image[row_start:row_stop, :right]

In [6]:
def process(file_path, size=None, crop_image=False, apply_clahe=False, debug=False, save=False):
    """Read a DICOM file, convert it to a uint8 image and optionally save it as PNG.

    Steps, in order: min-max scaling to [0, 1]; inversion when the
    photometric interpretation is MONOCHROME1; conversion to uint8;
    horizontal flip when the right 10% of columns is brighter than the left
    10%; removal of a 2% border on every side; optional crop(); optional
    padding to TARGET_HEIGHT_WIDTH_RATIO followed by a resize; optional CLAHE.

    Args:
        file_path: path of the DICOM file; the PNG name is the part of the
            file name before the first dot.
        size: ``(width, height)`` passed to ``cv2.resize``; ``None`` skips
            padding and resizing.
        crop_image: when True, crop() is applied after the border removal.
        apply_clahe: when True, the module-level CLAHE operator is applied.
        debug: forwarded to crop(); has no other effect.
        save: when True, the result is written to ``<image_id>.png`` in the
            current working directory.

    Returns:
        None. The processed image is only available through the saved file.
    """
    # Read Dicom File
    dicom = pydicom.dcmread(file_path)
    # Work on a float64 copy so the subtraction below cannot wrap around in a
    # narrow signed integer dtype
    image = dicom.pixel_array.astype(np.float64)

    # Normalize [0,1] range; a constant image has no range to scale and becomes all zeros
    pixel_min, pixel_max = image.min(), image.max()
    if pixel_max > pixel_min:
        image -= pixel_min
        image /= (pixel_max - pixel_min)
    else:
        image[...] = 0.0

    if dicom.PhotometricInterpretation == "MONOCHROME1":
        image = 1 - image

    # Convert to uint8 image in range [0, 255]
    image = (image * 255).astype(np.uint8)

    # Flip to a common left/right orientation: the brighter 10% edge strip ends up on the left
    h0, w0 = image.shape
    if image[:,int(-w0 * 0.10):].sum() > image[:,:int(w0 * 0.10)].sum():
        image = np.flip(image, axis=1)

    # Always trim 2% of the height/width on each side (border noise/lines).
    # Explicit end indices are used because a margin of 0 would turn a
    # negative-index slice such as image[0:-0] into an empty image.
    margin_y = int(h0 * 2e-2)
    margin_x = int(w0 * 2e-2)
    image = image[margin_y:h0 - margin_y, margin_x:w0 - margin_x]

    if crop_image:
        image = crop(image, debug=debug)

    # Resize
    if size is not None:
        # Pad black pixels so the image has the target height/width ratio
        h, w = image.shape
        if (h / w) > TARGET_HEIGHT_WIDTH_RATIO:
            # Too tall: add columns on the right
            pad = int(h / TARGET_HEIGHT_WIDTH_RATIO - w)
            image = np.pad(image, [[0,0], [0, pad]])
        else:
            # Too wide: add the same number of rows above and below
            pad = int(0.50 * (w * TARGET_HEIGHT_WIDTH_RATIO - h))
            image = np.pad(image, [[pad, pad], [0,0]])
        # Resize
        image = cv2.resize(image, size, interpolation=cv2.INTER_AREA)

    # Apply CLAHE contrast enhancement
    if apply_clahe:
        image = CLAHE.apply(image)

    # Save Only
    if save:
        image_id = file_path.split('/')[-1].split('.')[0]
        cv2.imwrite(f'{image_id}.png', image)

In [7]:
def normalize(image):
    """Turn a single-channel image batch into the network's float input.

    The tensor is repeated three times along axis 3, cast to float32 and
    passed through Keras' ImageNet ``preprocess_input`` in ``'torch'`` mode.

    Args:
        image: tensor with the channel dimension on axis 3.

    Returns:
        The preprocessed float32 tensor.
    """
    three_channel = tf.repeat(image, repeats=3, axis=3)
    as_float = tf.cast(three_channel, tf.float32)
    return tf.keras.applications.imagenet_utils.preprocess_input(as_float, mode='torch')

In [8]:
def get_model():
    """Build the EfficientNetV2-T classifier, load its weights and freeze it.

    The model takes a uint8 image of shape INPUT_SHAPE, runs it through
    normalize() and an EfficientNetV2T network with a single sigmoid output.

    Returns:
        A compiled, non-trainable ``tf.keras`` model with weights loaded from
        the checkpoint path below.
    """
    # Checkpoint in use. Alternatives kept for reference:
    #   /kaggle/input/rsna-efficientnetv2-training-tensorflow-tpu-ds/model.h5
    #   /kaggle/input/my-rsna-effnetv2t-model/model.h5
    #   /kaggle/input/my-tuned-rsna-effnetv2t/model.h5
    weights_path = '/kaggle/input/my-balanced-rsna-effnetv2t-wts/model.h5'

    # Inputs, note the names are equal to the dictionary keys in the dataset
    image_input = tf.keras.layers.Input(INPUT_SHAPE, name='image', dtype=tf.uint8)

    # Normalize input (built before the backbone, as in the original op order)
    normalized = normalize(image_input)

    # CNN prediction: the backbone is created without pretrained weights
    backbone = keras_efficientnet_v2.EfficientNetV2T(
        input_shape=[TARGET_HEIGHT, TARGET_WIDTH, 3],
        pretrained=None,
        num_classes=1,
        classifier_activation='sigmoid',
        dropout=0.30,
    )
    probability = backbone(normalized)

    classifier = tf.keras.models.Model(inputs=image_input, outputs=probability)
    classifier.load_weights(weights_path)

    # Inference only
    classifier.trainable = False
    classifier.compile()

    return classifier

In [9]:
# Weights are loaded inside get_model() from
# '/kaggle/input/my-balanced-rsna-effnetv2t-wts/model.h5'.
# NOTE(review): this comment previously named
# '/kaggle/input/sartorius-training-dataset/model.h5' as the pretrained file path,
# which get_model() does not reference.
tf.keras.backend.clear_session()
# enable XLA optimizations
tf.config.optimizer.set_jit(True)

model = get_model()

  f"The initializer {self.__class__.__name__} is unseeded "


>>>> No pretrained available, model will be randomly initialized


In [10]:
# Print the model summary (layers, output shapes, parameter counts)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 image (InputLayer)          [(None, 1344, 768, 1)]    0         
                                                                 
 tf.repeat (TFOpLambda)      (None, 1344, 768, 3)      0         
                                                                 
 tf.cast (TFOpLambda)        (None, 1344, 768, 3)      0         
                                                                 
 tf.math.truediv (TFOpLambda  (None, 1344, 768, 3)     0         
 )                                                               
                                                                 
 tf.nn.bias_add (TFOpLambda)  (None, 1344, 768, 3)     0         
                                                                 
 tf.math.truediv_1 (TFOpLamb  (None, 1344, 768, 3)     0         
 da)                                                         

In [11]:
# Model is not trainable
# model.fit()

In [12]:
# Load the test metadata table; its patient_id and image_id columns are used
# below to build the path of each DICOM file
test = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/test.csv')

def get_file_path(args):
    """Build the test-image DICOM path for a ``(patient_id, image_id)`` pair.

    Args:
        args: two-element iterable holding patient_id and image_id, in that order.

    Returns:
        The path string ``.../test_images/<patient_id>/<image_id>.dcm``.
    """
    (patient_id, image_id) = args
    dicom_path = f'/kaggle/input/rsna-breast-cancer-detection/test_images/{patient_id}/{image_id}.dcm'
    return dicom_path
    
# Resolve the DICOM path of every row from its (patient_id, image_id) pair
test['file_path'] = test[['patient_id', 'image_id']].apply(get_file_path, axis=1)

# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output
test.info()
display(test.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   site_id        4 non-null      int64 
 1   patient_id     4 non-null      int64 
 2   image_id       4 non-null      int64 
 3   laterality     4 non-null      object
 4   view           4 non-null      object
 5   age            4 non-null      int64 
 6   implant        4 non-null      int64 
 7   machine_id     4 non-null      int64 
 8   prediction_id  4 non-null      object
 9   file_path      4 non-null      object
dtypes: int64(6), object(4)
memory usage: 448.0+ bytes


None

Unnamed: 0,site_id,patient_id,image_id,laterality,view,age,implant,machine_id,prediction_id,file_path
0,2,10008,736471439,L,MLO,81,0,21,10008_L,/kaggle/input/rsna-breast-cancer-detection/test_images/10008/736471439.dcm
1,2,10008,1591370361,L,CC,81,0,21,10008_L,/kaggle/input/rsna-breast-cancer-detection/test_images/10008/1591370361.dcm
2,2,10008,68070693,R,MLO,81,0,21,10008_R,/kaggle/input/rsna-breast-cancer-detection/test_images/10008/68070693.dcm
3,2,10008,361203119,R,CC,81,0,21,10008_R,/kaggle/input/rsna-breast-cancer-detection/test_images/10008/361203119.dcm


In [13]:
sample_submission = pd.read_csv('/kaggle/input/rsna-breast-cancer-detection/sample_submission.csv')

# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output
sample_submission.info()
display(sample_submission.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   prediction_id  2 non-null      object 
 1   cancer         2 non-null      float64
dtypes: float64(1), object(1)
memory usage: 160.0+ bytes


None

Unnamed: 0,prediction_id,cancer
0,10008_L,0.021168
1,10008_R,0.021168


In [14]:
# Preprocess every image of one (patient_id, laterality) group and save each one
def preprocess_and_save_image(args):
    """Run process() on all files of one group.

    Args:
        args: ``((patient_id, laterality), group)`` where ``group`` is a
            DataFrame with a ``file_path`` column.

    Returns:
        None. Each image is saved by process() (``save=True``).
    """
    (_patient_id, _laterality), group = args
    for dicom_path in group['file_path']:
        process(dicom_path, size=(TARGET_WIDTH, TARGET_HEIGHT), crop_image=True, save=True)

In [15]:
# Preprocess all images in parallel using Joblib, one task per (patient_id, laterality) group
jobs = [joblib.delayed(preprocess_and_save_image)(args) for args in test.groupby(['patient_id', 'laterality'])]
# An explicit backend takes precedence over the soft `prefer` hint, so the
# contradictory prefer='threads' argument has been removed.
# The tasks return None (images are written to disk), so the result list is not
# bound to SUBMISSION_ROWS; that list is created by the inference loop.
preprocess_results = joblib.Parallel(
    n_jobs=cpu_count(),
    verbose=1,
    backend='multiprocessing',
)(jobs)

[Parallel(n_jobs=2)]: Using backend MultiprocessingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    2.5s remaining:    0.0s
[Parallel(n_jobs=2)]: Done   2 out of   2 | elapsed:    2.5s finished


In [16]:
SUBMISSION_ROWS = []

for (patient_id, laterality), g in tqdm(test.groupby(['patient_id', 'laterality'])):
    # Mean model output over all images of this (patient_id, laterality) group
    cancer = 0.0
    for row_idx, row in g.iterrows():
        # Load Image (flag -1: read the file unchanged)
        image_id = row['image_id']
        png_path = f'{image_id}.png'
        image = cv2.imread(png_path, -1)
        # cv2.imread signals a missing/unreadable file by returning None; fail with a clear error
        if image is None:
            raise FileNotFoundError(f'Preprocessed image {png_path} is missing or unreadable')
        # Expand to Batch HxW -> 1xHxWx1
        image = np.expand_dims(image, [0, 3])
        # Make Prediction
        cancer += float(model.predict_on_batch(image).squeeze()) / len(g)
        # Remove Image PNG
        os.remove(png_path)

    # Add Submission Row
    SUBMISSION_ROWS.append({
        'prediction_id': f'{patient_id}_{laterality}',
        #'cancer': np.int8(cancer > THRESHOLD_BEST),
        'cancer': cancer,
    })

  0%|          | 0/2 [00:00<?, ?it/s]

In [17]:
# Create DataFrame from submission rows
submission_df = pd.DataFrame(SUBMISSION_ROWS)

# DataFrame.info() prints its report and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output
submission_df.info()
display(submission_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   prediction_id  2 non-null      object 
 1   cancer         2 non-null      float64
dtypes: float64(1), object(1)
memory usage: 160.0+ bytes


None

Unnamed: 0,prediction_id,cancer
0,10008_L,0.003288
1,10008_R,0.032988


In [18]:
# Write only the two submission columns, without the index
submission_df.to_csv(
    'submission.csv',
    columns=['prediction_id', 'cancer'],
    index=False,
)

In [19]:
# Sanity Check: read the written submission file back and show its first rows
display(pd.read_csv('submission.csv').head())

Unnamed: 0,prediction_id,cancer
0,10008_L,0.003288
1,10008_R,0.032988
