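**About**: This notebook prepares the data: it crops the images (intensity-based crop and segmentation-based bowel crop), maps image files to DICOM instances and image-level labels, explores the targets, and sanity-checks the metric.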

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
cd ../src/

## Initialization

### Imports

In [None]:
import os
import re
import sys
import cv2
import glob
import json
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.linear_model import *

pd.set_option('display.width', 500)
pd.set_option('max_colwidth', 100)

### Smart cropping

In [None]:
import numpy as np
import skimage.measure

from data.preparation import *
from params import *

In [None]:
# Load the patient-level and image-level tables.
# NOTE(review): prepare_data and DATA_PATH are presumably provided by the wildcard
# imports from data.preparation / params above — confirm.
df_patient, df_img = prepare_data(DATA_PATH)

In [None]:
# Collapse to one row per (patient_id, series), keeping the largest frame value of the series.
df_img = df_img.groupby(['patient_id', 'series'], as_index=False)['frame'].max()

In [None]:
# Path of the frame located halfway through each series (frame // 2, zero-padded to 4 digits).
df_img['path'] = (
    "../input/imgs/"
    + df_img['patient_id'].astype(str)
    + "_"
    + df_img['series'].astype(str)
    + "_"
    + df_img['frame'].apply(lambda frame: f"{frame // 2:04d}")
    + ".png"
)

In [None]:
def smart_crop(img, margin=10, max_sz=None):
    """
    Crops an image around the region that contains mid-intensity content.

    Pixels above 200 are zeroed out, then pixels above 100 are counted per row and
    per column. Each count profile is reduced with a minimum over blocks of 10, and
    blocks above 10% of the profile maximum delimit the crop. The crop is widened by
    `margin` (plus one extra block at the end), clamped to the image, and optionally
    shrunk around its centre so that no side exceeds `max_sz`.

    Args:
        img (np.ndarray): 2D image to crop.
        margin (int, optional): Padding added on each side of the detected region. Defaults to 10.
        max_sz (int or None, optional): Maximum side length of the crop. Defaults to None.

    Returns:
        np.ndarray: Cropped image.
        tuple: Crop coordinates (x_start, x_end, y_start, y_end), x indexing axis 0.
    """
    block = 10
    # Zero the pixels above 200 first, then keep what is still above 100.
    content = np.where(img > 200, 0, img) > 100

    def _extent(axis, limit):
        # Count content pixels along `axis`, then take the min over blocks of `block`.
        profile = skimage.measure.block_reduce(content.sum(axis), block, np.min)
        active = profile > (profile.max() * 0.1)
        first = np.argmax(active)
        last = len(active) - np.argmax(active[::-1])
        # Back to pixel units, with margin, clamped to the image.
        return max(0, first * block - margin), min((last + 1) * block + margin, limit)

    x_start, x_end = _extent(1, img.shape[0])
    y_start, y_end = _extent(0, img.shape[1])

    if max_sz is not None:
        half = max_sz // 2
        if y_end - y_start > max_sz:
            y_mid = (y_end + y_start) // 2
            y_start, y_end = y_mid - half, y_mid + half
        if x_end - x_start > max_sz:
            x_mid = (x_end + x_start) // 2
            x_start, x_end = x_mid - half, x_mid + half

    # Re-centring may have pushed the window outside of the image.
    x_start, x_end = max(0, x_start), min(x_end, img.shape[0])
    y_start, y_end = max(0, y_start), min(y_end, img.shape[1])

    return img[x_start: x_end, y_start: y_end], (x_start, x_end, y_start, y_end)

In [None]:
def to_size(coords, img, sz=384):
    """
    Crops a window of side `sz` centred on a bounding box.

    The window is centred on the middle of `coords`. If it sticks out of the image
    on the top/left it is shifted to start at 0, and if it sticks out on the
    bottom/right it is shifted back to end at the image border, so that the crop
    has side `sz` whenever the image is at least `sz` wide along that axis.

    Args:
        coords (tuple): Bounding box (x_start, x_end, y_start, y_end), x indexing axis 0.
        img (np.ndarray): Image to crop; only its first two axes are cropped.
        sz (int, optional): Side of the crop. Defaults to 384.

    Returns:
        np.ndarray: Cropped image.
        tuple: Coordinates (x_start, x_end, y_start, y_end) of the crop.
    """
    def _window(start, end, limit):
        # Window of length sz centred on the middle of [start, end].
        center = (start + end) // 2
        lo = center - sz // 2
        hi = lo + sz  # lo + sz (not center + sz // 2) so odd sizes are honoured too
        if lo < 0:
            # Overflow on the low side: shift the window forward.
            lo, hi = 0, sz
        elif hi > limit:
            # Overflow on the high side: shift the window back. The original code
            # compared the window start with the limit here, so this never triggered.
            lo, hi = max(0, limit - sz), limit
        return lo, hi

    x_start, x_end, y_start, y_end = coords

    x_start, x_end = _window(x_start, x_end, img.shape[0])
    y_start, y_end = _window(y_start, y_end, img.shape[1])

    return img[x_start: x_end, y_start: y_end], (x_start, x_end, y_start, y_end)

In [None]:
# Only referenced by the commented-out smart-crop cell below:
# MAX_SIZE is scaled into smart_crop's max_sz, SIZE is passed to to_size as sz.
MAX_SIZE = 384
SIZE = 384

In [None]:
# shapes = []
# crops = []
# for i in tqdm(range(len(df_img))):
# #     i = 26
#     img = cv2.imread(df_img['path'][i], 0)
        
# #     if img.shape == (512, 512):
# #         continue
# #     if img.shape[1] == 512:
# #         continue
#     if img.shape[0] <= img.shape[1]:
#         continue
# #     if img.shape[0] != 512 or np.random.random() < 0.01:

#     ref_size = 512
#     if img.shape[1] != ref_size:
#         h = int(ref_size / img.shape[1] * img.shape[0])
#         img = cv2.resize(img, (ref_size, h))
#     else:
#         pass

#     img_c, crop = smart_crop(img, max_sz=int(MAX_SIZE / 512 * img.shape[1]))
#     img_c, crop = to_size(crop, img, sz=SIZE)
    
#     img_c_sz = center_crop_pad(img[None])[0]
# #     img_c_sz = cv2.resize(img_c, (SIZE, SIZE))
    
#     crops.append(crop)
#     shapes.append(img_c.shape)

#     if not (i % 1):
#         plt.figure(figsize=(15, 5))
#         plt.subplot(1, 3, 1)
#         plt.imshow(img, cmap="gray")
#         plt.axis(False)
#         plt.title(img.shape)
#         plt.subplot(1, 3, 2)
#         plt.imshow(img_c, cmap="gray")
#         plt.title(f'Better Crop - {img_c.shape}')
#         plt.axis(False)
#         plt.subplot(1, 3, 3)
#         plt.imshow(img_c_sz, cmap="gray")
#         plt.title(f'Center Crop - {img_c_sz.shape}')
#         plt.axis(False)
#         plt.show()
        
# #         break
        
# #         print(df_img.loc[[i]])
        
# #         break
        
# #     if i > 100:
# #         break


In [None]:
# crops = np.array(crops)

# df_img['x_start'] = crops[:, 0]
# df_img['x_end'] = crops[:, 1]
# df_img['y_start'] = crops[:, 2]
# df_img['y_end'] = crops[:, 3]

# df_img.to_csv('../input/df_crops.csv', index=False)

In [None]:
# shapes = np.array(shapes)
# plt.subplot(1, 2, 1)
# sns.histplot(x=shapes[:, 0])
# plt.subplot(1, 2, 2)
# sns.histplot(x=shapes[:, 1])
# plt.show()

### Crop with segmentation masks

In [None]:
from util.plots import plot_mask
from matplotlib.patches import Rectangle

In [None]:
MASK_FOLDER = "../logs/2023-09-24/20/masks/"  # folder the mask_<patient>_<series>.npy files are loaded from
IMG_PATH = "../input/imgs/"  # folder holding the <patient>_<series>_<frame>.png images

MAX_LEN = 600  # only the last MAX_LEN frames of a series are kept
MARGIN = 5  # padding added on each side of a crop, along every axis

PLOT = True  # plot every series (otherwise only every 1000th one)
SAVE = False  # write the image / mask crops to disk as .npy files

In [None]:
# One row per (patient_id, series) with the largest frame value of that series.
df_series = df_img.groupby(['patient_id', 'series'], as_index=False)['frame'].max()

In [None]:
def get_start_end(x):
    """
    Locates the span of truthy values in a 1D boolean array.

    Args:
        x (np.ndarray): 1D boolean array.

    Returns:
        tuple: Index of the first True value, and index right after the last True value.
            An array without any True value yields (0, len(x)).
    """
    first = np.argmax(x)
    # argmax on the reversed array counts the False values trailing the last True.
    trailing = np.argmax(x[::-1])
    return first, len(x) - trailing

In [None]:
# Output folders for the bowel crops (images and masks are stored separately).
SAVE_FOLDER = "../input/crops/"
SAVE_FOLDER_IMG = f"{SAVE_FOLDER}imgs_bowel/"
SAVE_FOLDER_MASK = f"{SAVE_FOLDER}masks_bowel/"

for folder in (SAVE_FOLDER, SAVE_FOLDER_IMG, SAVE_FOLDER_MASK):
    os.makedirs(folder, exist_ok=True)

In [None]:
# for i in tqdm(range(len(df_series))):
#     mask_path = f'mask_{df_series.patient_id[i]}_{df_series.series[i]}.npy'
#     seg = np.load(MASK_FOLDER + mask_path)

#     imgs = np.concatenate([
#          cv2.imread(
#             IMG_PATH + f'{df_series.patient_id[i]}_{df_series.series[i]}_{f:04d}.png'
#         ) for f in range(0, df_series['frame'][i], 3)
#     ], -1).transpose(2, 0, 1)[-MAX_LEN:]

#     liver = (seg == 1).astype(np.uint8)
#     spleen = (seg == 2).astype(np.uint8)
#     kidney = (seg == 3).astype(np.uint8)
    
#     x0_liver, x1_liver = get_start_end(liver.sum((1, 2)) > 400)
#     y0_liver, y1_liver = get_start_end(liver.sum((0, 2)) > 400)
#     z0_liver, z1_liver = get_start_end(liver.sum((0, 1)) > 400)
    
#     x0_spleen, x1_spleen = get_start_end(spleen.sum((1, 2)) > 100)
#     y0_spleen, y1_spleen = get_start_end(spleen.sum((0, 2)) > 100)
#     z0_spleen, z1_spleen = get_start_end(spleen.sum((0, 1)) > 100)
    
#     x0_kidney, x1_kidney = get_start_end(kidney.sum((1, 2)) > 100)
#     y0_kidney, y1_kidney = get_start_end(kidney.sum((0, 2)) > 100)
#     z0_kidney, z1_kidney = get_start_end(kidney.sum((0, 1)) > 100)
    
#     x0s = [x0_liver, x0_spleen, x0_kidney]
#     x1s = [x1_liver, x1_spleen, x1_kidney]
#     y0s = [y0_liver, y0_spleen, y0_kidney]
#     y1s = [y1_liver, y1_spleen, y1_kidney]
#     z0s = [z0_liver, z0_spleen, z0_kidney]
#     z1s = [z1_liver, z1_spleen, z1_kidney]
#     cs = ["skyblue", "salmon", "lightgreen"]
    
#     for x0, x1, y0, y1, z0, z1, name in zip(x0s, x1s, y0s, y1s, z0s, z1s, ['liver', 'spleen', 'kidney']):
#         x0, x1 = max(0, x0 - MARGIN), min(imgs.shape[0], x1 + MARGIN)
#         y0, y1 = max(0, y0 - MARGIN), min(imgs.shape[1], y1 + MARGIN)
#         z0, z1 = max(0, z0 - MARGIN), min(imgs.shape[2], z1 + MARGIN)
        
#         img_crop = imgs[x0: x1, y0:y1, z0:z1]
#         seg_crop = seg[x0: x1, y0:y1, z0:z1]
        
#         if PLOT or not (i % 1000):
#             plt.figure(figsize=(15, 5))
#             plt.subplot(1, 3, 1)
#             id_ = img_crop.shape[0] // 2
#             plot_mask(img_crop[id_], seg_crop[id_])
#             plt.title(f'{name} x - shape={img_crop.shape}')
#             plt.subplot(1, 3, 2)
#             id_ = img_crop.shape[1] // 2
#             plot_mask(img_crop[:, id_], seg_crop[:, id_])
#             plt.title(f'{name} y - shape={img_crop.shape}')
#             plt.subplot(1, 3, 3)
#             id_ = img_crop.shape[2] // 2
#             plot_mask(img_crop[:, :, id_], seg_crop[:, :, id_])
#             plt.title(f'{name} z - shape={img_crop.shape}')
#             plt.show()

#         if SAVE:
#             np.save(SAVE_FOLDER_IMG + f'{df_series.patient_id[i]}_{df_series.series[i]}_{name}.npy', img_crop)
#             np.save(SAVE_FOLDER_MASK + f'{df_series.patient_id[i]}_{df_series.series[i]}_{name}.npy', seg_crop)

#     if PLOT or not (i % 1000):
#         ids = np.linspace(x0 + 5, x1 - 5, 5, dtype=int)
#         plt.figure(figsize=(20, 5))
#         for i, id_ in enumerate(ids):
#             plt.subplot(1, len(ids), i + 1)
#             plot_mask(imgs[id_], seg[id_])
#             plt.title(f'Frame {id_}')
            
#             for x0, x1, y0, y1, z0, z1, col in zip(x0s, x1s, y0s, y1s, z0s, z1s, cs):
#                 rect = Rectangle(
#                     (z0, y0),
#                     z1 - z0,
#                     y1 - y0,
#                     linewidth=2,
#                     facecolor="none",
#                     edgecolor=col,
#                 )
#                 if id_ > x0 and id_ < x1:
#                     plt.gca().add_patch(rect)
#         plt.show()
        
#         ids = np.linspace(y0 + 5, y1 - 5, 5, dtype=int)
#         plt.figure(figsize=(20, 5))
#         for i, id_ in enumerate(ids):
#             plt.subplot(1, len(ids), i + 1)
#             plot_mask(imgs[:, id_], seg[:, id_])
#             plt.title(f'Frame {id_}')
            
#             for x0, x1, y0, y1, z0, z1, col in zip(x0s, x1s, y0s, y1s, z0s, z1s, cs):
#                 rect = Rectangle(
#                     (z0, x0),
#                     z1 - z0,
#                     x1 - x0,
#                     linewidth=2,
#                     facecolor="none",
#                     edgecolor=col,
#                 )
#                 if id_ > y0 and id_ < y1:
#                     plt.gca().add_patch(rect)
#         plt.show()
        
#         ids = np.linspace(z0 + 5, z1 - 5, 5, dtype=int)
#         plt.figure(figsize=(20, 5))
#         for i, id_ in enumerate(ids):
#             plt.subplot(1, len(ids), i + 1)
#             plot_mask(imgs[:, :, id_], seg[:, :, id_])
#             plt.title(f'Frame {id_}')
#             for x0, x1, y0, y1, z0, z1, col in zip(x0s, x1s, y0s, y1s, z0s, z1s, cs):
#                 rect = Rectangle(
#                     (y0, x0),
#                     y1 - y0,
#                     x1 - x0,
#                     linewidth=2,
#                     facecolor="none",
#                     edgecolor=col,
#                 )
#                 if id_ > z0 and id_ < z1:
#                     plt.gca().add_patch(rect)
#         plt.show()
        
# #     break

In [None]:
# Crop each series around the voxels labelled 4 (bowel) in its segmentation mask,
# plot the crop and the bounding box, and optionally save the crops.
for i in tqdm(range(len(df_series))):
    mask_path = f'mask_{df_series.patient_id[i]}_{df_series.series[i]}.npy'
    seg = np.load(MASK_FOLDER + mask_path)

    # Read the frames of the series in grayscale and keep the last MAX_LEN of them.
    imgs = np.array([
        cv2.imread(
            IMG_PATH + f'{df_series.patient_id[i]}_{df_series.series[i]}_{f:04d}.png', 0
        ) for f in range(df_series['frame'][i])
    ])[-MAX_LEN:]
    # NOTE(review): center_crop_pad presumably comes from data.preparation and brings
    # the frames to the in-plane size of the mask — confirm.
    imgs = center_crop_pad(imgs)

    bowel = (seg == 4).astype(np.uint8)

    # Along each axis, keep the span of slices containing more than 400 bowel voxels.
    x0_bowel, x1_bowel = get_start_end(bowel.sum((1, 2)) > 400)
    y0_bowel, y1_bowel = get_start_end(bowel.sum((0, 2)) > 400)
    z0_bowel, z1_bowel = get_start_end(bowel.sum((0, 1)) > 400)

    x0s = [x0_bowel]
    x1s = [x1_bowel]
    y0s = [y0_bowel]
    y1s = [y1_bowel]
    z0s = [z0_bowel]
    z1s = [z1_bowel]
    cs = ["skyblue"]

    for x0, x1, y0, y1, z0, z1, name in zip(x0s, x1s, y0s, y1s, z0s, z1s, ['bowel']):
        # Widen the box by MARGIN and clamp it to the volume.
        x0, x1 = max(0, x0 - MARGIN), min(imgs.shape[0], x1 + MARGIN)
        y0, y1 = max(0, y0 - MARGIN), min(imgs.shape[1], y1 + MARGIN)
        z0, z1 = max(0, z0 - MARGIN), min(imgs.shape[2], z1 + MARGIN)

        img_crop = imgs[x0: x1, y0:y1, z0:z1]
        seg_crop = seg[x0: x1, y0:y1, z0:z1]

        if PLOT or not (i % 1000):
            # Middle slice of the crop along each of the three axes.
            plt.figure(figsize=(15, 5))
            plt.subplot(1, 3, 1)
            id_ = img_crop.shape[0] // 2
            plot_mask(img_crop[id_], seg_crop[id_])
            plt.title(f'{name} x - shape={img_crop.shape}')
            plt.subplot(1, 3, 2)
            id_ = img_crop.shape[1] // 2
            plot_mask(img_crop[:, id_], seg_crop[:, id_])
            plt.title(f'{name} y - shape={img_crop.shape}')
            plt.subplot(1, 3, 3)
            id_ = img_crop.shape[2] // 2
            plot_mask(img_crop[:, :, id_], seg_crop[:, :, id_])
            plt.title(f'{name} z - shape={img_crop.shape}')
            plt.show()

        if SAVE:
            np.save(SAVE_FOLDER_IMG + f'{df_series.patient_id[i]}_{df_series.series[i]}_{name}.npy', img_crop)
            np.save(SAVE_FOLDER_MASK + f'{df_series.patient_id[i]}_{df_series.series[i]}_{name}.npy', seg_crop)

    if PLOT or not (i % 1000):
        # Five slices along x with the bounding box drawn on top.
        # The subplot index is named `panel` so that it does not shadow the series index `i`.
        # NOTE: x0 / x1 are the margin-adjusted values left over from the crop loop above.
        # The rectangle loops below rebind x0..z1 to the raw (no-margin) bounds, which is
        # what the y and z figures then use.
        ids = np.linspace(x0 + 5, x1 - 5, 5, dtype=int)
        plt.figure(figsize=(20, 5))
        for panel, id_ in enumerate(ids):
            plt.subplot(1, len(ids), panel + 1)
            plot_mask(imgs[id_], seg[id_])
            plt.title(f'Frame {id_}')

            for x0, x1, y0, y1, z0, z1, col in zip(x0s, x1s, y0s, y1s, z0s, z1s, cs):
                rect = Rectangle(
                    (z0, y0),
                    z1 - z0,
                    y1 - y0,
                    linewidth=2,
                    facecolor="none",
                    edgecolor=col,
                )
                # Only draw the box on slices strictly inside its x span.
                if id_ > x0 and id_ < x1:
                    plt.gca().add_patch(rect)
        plt.show()

        # Five slices along y.
        ids = np.linspace(y0 + 5, y1 - 5, 5, dtype=int)
        plt.figure(figsize=(20, 5))
        for panel, id_ in enumerate(ids):
            plt.subplot(1, len(ids), panel + 1)
            plot_mask(imgs[:, id_], seg[:, id_])
            plt.title(f'Frame {id_}')

            for x0, x1, y0, y1, z0, z1, col in zip(x0s, x1s, y0s, y1s, z0s, z1s, cs):
                rect = Rectangle(
                    (z0, x0),
                    z1 - z0,
                    x1 - x0,
                    linewidth=2,
                    facecolor="none",
                    edgecolor=col,
                )
                if id_ > y0 and id_ < y1:
                    plt.gca().add_patch(rect)
        plt.show()

        # Five slices along z.
        ids = np.linspace(z0 + 5, z1 - 5, 5, dtype=int)
        plt.figure(figsize=(20, 5))
        for panel, id_ in enumerate(ids):
            plt.subplot(1, len(ids), panel + 1)
            plot_mask(imgs[:, :, id_], seg[:, :, id_])
            plt.title(f'Frame {id_}')
            for x0, x1, y0, y1, z0, z1, col in zip(x0s, x1s, y0s, y1s, z0s, z1s, cs):
                rect = Rectangle(
                    (y0, x0),
                    y1 - y0,
                    x1 - x0,
                    linewidth=2,
                    facecolor="none",
                    edgecolor=col,
                )
                if id_ > z0 and id_ < z1:
                    plt.gca().add_patch(rect)
        plt.show()

    # Only the first series is processed; remove this break to run on every series.
    break

## Data

### Imgs

In [None]:
files = os.listdir('../input/imgs/')

In [None]:
# One row per image file. Naming the column at construction also works for an empty
# listing, where assigning `columns` on a column-less frame would raise.
df = pd.DataFrame({'file': files})

In [None]:
# File names are split on '_' into patient, series and frame;
# the last 4 characters of the frame field (the extension) are dropped.
name_parts = df['file'].apply(lambda name: name.split('_'))

df['patient'] = name_parts.apply(lambda parts: parts[0])
df['series'] = name_parts.apply(lambda parts: parts[1])
df['frame'] = name_parts.apply(lambda parts: parts[2][:-4])

df['path'] = '../input/imgs/' + df['file']

### Tags

In [None]:
tags = pd.read_parquet("../input/train_dicom_tags.parquet")

In [None]:
def _parse_z(position):
    # Drop the last character, then read the last ', '-separated field as a float.
    # NOTE(review): presumably the value is a string such as "[x, y, z]" — confirm.
    return float(position[:-1].split(', ')[-1])


tags['z'] = tags['ImagePositionPatient'].apply(_parse_z)

In [None]:
tags = tags[["path", "z"]].copy()

In [None]:
# Paths are split on '/': field 1 is the patient, field 2 the series and field 3 the
# instance file, whose last 4 characters (the extension) are dropped.
path_parts = tags['path'].apply(lambda path: path.split('/'))

tags['patient'] = path_parts.apply(lambda parts: parts[1])
tags['series'] = path_parts.apply(lambda parts: parts[2])
tags['instance'] = path_parts.apply(lambda parts: parts[3][:-4])

In [None]:
tags = tags.sort_values(['patient', 'series', 'z'], ignore_index=True)

In [None]:
# Frame index = 0-based rank of the slice's z position within its (patient, series).
# Only 'z' is ranked: ranking every column through agg('rank') also ranked the string
# columns, which is wasteful and can fail on object columns.
# NOTE(review): slices sharing the same z get the same (truncated average) rank, as before.
tags['frame'] = tags.groupby(['patient', 'series'])['z'].rank().astype(int) - 1

In [None]:
# Zero-pad the frame index to 4 digits, as in the image file names.
tags['frame'] = tags['frame'].map('{:04d}'.format)

In [None]:
tags.to_csv('../input/frame_mapping.csv', index=False)

In [None]:
tags.head()

### Merge

In [None]:
# Attach the DICOM instance to every image file, then order rows by patient / series / frame.
frame_keys = ['patient', 'series', 'frame']
df = (
    df
    .merge(tags[frame_keys + ['instance']], on=frame_keys, how="left")
    .sort_values(frame_keys, ignore_index=True)
)

In [None]:
df.isna().sum()

In [None]:
df.head()

In [None]:
image_level = pd.read_csv('../input/image_level_labels.csv')

In [None]:
# Rename the columns to match `df`, and cast everything to str so that the merge keys compare equal.
image_level.columns = ['patient', 'series', 'instance', 'injury_name']
image_level = image_level.astype(str)

In [None]:
# Gather the injury names of the same (patient, series, instance) into a single list.
image_level = image_level.groupby(['patient', 'series', 'instance']).agg(list).reset_index()

In [None]:
image_level.head(1)

In [None]:
dfm = df.merge(image_level, on=['patient', 'series', 'instance'], how="left")

In [None]:
# Images without a label get an empty string; the lists of injury names are stringified.
dfm["injury_name"] = dfm["injury_name"].fillna('').astype(str)

# One binary column per injury type, set when its name appears in the label string.
for column, keyword in (
    ("extravasation_injury", "Active_Extravasation"),
    ("bowel_injury", "Bowel"),
):
    dfm[column] = dfm["injury_name"].apply(lambda names: keyword in names).astype(np.uint8)

In [None]:
# Keep the identifier, label and path columns, and save the image-level table.
export_columns = [
    'patient', 'series', 'instance', "frame",
    'extravasation_injury', 'bowel_injury',
    'path',
]
dfm = dfm[export_columns]
dfm.to_csv('../input/df_images_train.csv', index=False)

In [None]:
dfm.head()

### Target EDA

In [None]:
train = pd.read_csv('../input/train.csv')

In [None]:
# Class counts of the two binary targets, on a log scale.
binary_targets = ['bowel_injury', 'extravasation_injury']

plt.figure(figsize=(10, 3))

for panel, target in enumerate(binary_targets, start=1):
    plt.subplot(1, 2, panel)
    sns.countplot(x=train[target])
    plt.yscale('log')
    plt.xticks([0, 1], ['healthy', 'injured'])

plt.show()

In [None]:
organs = ['kidney', 'liver', 'spleen']

# Fold the low / high indicator columns into one label per organ: low + 2 * high.
# These columns are reused by the cells below.
for organ in organs:
    train[organ] = train[f'{organ}_low'] + 2 * train[f'{organ}_high']

# Class counts of the three organ targets, on a log scale.
plt.figure(figsize=(15, 3))

for panel, organ in enumerate(organs, start=1):
    plt.subplot(1, 3, panel)
    sns.countplot(x=train[organ])
    plt.yscale('log')
    plt.xticks([0, 1, 2], ['healthy', 'low','high'])

plt.show()

In [None]:
# A patient is healthy when none of the five targets reports an injury.
healthy = (
    (train['kidney'] == 0) &
    (train['liver'] == 0) &
    (train['spleen'] == 0)&
    (train['bowel_injury'] == 0) &
    (train['extravasation_injury'] == 0)
)

# Sanity check: any_injury should be the negation of `healthy`.
# The result is printed because a bare expression in the middle of a cell is discarded.
labels_consistent = (train['any_injury'] == ~healthy).all()
print(f"any_injury consistent with the other targets: {labels_consistent}")

plt.figure(figsize=(5, 3))
sns.countplot(x=train['any_injury'])
plt.show()

In [None]:
def get_weight(row):
    """
    Placeholder for a per-row weight computation.

    The function was left without a body, which made the cell a syntax error.
    It now fails explicitly when called until it is implemented.

    Args:
        row: Row to compute the weight of.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("get_weight is not implemented yet")

In [None]:
tgts = ['kidney', 'liver', 'spleen', 'bowel_injury', 'extravasation_injury']


def _short(target):
    # First '_'-separated word of the target name, cut to 6 characters.
    return target.split("_")[0][:6]


# For every pair of targets, print the number of injured patients for each target
# and the number of patients injured for both.
for first_idx, first in enumerate(tgts):
    first_injured = train[first] > 0
    for second in tgts[first_idx + 1:]:
        second_injured = train[second] > 0
        print(
            f'{_short(first)}\t:', first_injured.sum(),
            f' \t{_short(second)}\t:', second_injured.sum(),
            f' \t{_short(first)} & {_short(second)}\t:', (first_injured & second_injured).sum()
        )

### Metric

In [None]:
from sklearn.metrics import log_loss
from util.metrics import *

In [None]:
log_loss([2, 0, 0, 1], [[.1, 0, .9], [.9, .1, 0], [.8, .2, 0], [.35, .65, 0]])

In [None]:
from sklearn.metrics import log_loss
log_loss([0, 0, 0, 1], [[1, 0,], [.9, .1], [.8, .2], [.35, .65]], labels=[0, 1])

In [None]:
from sklearn.metrics import log_loss
log_loss([0, 0, 0, 1], [0, .1, .2, .65], labels=[0, 1])

In [None]:
from sklearn.metrics import log_loss
log_loss([[0, 0, 1], [1, 0, 0], [1, 0, 0], [0, 1, 0]],  [[.1, 0, .9], [.9, .1, 0], [.8, .2, 0], [.35, .65, 0]])

In [None]:
# Random predictions used to smoke-test the metric: two single-column outputs and
# three 3-column outputs, for 5 samples. A seeded generator keeps the cell
# reproducible across kernel restarts.
rng = np.random.default_rng(0)

preds = [
    rng.random((5, 1)),
    rng.random((5, 1)),
    rng.random((5, 3)),
    rng.random((5, 3)),
    rng.random((5, 3)),
]

In [None]:
preds

In [None]:
losses, avg_loss = rsna_loss(preds, train.head(5))

In [None]:
losses, avg_loss

In [None]:
# Save the patient-level table. At this point `train` also holds the combined
# kidney / liver / spleen columns added in the Target EDA section.
train.to_csv('../input/df_train.csv', index=False)

In [None]:
train.head()

Done ! 