# FTUs⚕️Segm: EDA🔎 & viewer

In [None]:
# Build wheels for the two GitHub archives (pip also builds wheels for their
# dependencies) and collect them in ./frozen_packages.
!pip wheel -q "https://github.com/Borda/kaggle_image-segm/archive/refs/heads/main.zip" --wheel-dir frozen_packages
!pip wheel -q "https://github.com/PyTorchLightning/lightning-flash/archive/refs/heads/segm/multi-label.zip" --wheel-dir frozen_packages
# Delete every wheel whose file name starts with "torch"
# (presumably to rely on the PyTorch build already present in the environment — TODO confirm).
!rm frozen_packages/torch*
# Sanity check: list the wheels whose names contain "kaggle" or "lightning".
!ls -l frozen_packages | grep -e kaggle -e lightning
# Install strictly from the local wheel folder; --no-index disables the package index.
!pip install -q 'kaggle-image-segmentation' --find-links frozen_packages --no-index

In [None]:
import os, glob
import pandas as pd
import matplotlib.pyplot as plt

# Root folder of the competition data; image paths in later cells are built from it.
DATASET_FOLDER = "/kaggle/input/hubmap-organ-segmentation"
# Training metadata table; later cells read columns such as "id", "rle",
# "img_height" and "img_width" from it.
path_csv = os.path.join(DATASET_FOLDER, "train.csv")
df_train = pd.read_csv(path_csv)
# Preview the first rows. `display` is not imported here; it is expected to be
# provided by the notebook (IPython) environment.
display(df_train.head())

# Explore metadata

In [None]:
# One pie chart per categorical column, side by side, with percentage labels.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
for ax, col in zip(axes, ["organ", "sex"]):
    counts = df_train[[col]].value_counts()
    _ = counts.plot.pie(ax=ax, autopct='%1.1f%%', ylabel=col)

In [None]:
# Share of each distinct value for three acquisition-related columns, one pie each.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(12, 4))
for ax, col in zip(axes, ["data_source", "tissue_thickness", "pixel_size"]):
    counts = df_train[[col]].value_counts()
    _ = counts.plot.pie(ax=ax, autopct='%1.1f%%', ylabel=col)

In [None]:
# Histogram of the "age" column using 35 bins; assigning to `_` suppresses the
# returned axes array from being echoed as cell output.
_= df_train[["age"]].hist(bins=35, figsize=(8, 4))

## Image size histograms

In [None]:
# Two stacked histograms: image heights on top, image widths below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(8, 5))
for ax, col in zip(axes, ["img_height", "img_width"]):
    _ = df_train[[col]].hist(ax=ax, bins=35)

In [None]:
# Number of rows for each distinct (img_height, img_width) combination.
df_train.groupby(["img_height", "img_width"]).size()

# Show some images

In [None]:
import numpy as np

def rle_decode(mask_rle: str, img_shape: tuple = None) -> np.ndarray:
    """Decode a run-length-encoded mask into a binary array.

    The encoding is a whitespace-separated sequence of ``start length`` pairs.
    Each pair marks ``length`` consecutive elements of the flattened mask,
    beginning at offset ``start``.

    Args:
        mask_rle: the encoded mask, e.g. ``"1 3 10 2"``; an empty string
            yields an all-zero mask.
        img_shape: shape of the returned array. Required in practice even
            though the signature defaults to ``None``.

    Returns:
        ``uint8`` array of shape ``img_shape`` holding 1 inside the runs and
        0 everywhere else.

    Raises:
        ValueError: if ``img_shape`` is ``None``.
        AssertionError: if the encoding holds an odd number of values.
    """
    if img_shape is None:
        raise ValueError("img_shape is required to decode an RLE mask")
    seq = mask_rle.split()
    starts = np.array([int(tok) for tok in seq[0::2]], dtype=np.int64)
    lengths = np.array([int(tok) for tok in seq[1::2]], dtype=np.int64)
    assert len(starts) == len(lengths), "RLE must hold an even number of values"
    # NOTE(review): starts are used as 0-based offsets exactly as given; if the
    # encoding is 1-based, every run lands one element too late — confirm.
    ends = starts + lengths
    # np.prod instead of the np.product alias, which NumPy 2.0 removed.
    img = np.zeros((int(np.prod(img_shape)),), dtype=np.uint8)
    for begin, end in zip(starts, ends):
        img[begin:end] = 1
    return img.reshape(img_shape)

In [None]:
import matplotlib.pyplot as plt
from skimage import color

# Overlay the decoded mask on each of the first 14 training images (7 x 2 grid).
fig, axes = plt.subplots(nrows=7, ncols=2, figsize=(9, 30))
# Pair panels with rows by position rather than by DataFrame index label: the
# labels are only 0..13 while the index is untouched, so a filtered, sorted or
# shuffled frame would otherwise pick wrong panels or raise IndexError.
# axes.ravel() walks the grid row by row, i.e. panel k is axes[k // 2, k % 2].
for ax, (_, row) in zip(axes.ravel(), df_train.iterrows()):
    img = plt.imread(os.path.join(DATASET_FOLDER, "train_images", f"{row['id']}.tiff"))
    mask = rle_decode(row['rle'], img_shape=(row["img_height"], row["img_width"]))
    # NOTE(review): mask.T has shape (img_width, img_height); overlaying it on
    # img presumably relies on the images being square — confirm.
    overlay = color.label2rgb(mask.T, img, bg_label=0, bg_color=(1., 1., 1.), alpha=0.25)
    ax.imshow(overlay)
    ax.set_axis_off()
fig.tight_layout()

# Export masks

In [None]:
! mkdir -p ./train_masks

from PIL import Image
from tqdm.auto import tqdm

for _, row in tqdm(df_train.iterrows(), total=len(df_train)):
    mask = rle_decode(row['rle'], img_shape=(row["img_height"], row["img_width"]))
    segm_path = os.path.join("train_masks", f"{row['id']}.png")
    Image.fromarray(mask.T).save(segm_path)
    # plt.imsave(segm_path, mask.T)

In [None]:
# Sanity-check the export: first 4 training images (left column) next to the
# mask PNGs read back from train_masks (right column).
fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(9, 18))
# Iterating the 4 x 2 axes grid yields one (left, right) pair per grid row.
# Rows are paired by position instead of by DataFrame index label, so the cell
# keeps working when the index is not 0..n-1.
for (ax_img, ax_mask), (_, row) in zip(axes, df_train.iterrows()):
    img = plt.imread(os.path.join(DATASET_FOLDER, "train_images", f"{row['id']}.tiff"))
    ax_img.imshow(img)
    mask = np.array(Image.open(os.path.join("train_masks", f"{row['id']}.png")))
    ax_mask.imshow(mask)
    # Print the distinct pixel values stored in the PNG.
    print(np.unique(mask))
    ax_img.set_axis_off()
    ax_mask.set_axis_off()
fig.tight_layout()