In [None]:
!pip install monai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting monai
  Downloading monai-0.9.0-202206131636-py3-none-any.whl (939 kB)
[K     |████████████████████████████████| 939 kB 14.3 MB/s 
Installing collected packages: monai
Successfully installed monai-0.9.0


In [None]:
import os
# Mount Google Drive so the files stored there are reachable from the Colab VM
from google.colab import drive
drive.mount('/content/drive')

# The relative paths used later in the notebook (DATA_DIR, all_train_images.pickle, ./3d_data/)
# are resolved against this working directory.
os.chdir('/content/drive/MyDrive/open_directory/test/IMAGE')
!pwd

Mounted at /content/drive
/content/drive/MyDrive/open_directory/test/IMAGE


In [None]:
import sys

sys.path.append('../monai-v081/')

import pandas as pd
import json
import torch
import os
import numpy as np
from glob import glob

## Process train df

In [None]:
# Load the training dataframe from train.csv.
# The way the df is processed follows:
# https://www.kaggle.com/code/dschettler8845/uwmgit-deeplabv3-end-to-end-pipeline-tf

DATA_DIR = "./dataset/uw-madison-gi-tract-image-segmentation/"

TRAIN_CSV = os.path.join(DATA_DIR,"train.csv")
train_df = pd.read_csv(TRAIN_CSV)

# Get all training images (disabled: the list is read from all_train_images.pickle in the next cell)
# all_train_images = glob(os.path.join(DATA_DIR, "train", "**", "*.png"), recursive=True)

In [None]:
import pickle
# Load the list of training image paths (presumably a cached result of the glob that is commented out above).
# NOTE(review): pickle.load can execute arbitrary code; only load a pickle file you created yourself.
with open("all_train_images.pickle","rb") as fr:
  all_train_images =pickle.load(fr)

In [None]:
all_train_images[:1]

['./dataset/uw-madison-gi-tract-image-segmentation/train/case15/case15_day0/scans/slice_0006_266_266_1.50_1.50.png']

In [None]:
from sklearn.model_selection import StratifiedGroupKFold
import numpy as np

def get_filepath_from_partial_identifier(_ident, file_list):
    """Return the first entry of `file_list` that contains `_ident` as a substring.

    Raises:
        IndexError: if no entry contains `_ident`.
    """
    matches = []
    for candidate in file_list:
        if _ident in candidate:
            matches.append(candidate)
    return matches[0]

# Preprocess the dataframe loaded from the csv
def df_preprocessing(df, globbed_file_list, is_test=False):
    """Expand the raw dataframe into per-slice metadata plus a cross-validation fold.

    The caller's ``df`` is not modified; all work happens on derived frames.

    Args:
        df (pd.DataFrame): Raw frame. The columns ``id``, ``class`` and
            ``segmentation`` are read; ``id`` is split on ``_`` into case, day
            and slice parts (e.g. ``case123_day20_slice_0001``).
        globbed_file_list (list[str]): Paths of the scan images. File names are
            parsed as ``<slice id>_<int>_<int>_<float>_<float>.png``.
        is_test (bool): When False (default) the per-class rows of an id are
            collapsed into a single row carrying ``lb_seg_rle``, ``sb_seg_rle``
            and ``st_seg_rle``. When True that step is skipped and ``class`` is
            kept in the output.

    Returns:
        pd.DataFrame: The columns, in order and limited to those that exist:
        ``id``, (``class`` if ``is_test``), ``f_path``, ``lb_seg_rle``,
        ``sb_seg_rle``, ``st_seg_rle``, ``slice_h``, ``slice_w``,
        ``px_spacing_h``, ``px_spacing_w``, ``case_id_str``, ``day_num_str``,
        ``slice_id``, ``fold``. Rows whose id has no matching file in
        ``globbed_file_list`` are dropped by the inner merge in step 4.

    Note:
        Reads the notebook-level ``CFG.n_fold`` / ``CFG.seed`` and uses
        ``StratifiedGroupKFold``; both must be defined before the call.
    """
    # Needed for the folds: total RLE string length per id (0 means nothing is annotated).
    # `assign` returns a new frame, so no helper column is added to the caller's dataframe,
    # and `.sum()` replaces `agg(sum)` (passing the builtin is a deprecated code path in pandas).
    df = df.assign(rle_len=df["segmentation"].fillna("").map(len))
    df2 = df.groupby(["id"])["rle_len"].sum().to_frame().reset_index()  # total length of all rles of each id
    df = df.drop(columns=["rle_len"])  # merged back in step 8, once there is one row per id

    # The preprocessing steps applied to get column information
    # 1. Get Case-ID as a column, e.g. id = case123_day20_slice_0001 -> case_id_str = case123
    df["case_id_str"] = df["id"].apply(lambda x: x.split("_", 2)[0])

    # 2. Get Day as a column, e.g. day20
    df["day_num_str"] = df["id"].apply(lambda x: x.split("_", 2)[1])

    # 3. Get Slice Identifier as a column, e.g. slice_0001
    df["slice_id"] = df["id"].apply(lambda x: x.split("_", 2)[2])

    # 4. Get full file paths for the representative scans ("f_path" column).
    #    The path prefix (everything above the case directory) is taken from the first globbed file.
    df["_partial_ident"] = (globbed_file_list[0].rsplit("/", 4)[0]+"/"+ # /kaggle/input/uw-madison-gi-tract-image-segmentation/train/
                           df["case_id_str"]+"/"+ # .../case###/
                           df["case_id_str"]+"_"+df["day_num_str"]+ # .../case###_day##/
                           "/scans/"+df["slice_id"]) # .../slice_####
    _tmp_merge_df = pd.DataFrame({"_partial_ident":[x.rsplit("_",4)[0] for x in globbed_file_list], "f_path":globbed_file_list})
    df = df.merge(_tmp_merge_df, on="_partial_ident").drop(columns=["_partial_ident"])

    # 5. Get slice dimensions from filepath (int in pixels):
    #    slice_h is the first integer in the file name, slice_w the second.
    df["slice_w"] = df["f_path"].apply(lambda x: int(x[:-4].rsplit("_",4)[2]))
    df["slice_h"] = df["f_path"].apply(lambda x: int(x[:-4].rsplit("_",4)[1]))

    # 6. Pixel spacing from filepath (float in mm), e.g. 1.50 / 1.63
    df["px_spacing_h"] = df["f_path"].apply(lambda x: float(x[:-4].rsplit("_",4)[3]))
    df["px_spacing_w"] = df["f_path"].apply(lambda x: float(x[:-4].rsplit("_",4)[4]))

    # Default path (is_test is False)
    if not is_test:
        # 7. Merge 3 Rows Into A Single Row (As This/Segmentation-RLE Is The Only Unique Information Across Those Rows)
        l_bowel_df = df[df["class"]=="large_bowel"][["id", "segmentation"]].rename(columns={"segmentation":"lb_seg_rle"})
        s_bowel_df = df[df["class"]=="small_bowel"][["id", "segmentation"]].rename(columns={"segmentation":"sb_seg_rle"})
        stomach_df = df[df["class"]=="stomach"][["id", "segmentation"]].rename(columns={"segmentation":"st_seg_rle"})
        df = df.merge(l_bowel_df, on="id", how="left")
        df = df.merge(s_bowel_df, on="id", how="left")
        df = df.merge(stomach_df, on="id", how="left")
        df = df.drop_duplicates(subset=["id",]).reset_index(drop=True) # keep one row per id

    # 8. [for the folds] attach the per-id RLE length and flag ids without any mask
    df = df.merge(df2, on=["id"])
    df["empty"] = (df.rle_len==0) # empty masks

    # 9. Create the folds: stratified on `empty`, grouped by case so that all rows of a case
    #    land in the same fold.
    skf = StratifiedGroupKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
    for fold, (train_idx, val_idx) in enumerate(skf.split(df, df["empty"], groups = df["case_id_str"])):
        # val_idx holds row positions; they equal the index labels here because the merge in
        # step 8 produced a fresh 0..n-1 index. Each row stores the fold in which it is validation data.
        df.loc[val_idx, 'fold'] = fold

    df["fold"] = df["fold"].astype(np.uint8)

    # 10. Reorder columns to the a new ordering (drops class and segmentation as no longer necessary)
    new_col_order = ["id", "f_path",
                     "lb_seg_rle",
                     "sb_seg_rle",
                     "st_seg_rle",
                     "slice_h", "slice_w", "px_spacing_h",
                     "px_spacing_w", "case_id_str",
                     "day_num_str", "slice_id","fold"]
    if is_test: new_col_order.insert(1, "class")
    new_col_order = [_c for _c in new_col_order if _c in df.columns]
    df = df[new_col_order]

    return df

In [None]:
# Drop the duplicated files whose names contain parentheses; they cause errors further down.
# The filter builds a new list and writes it back in place. Calling .remove() while iterating
# over the same list skips the element that follows every removal, which is why the previous
# loop had to be run twice.
all_train_images[:] = [path for path in all_train_images if '(' not in path]
len(all_train_images)

38496

fold 만들기

In [None]:
class CFG:
    """Notebook configuration; only the fold-splitting settings are active.

    `df_preprocessing` reads `n_fold` and `seed` when it builds the
    StratifiedGroupKFold split.
    """
    n_fold        = 5    # number of cross-validation folds
    seed          = 101  # random_state of the shuffled fold split

    # Training hyper-parameters kept for reference; none of them is active.
    # debug         = False # set debug=False for Full Training
    # exp_name      = 'Baselinev2'
    # comment       = 'unet-efficientnet_b1-224x224-aug2-split2'
    # model_name    = 'Unet'
    # backbone      = 'efficientnet-b1'
    # train_bs      = 128
    # valid_bs      = train_bs*2
    # img_size      = [224, 224]
    # epochs        = 15
    # lr            = 2e-3
    # scheduler     = 'CosineAnnealingLR'
    # min_lr        = 1e-6
    # T_max         = int(30000/train_bs*epochs)+50
    # T_0           = 25
    # warmup_epochs = 0
    # wd            = 1e-6
    # n_accumulate  = max(1, 32//train_bs)
    # num_classes   = 3
    # device        = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE: this rebinds `train_df` to the processed frame. The processed frame no longer has the
# `segmentation` column that df_preprocessing reads, so reload train.csv before re-running this cell.
train_df = df_preprocessing(train_df, all_train_images)
train_df.head()

Unnamed: 0,id,f_path,lb_seg_rle,sb_seg_rle,st_seg_rle,slice_h,slice_w,px_spacing_h,px_spacing_w,case_id_str,day_num_str,slice_id,fold
0,case123_day20_slice_0001,./dataset/uw-madison-gi-tract-image-segmentati...,,,,266,266,1.5,1.5,case123,day20,slice_0001,3
1,case123_day20_slice_0002,./dataset/uw-madison-gi-tract-image-segmentati...,,,,266,266,1.5,1.5,case123,day20,slice_0002,3
2,case123_day20_slice_0003,./dataset/uw-madison-gi-tract-image-segmentati...,,,,266,266,1.5,1.5,case123,day20,slice_0003,3
3,case123_day20_slice_0004,./dataset/uw-madison-gi-tract-image-segmentati...,,,,266,266,1.5,1.5,case123,day20,slice_0004,3
4,case123_day20_slice_0005,./dataset/uw-madison-gi-tract-image-segmentati...,,,,266,266,1.5,1.5,case123,day20,slice_0005,3


In [None]:
 # ref: https://www.kaggle.com/paulorzp/run-length-encode-and-decode
# modified from: https://www.kaggle.com/inversion/run-length-decoding-quick-start
def rle_decode(mask_rle, shape, color=1):
    """Decode a run-length-encoded mask string into an array.

    Args:
        mask_rle (str): Whitespace-separated "start length" pairs. Starts are
            1-based positions in the flattened mask.
        shape (tuple): Shape of the returned array, either (h, w) or (h, w, d).
            With three entries every run fills all `d` entries of the covered
            positions.
        color: Value written into the positions covered by a run (default 1).

    Returns:
        np.ndarray: float32 array of shape `shape`; `color` inside the runs,
        0 everywhere else.
    """
    # Tokens alternate between start (even positions) and run length (odd positions)
    tokens = np.array(mask_rle.split(), dtype=int)
    run_starts = tokens[0::2] - 1          # convert the 1-based starts to 0-based
    run_ends = run_starts + tokens[1::2]   # exclusive end of every run

    # RLE describes a 1D run, so paint into a flattened buffer first
    if len(shape) == 3:
        h, w, d = shape
        flat_shape = (h * w, d)
    else:
        h, w = shape
        flat_shape = (h * w,)
    flat = np.zeros(flat_shape, dtype=np.float32)

    for begin, end in zip(run_starts, run_ends):
        flat[begin:end] = color

    # Restore the requested shape
    return flat.reshape(shape)

In [None]:
def load_img_mask(l):
    """Load one slice and decode its three organ masks.

    Args:
        l: One row of the preprocessed dataframe. The attributes ``f_path``,
            ``slice_h``, ``slice_w``, ``lb_seg_rle``, ``sb_seg_rle`` and
            ``st_seg_rle`` are read.

    Returns:
        tuple: ``(image, all_mask, mask_arr)`` where
            - ``image`` is ``loader(l.f_path)[0]``;
            - ``all_mask`` is a uint8 array stacking the large-bowel,
              small-bowel and stomach masks (in that order) along axis 0;
            - ``mask_arr`` is one multiclass mask with 1 = large bowel,
              2 = small bowel, 3 = stomach and 0 elsewhere.

    Raises:
        AssertionError: if the loaded image shape differs from
            ``(l.slice_h, l.slice_w)``.
    """
    img_data = loader(l.f_path)
    img = img_data[0]
    img_h, img_w = img.shape
    # Guard: the size recorded in the dataframe must match the image that was actually loaded.
    assert (l.slice_h, l.slice_w) == (img_h, img_w)
    # The masks are decoded with the two image dimensions swapped.
    wh_shape = (img_w, img_h)

    def _organ_mask(rle):
        # A missing RLE yields an all-zero mask; otherwise the RLE is decoded.
        if pd.isna(rle):
            return np.zeros(wh_shape)
        return rle_decode(rle, wh_shape)

    lb_mask = _organ_mask(l.lb_seg_rle)
    sb_mask = _organ_mask(l.sb_seg_rle)
    st_mask = _organ_mask(l.st_seg_rle)

    # Multi-label mask: one channel per organ, stacked along axis 0.
    all_mask = np.stack([lb_mask, sb_mask, st_mask], axis=0).astype(np.uint8)

    # Multiclass mask: the stomach is multiplied by 3 because 3 is its class id. The later
    # np.where calls overwrite it, so on overlaps large bowel wins over small bowel over stomach.
    mask_arr = st_mask * 3
    mask_arr = np.where(sb_mask == 1, 2, mask_arr)
    mask_arr = np.where(lb_mask == 1, 1, mask_arr)

    return img, all_mask, mask_arr

In [None]:
from monai.transforms import LoadImage
from monai.data import NibabelWriter

# Shared image reader called by load_img_mask; created with LoadImage's default arguments.
loader = LoadImage()

## Load 3D images and masks and save to Nibabel format

The NIfTI format (written via Nibabel) is used because spacing information can be stored in it, and it can then be used with some MONAI transforms.
Both multi-label masks (for validation) and multi-class masks (for training) are produced, since I found it hard to tune a multi-label 3D model.

In [None]:
output_dir = "./3d_data/"

In [None]:
data_3d_info = []  # one record per (case, day) volume; meant to be turned into a DataFrame
ct = 0
for (case_id_str, day_num_str), day_df in train_df.groupby(["case_id_str", "day_num_str"]):
    group_id = case_id_str + "_" + day_num_str
    group_df = day_df.sort_values("slice_id", ascending=True)  # stack the slices in slice order

    # Load every slice of this scan together with its masks
    case_3d_img, case_3d_mask, case_3d_mask_multiclass = [], [], []
    for idx in range(group_df.shape[0]):
        slc_img, slc_mask, slc_multiclass_mask = load_img_mask(group_df.iloc[idx])
        case_3d_img.append(slc_img)
        case_3d_mask.append(slc_mask)
        case_3d_mask_multiclass.append(slc_multiclass_mask)

    # The slice index becomes the last (depth) axis
    case_3d_img = np.stack(case_3d_img, axis=-1)
    case_3d_mask = np.stack(case_3d_mask, axis=-1)
    case_3d_mask = np.transpose(case_3d_mask, [2, 1, 3, 0]) # c w h d to h w d c
    case_3d_mask_multiclass = np.stack(case_3d_mask_multiclass, axis=-1)
    case_3d_mask_multiclass = np.transpose(case_3d_mask_multiclass, [1, 0, 2]) # w h d to h w d

    # Sanity check: casting to uint8 must not change any mask value
    assert np.all(case_3d_mask.astype(np.uint8) == case_3d_mask)
    case_3d_mask = case_3d_mask.astype(np.uint8)

    # Report (but do not stop on) volumes whose mask and image shapes disagree
    if case_3d_mask.shape[:-1] != case_3d_img.shape:
        print("shape not match on group: ", group_id)

    # In-plane spacing taken from the first row of the group, e.g. 1.5
    group_spacing = day_df[["px_spacing_h"]].values[0][0]

    # Affine = 4x4 identity scaled by the in-plane spacing, i.e. diag(s, s, s, s) ...
    group_affine = np.eye(4) * group_spacing
    # Update: https://www.kaggle.com/competitions/uw-madison-gi-tract-image-segmentation/discussion/319053
    # all z-axis spacing is 3
    group_affine[-2][-2] = 3.0
    # ... and the last diagonal entry of a 4x4 affine must be 1, giving diag(s, s, 3, 1)
    group_affine[-1][-1] = 1.0

    group_fold = day_df[["fold"]].values[0][0]

    group_root_dir = os.path.join(output_dir, "train", case_id_str, group_id)
    # exist_ok=True lets the cell be re-run: without it the second run stops with FileExistsError
    os.makedirs(group_root_dir, exist_ok=True)

    image_path = f"{group_root_dir}/{group_id}_image.nii.gz"
    mask_path = f"{group_root_dir}/{group_id}_mask.nii.gz"
    mask_multiclass_path = f"{group_root_dir}/{group_id}_mask_multiclass.nii.gz"

    # write image (NibabelWriter writes the data array together with its metadata)
    # NOTE(review): the image is written as int16; confirm the source intensities fit that range.
    writer = NibabelWriter()
    writer.set_data_array(case_3d_img, channel_dim=None)
    writer.set_metadata({"affine": group_affine, "original_affine": group_affine, "dtype": np.int16})
    writer.write(image_path, verbose=False)

    # write mask (multi-label, channels last)
    writer = NibabelWriter()
    writer.set_data_array(case_3d_mask, channel_dim=-1)
    writer.set_metadata({"affine": group_affine, "original_affine": group_affine, "dtype": np.uint8})
    writer.write(mask_path, verbose=False)

    # write mask multiclass
    writer = NibabelWriter()
    writer.set_data_array(case_3d_mask_multiclass, channel_dim=None)
    writer.set_metadata({"affine": group_affine, "original_affine": group_affine, "dtype": np.uint8})
    writer.write(mask_multiclass_path, verbose=False)

    data_3d_info.append({
        "id": group_id,
        "fold": group_fold,
        "image_path": image_path,
        "mask_path": mask_path,
        "mask_multiclass_path": mask_multiclass_path,
    })

    ct += 1
    print("finish: ", ct, " shape: ", case_3d_mask.shape) # h w d c


In [None]:
# data_3d_info = pd.DataFrame(data_3d_info)

In [None]:
# data_3d_info.to_csv("data_3d_info.csv", index=False)

In [None]:
data_3d_info = pd.read_csv('data_3d_info.csv')

In [None]:
for fold in range(5):
    # Hold out `fold` for validation and train on the remaining folds.
    # Note: `train_df` is rebound here to a subset of data_3d_info.
    val_df = data_3d_info[data_3d_info["fold"] == fold]
    train_df = data_3d_info[data_3d_info["fold"] != fold]

    # Rows are read by position; this assumes the column order
    # id, fold, image_path, mask_path, mask_multiclass_path.
    train_data = [
        {"image": line[2], "mask": line[3], "mask_multiclass": line[4], "id": line[0]}
        for line in train_df.values
    ]
    val_data = [
        {"image": line[2], "mask": line[3], "mask_multiclass": line[4], "id": line[0]}
        for line in val_df.values
    ]

    # `all_data` is overwritten on every iteration; with the dump below disabled,
    # only the split of the last fold is left after the loop.
    all_data = {"train": train_data, "val": val_data}

    # with open(f"dataset_3d_fold_{fold}.json", 'w') as f:
    #     json.dump(all_data, f)

In [None]:
# `all_data` is a plain dict, which has no DataFrame-style .info(); show the number of samples per split instead.
{split: len(items) for split, items in all_data.items()}

NameError: ignored

### 두두두두두두두두둗

./3d_data/

### 연습

In [None]:
# Practice: collect the image paths and the mask paths of the training split into two lists,
# printing the running index of every entry.
image = []
mask = []
for a, b in enumerate(all_data['train']):
  image.append(b['image'])
  mask.append(b['mask'])
  print(a)

In [None]:
mask

In [None]:
train_df[:1]

Unnamed: 0,id,fold,image_path,mask_path,mask_multiclass_path
0,case101_day20,1,/kaggle/working/train/case101/case101_day20/ca...,/kaggle/working/train/case101/case101_day20/ca...,/kaggle/working/train/case101/case101_day20/ca...


In [None]:
all_data.keys()

NameError: ignored

In [None]:
all_train_images

In [None]:
OUTPUT_CHANNELS = 3

In [None]:
import tensorflow as tf

In [None]:
def normalize(input_image, input_mask):
  """Scale the image by 1/255 and shift the mask labels down by one.

  Args:
    input_image: Image values; cast to float32 and divided by 255.
      # assumes intensities in the 0-255 range so the result lies in [0, 1]; TODO confirm
    input_mask: Label mask; every value is decreased by 1.

  Returns:
    Tuple ``(input_image, input_mask)`` with the transformed values.
  """
  input_image = tf.cast(input_image, tf.float32) / 255.0
  # Build a new mask instead of using `input_mask -= 1`, which would modify a NumPy array
  # passed by the caller in place.
  input_mask = input_mask - 1
  return input_image, input_mask

In [None]:
# When loading, resize image and mask to (128, 128), flip both left-right with probability 0.5,
# then normalize them.
# (@tf.function would compile the Python function into a TensorFlow graph; it is left disabled.)
# @tf.function
def load_image_train(datapoint):
  """Prepare one training sample.

  Args:
    datapoint: Mapping with the keys 'image' and 'segmentation_mask'.

  Returns:
    Tuple ``(input_image, input_mask)``, both resized to 128x128 and passed through `normalize`.
  """
  input_image = tf.image.resize(datapoint['image'], (128, 128))
  # Masks hold class ids: nearest-neighbour keeps them intact, whereas the default bilinear
  # resize would blend neighbouring labels into meaningless in-between values.
  input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128), method='nearest')

  if tf.random.uniform(()) > 0.5: # flip left-right with 50% probability (image and mask together)
    input_image = tf.image.flip_left_right(input_image)
    input_mask = tf.image.flip_left_right(input_mask)

  input_image, input_mask = normalize(input_image, input_mask)  # image / 255, mask - 1

  return input_image, input_mask

# @tf.function
def load_image_test(datapoint):
  """Prepare one evaluation sample: resize to 128x128 and normalize, without augmentation.

  Args:
    datapoint: Mapping with the keys 'image' and 'segmentation_mask'.

  Returns:
    Tuple ``(input_image, input_mask)``.
  """
  input_image = tf.image.resize(datapoint['image'], (128, 128))
  # Masks hold class ids: nearest-neighbour keeps them intact, whereas the default bilinear
  # resize would blend neighbouring labels into meaningless in-between values.
  input_mask = tf.image.resize(datapoint['segmentation_mask'], (128, 128), method='nearest')

  input_image, input_mask = normalize(input_image, input_mask)

  return input_image, input_mask

In [None]:
base_model = tf.keras.applications.MobileNetV2(input_shape=[128, 128, 3], include_top=False)

# Use the activations of these layers
layer_names = [
    'block_1_expand_relu',   # 64x64 x  96
    'block_3_expand_relu',   # 32x32 x 144
    'block_6_expand_relu',   # 16x16 x 192
    'block_13_expand_relu',  # 8x8   x 576
    'block_16_project',      # 4x4   x 320
]
layerOuts = [base_model.get_layer(name).output for name in layer_names]

# Build the feature-extraction model
down_stack = tf.keras.Model(inputs=base_model.input, outputs=layerOuts) # inputs: MobileNetV2's input / outputs: the activations of the layers listed above

# Freeze the feature extractor so its weights are not updated
down_stack.trainable = False

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_128_no_top.h5


In [None]:
# The entries were written as `.upsample(...)` with nothing in front of the dot, which is a
# syntax error. Presumably the missing prefix was an external helper module that this notebook
# never imports (TODO confirm), so an equivalent block is defined here with tf.keras only.
def upsample(filters, size):
  """Return a block that doubles the spatial size: Conv2DTranspose (stride 2) -> BatchNorm -> ReLU.

  Args:
    filters: Number of output channels.
    size: Kernel size of the transposed convolution.
  """
  initializer = tf.random_normal_initializer(0., 0.02)
  block = tf.keras.Sequential()
  block.add(tf.keras.layers.Conv2DTranspose(filters, size, strides=2, padding='same',
                                            kernel_initializer=initializer, use_bias=False))
  block.add(tf.keras.layers.BatchNormalization())
  block.add(tf.keras.layers.ReLU())
  return block

up_stack = [
    upsample(512, 3),  # 4x4 -> 8x8 x 512 (each upsample doubles the image size)
    upsample(256, 3),  # 8x8 -> 16x16 x 256
    upsample(128, 3),  # 16x16 -> 32x32 x 128
    upsample(64, 3),   # 32x32 -> 64x64 x 64
]

NameError: ignored

In [None]:
all_data['train']

[{'id': 'case101_day20',
  'image': '/kaggle/working/train/case101/case101_day20/case101_day20_image.nii.gz',
  'mask': '/kaggle/working/train/case101/case101_day20/case101_day20_mask.nii.gz',
  'mask_multiclass': '/kaggle/working/train/case101/case101_day20/case101_day20_mask_multiclass.nii.gz'},
 {'id': 'case101_day22',
  'image': '/kaggle/working/train/case101/case101_day22/case101_day22_image.nii.gz',
  'mask': '/kaggle/working/train/case101/case101_day22/case101_day22_mask.nii.gz',
  'mask_multiclass': '/kaggle/working/train/case101/case101_day22/case101_day22_mask_multiclass.nii.gz'},
 {'id': 'case101_day26',
  'image': '/kaggle/working/train/case101/case101_day26/case101_day26_image.nii.gz',
  'mask': '/kaggle/working/train/case101/case101_day26/case101_day26_mask.nii.gz',
  'mask_multiclass': '/kaggle/working/train/case101/case101_day26/case101_day26_mask_multiclass.nii.gz'},
 {'id': 'case101_day32',
  'image': '/kaggle/working/train/case101/case101_day32/case101_day32_image.ni