In [1]:
# Notebook setup: nb_black formatting is left disabled; autoreload is enabled.
# %load_ext nb_black
# autoreload mode 2 re-imports modules before each cell runs, so edits to the
# local project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Initialization

### Imports

In [2]:
import os
import sys
import cv2
import json
import glob
import torch
import py_wsi
import tifffile
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

from tqdm.notebook import tqdm
from collections import Counter
from matplotlib import pyplot as plt

# Extend the import path with the project source folder; presumably this is
# where the `params`, `utils` and `data` modules imported below live.
sys.path.append("../code/")

In [3]:
from params import *
from utils.rle import *

from data.dataset import load_image

from utils.metrics import dice_scores_img
from utils.plots import plot_heatmap_preds, plot_contours_preds

### Load

In [4]:
# Folder holding one "<image id>.json" polygon annotation file per image.
ANNOT_PATH = DATA_PATH + "annotation_v3/"

# Per-image metadata (queried below for `width_pixels` / `height_pixels`).
df_info = pd.read_csv(DATA_PATH + "HuBMAP-20-dataset_information.csv")
# Training masks: one row per image `id` with its RLE `encoding`.
df_mask = pd.read_csv(DATA_PATH + "train.csv")

In [5]:
# Switches for the "Train data" cell below.
# PLOT: when True, the first annotated image is displayed, the loop stops
#       and nothing is written to disk.
PLOT = False
# ADD_FC: when True, annotations labelled "FC" are also drawn (for one
#         specific image only) and the output file is named "train_fc.csv"
#         instead of "train_fix.csv".
ADD_FC = True

### Train data

In [6]:
# Merge the JSON polygon annotations into the training masks and re-encode the
# result as RLE. Images without an annotation file keep their original encoding.

# The only image whose "FC"-labelled polygons are ever drawn (when ADD_FC is set).
FC_IMAGE_ID = "aaa6a05cc"

new_df = df_mask.copy().set_index('id')

for id_ in tqdm(df_mask['id']):
    print(f' -> {id_}')

    annot_file = ANNOT_PATH + id_ + ".json"
    if not os.path.exists(annot_file):
        # Nothing to add for this image.
        continue

    # Context manager so the file handle is closed deterministically.
    with open(annot_file, 'r') as f:
        annot = json.load(f)

    w, h = df_info[df_info['image_file'] == id_ + '.tiff'][['width_pixels', 'height_pixels']].values[0]

    rle = df_mask[df_mask['id'] == id_]['encoding']

    # The decoded mask is accumulated into a freshly allocated array because
    # drawing directly on enc2mask's output did not work (original author's
    # note) - presumably a memory-layout issue with cv2; TODO confirm.
    mask = np.zeros((h, w), dtype=np.uint8)
    mask += enc2mask(rle, (w, h)).astype(np.uint8)

    added = 0
    for info in annot:
        label = info['properties']['classification']['name']

        # "FC" polygons are skipped unless ADD_FC is set AND this is FC_IMAGE_ID.
        if label == "FC" and not (ADD_FC and id_ == FC_IMAGE_ID):
            continue

        poly = info['geometry']['coordinates']
        try:
            mask = cv2.fillPoly(mask, np.int32([poly]), 1)
        except ValueError:
            # Coordinates that cannot be stacked into one regular array
            # (several rings of different lengths): flatten all rings into a
            # single point list before filling.
            poly = np.concatenate([np.array(ring).squeeze() for ring in poly])
            mask = cv2.fillPoly(mask, np.int32([poly]), 1)
        added += 1

    print(f"Added {added} glomerulis")

    # Only the encoding column is overwritten, not the whole row.
    new_df.loc[id_, 'encoding'] = rle_encode_less_memory(mask)

    if PLOT:
        img = load_image(os.path.join(TIFF_PATH_4, id_ + ".tiff"), full_size=False)

        # Downscale the mask by 4 to match the image loaded from TIFF_PATH_4.
        mask_small = cv2.resize(
            mask,
            (w // 4, h // 4),
            interpolation=cv2.INTER_NEAREST,
        )
        assert mask_small.shape == img.shape[:2], (mask_small.shape, img.shape)

        fig = plot_contours_preds(img, mask_small, w=1, downsize=4)

        # Figure size in pixels, keeping the mask's aspect ratio. Separate
        # names avoid clobbering the image dimensions `w` / `h`.
        fig_w = 1000
        fig_h = int(fig_w * mask_small.shape[0] / mask_small.shape[1])
        fig.update_layout(
            autosize=False,
            width=fig_w,
            height=fig_h,
        )

        fig.show()

        # Plot mode only previews the first annotated image.
        break

if not PLOT:
    name = "train_fc.csv" if ADD_FC else "train_fix.csv"
    new_df.to_csv(DATA_PATH + name)

    print(f'\n -> Saved masks to {DATA_PATH + name}')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=15.0), HTML(value='')))

 -> 2f6ecfcdf
Added 0 glomerulis
 -> 8242609fa
Added 4 glomerulis
 -> aaa6a05cc
Added 25 glomerulis
 -> cb2d976f4
Added 1 glomerulis
 -> b9a3865fc
Added 4 glomerulis
 -> b2dc8411c
Added 1 glomerulis
 -> 0486052bb
 -> e79de561c
Added 0 glomerulis
 -> 095bf7a1f
Added 1 glomerulis
 -> 54f2eec69
Added 1 glomerulis
 -> 4ef6695ce
Added 1 glomerulis
 -> 26dc41664
 -> c68fe75ea
Added 2 glomerulis
 -> afa5e8098
Added 17 glomerulis
 -> 1e2425f28
Added 2 glomerulis


 -> Saved masks to ../input/train_fc.csv


### Extra data

In [23]:
# Switches for the "Extra data" cells below.
# PLOT: when True, the first annotated slide is displayed and the loop stops;
#       the final CSV is not written.
PLOT = False
# SAVE_TIFF: when True (and SAVE is True), the half-resolution image is also
#            written to "extra_tiff/<id>.tiff".
SAVE_TIFF = False
# SAVE: when True, images/masks are downscaled by 2 and the mask RLE is kept.
SAVE = True
# ADD_FC: when False, annotations labelled "FC" are skipped; also selects the
#         output file name ("train_extra_fc.csv" vs "train_extra.csv").
ADD_FC = True

In [24]:
# Build a binary mask for every extra whole-slide image that has a JSON
# annotation file, and collect the half-resolution masks as RLE in `rles`.
turtle = py_wsi.Turtle(
    DATA_PATH + "extra/",
    DATA_PATH + "extra/",
    "extra",
)

rles = {}

for file in tqdm(turtle.files):
    # Strip the extension whatever its length (same result as file[:-4] for ".svs").
    id_ = os.path.splitext(file)[0]
    print(f' -> {id_}')

    annot_file = ANNOT_PATH + id_ + ".json"
    if not os.path.exists(annot_file):
        # Slides without annotations are skipped entirely.
        continue

    # Read the slide at the last level reported by py_wsi.
    level_count, _, level_dims = turtle.retrieve_tile_dimensions(file, patch_size=-1)
    shape = level_dims[level_count - 1]

    img = turtle.retrieve_sample_patch(file, shape[0], level_count - 1)
    img = np.array(img, dtype=np.uint8)

    # Context manager so the file handle is closed deterministically.
    with open(annot_file, 'r') as f:
        annot = json.load(f)

    # `shape` is reversed to build the mask array - presumably (width, height)
    # from py_wsi versus numpy's (rows, cols); TODO confirm.
    mask = np.zeros(shape[::-1], dtype=np.uint8)

    added = 0
    for info in annot:
        # Annotations without a classification are treated as label "G".
        try:
            label = info['properties']['classification']['name']
        except KeyError:
            label = "G"

        if not ADD_FC and label == "FC":
            continue

        # NOTE: the coordinates are deliberately NOT wrapped in np.array()
        # up-front: multi-ring annotations are ragged, which makes NumPy warn
        # (and raise on NumPy >= 1.24). The ragged case is handled below.
        poly = info['geometry']['coordinates']
        try:
            mask = cv2.fillPoly(mask, np.int32([poly]), 1)
        except ValueError:
            # Rings of different lengths: flatten them into one point list.
            poly = np.concatenate([np.array(ring).squeeze() for ring in poly])
            mask = cv2.fillPoly(mask, np.int32([poly]), 1)
        added += 1

    # Sanity check: at least one polygon was drawn and the mask stayed binary.
    assert mask.max() == 1

    print(f"Added {added} glomerulis")

    if PLOT:
        print('plot')
        fig = plot_contours_preds(img, mask, w=2, downsize=8)

        # Figure size in pixels, keeping the mask's aspect ratio.
        fig_w = 1000
        fig_h = int(fig_w * mask.shape[0] / mask.shape[1])
        fig.update_layout(
            autosize=False,
            width=fig_w,
            height=fig_h,
        )

        fig.show()

        # Plot mode only previews the first annotated slide.
        break

    if SAVE:
        # Image and mask are both downscaled by a factor of 2.
        print(img.shape)
        img = cv2.resize(
            img,
            (img.shape[1] // 2, img.shape[0] // 2),
            interpolation=cv2.INTER_AREA,
        )
        print(img.shape)

        if SAVE_TIFF:
            tiff_dir = DATA_PATH + "extra_tiff/"
            # exist_ok avoids the separate existence check.
            os.makedirs(tiff_dir, exist_ok=True)
            tifffile.imsave(tiff_dir + f"{id_}.tiff", img)

        print(mask.shape)
        # Nearest-neighbour interpolation keeps the mask values binary.
        mask = cv2.resize(
            mask,
            (mask.shape[1] // 2, mask.shape[0] // 2),
            interpolation=cv2.INTER_NEAREST,
        )
        print(mask.shape)

        rles[id_] = rle_encode_less_memory(mask)

31 WSI found in directory.


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=31.0), HTML(value='')))

 -> SESCAM_1_0
Setting patch size -1 and tile size -1


  poly = np.array(info['geometry']['coordinates'])


Added 61 glomerulis
 -> SESCAM_102
Setting patch size -1 and tile size -1
Added 23 glomerulis
 -> SAS_21904_001
Setting patch size -1 and tile size -1
Added 10 glomerulis
 -> SAS_21908_001
Setting patch size -1 and tile size -1
Added 7 glomerulis
 -> VUHSK_1502
Setting patch size -1 and tile size -1
Added 36 glomerulis
 -> SESCAM_7_0
Setting patch size -1 and tile size -1
Added 18 glomerulis
 -> VUHSK_2072
Setting patch size -1 and tile size -1
Added 20 glomerulis
 -> SESCAM_9_0
Setting patch size -1 and tile size -1
Added 45 glomerulis
 -> SAS_21937_001
Setting patch size -1 and tile size -1
Added 17 glomerulis
 -> SAS_21883_001
Setting patch size -1 and tile size -1
Added 11 glomerulis
 -> SAS_21891_001
Setting patch size -1 and tile size -1
Added 13 glomerulis
 -> VUHSK_1912
Setting patch size -1 and tile size -1
Added 114 glomerulis
 -> SESCAM_5_0
Setting patch size -1 and tile size -1
Added 36 glomerulis
 -> SESCAM_2_0
Setting patch size -1 and tile size -1
Added 23 glomerulis
 ->

In [25]:
# One row per annotated extra slide: index = slide id, single column = RLE mask.
df_annot_extra = pd.DataFrame.from_dict(
    rles,
    orient='index',
    columns=['encoding'],
)

# The CSV is only written for a saving, non-plotting run.
if SAVE and not PLOT:
    if ADD_FC:
        name = "train_extra_fc.csv"
    else:
        name = "train_extra.csv"
    df_annot_extra.to_csv(DATA_PATH + name)
    print(f'\n -> Saved masks to {DATA_PATH + name}')


 -> Saved masks to ../input/train_extra_fc.csv
