In [None]:
# %load_ext nb_black
%load_ext autoreload
%autoreload 2

## Initialization

### Imports

In [None]:
import os
import sys
import cv2
import json
import glob
import torch
import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px

from tqdm.notebook import tqdm
from collections import Counter
from matplotlib import pyplot as plt

sys.path.append("../code/")

In [None]:
from params import *
from utils.rle import *

from data.dataset import load_image

from utils.metrics import dice_scores_img
from utils.plots import plot_heatmap_preds, plot_contours_preds

### Load

In [None]:
# Per-image metadata; later cells read its `image_file`, `width_pixels` and
# `height_pixels` columns.
df_info = pd.read_csv(DATA_PATH + "HuBMAP-20-dataset_information.csv")
# Training annotations; later cells read its `id` and `encoding` columns.
df_mask = pd.read_csv(DATA_PATH + "train_4.csv")
# df = pd.read_csv(OUT_PATH + "df_images.csv")

### Data

In [None]:
# NOTE(review): none of the names assigned in this cell are referenced again in the
# visible part of the notebook — confirm they are still needed.
root = TIFF_PATH_4
rle_path = DATA_PATH + "train_4.csv"  # same file that is read into `df_mask`
reduce_factor = 1
rles = pd.read_csv(rle_path)

### Experiment

In [None]:
log_folder = "../logs/2021-04-05/4/"  # b1

In [None]:
class Config:
    """
    Simple namespace that exposes keyword arguments as instance attributes.
    """

    def __init__(self, **entries):
        """
        Args:
            **entries: Attribute names and the values to store under them.
        """
        vars(self).update(entries)

# Read the configuration stored in the experiment folder. The context manager
# closes the file, which a bare `open(...)` passed to `json.load` does not.
with open(log_folder + 'config.json', 'r') as config_file:
    config = json.load(config_file)

# Wrap the dictionary so that its entries are reachable as attributes.
config = Config(**config)

In [None]:
# NOTE(review): `global_threshold` is not referenced in the visible cells, which
# binarize with `THRESHOLD` instead — confirm whether it is still needed.
global_threshold = 0.4

In [None]:
# Paths of the predictions saved in the experiment folder. `glob.glob` returns
# them in arbitrary order, so sort them to keep the listing reproducible.
preds = sorted(glob.glob(log_folder + "pred_*.npy"))

In [None]:
preds

## Train viz

In [None]:
# Cut-off applied to the probability maps to obtain binary predictions.
THRESHOLD = 0.4

In [None]:
# Image identifiers.
# NOTE(review): `NAMES` is not referenced in the visible cells — presumably kept as a
# handy list of values for `mask_name`; confirm.
NAMES = [
    "b9a3865fc",
    "aaa6a05cc",
    "e79de561c",
    "8242609fa",
    "2f6ecfcdf",
    "0486052bb",
    "26dc41664",
    "afa5e8098",
    "54f2eec69",
    "cb2d976f4",
    "4ef6695ce",
    "095bf7a1f",
    "1e2425f28",
    "c68fe75ea",
    "b2dc8411c",
]

In [None]:
# Identifier of the training image to visualize.
mask_name = "aaa6a05cc"

# Position in `preds` of the first path containing that identifier.
# The trailing `[0]` raises an IndexError when no path matches.
idx = [i for i, path in enumerate(preds) if mask_name in path][0]

In [None]:
# Load the saved probabilities as float32, then binarize them with THRESHOLD.
probs = np.load(preds[idx]).astype(np.float32)
pred = (probs > THRESHOLD).astype(np.uint8)

In [None]:
# Load the training image matching `mask_name`.
# NOTE(review): `full_size=False` presumably selects the downscaled image — confirm in
# data.dataset.load_image.
img = load_image(os.path.join(TIFF_PATH_4, mask_name + ".tiff"), full_size=False)

In [None]:
# Ground-truth encoding of the selected image (a pandas Series, not a bare string).
rle = df_mask[df_mask['id'] == mask_name]['encoding']
# Decode it; the size is given as (img.shape[1], img.shape[0]).
mask = enc2mask(rle, (img.shape[1], img.shape[0]))

In [None]:
mask.shape, img.shape, pred.shape

In [None]:
# Score returned by `dice_scores_img` for the thresholded prediction against the
# ground-truth mask.
score = dice_scores_img(pred , mask)
print(f'Score for downscaled image is {score:.4f}')

### Plot

In [None]:
# Figure width in pixels; the height follows from the mask's height / width ratio.
w = 1000
h = int(w *  mask.shape[0] / mask.shape[1])

In [None]:
# NOTE(review): presumably overlays the contours of `pred` and `mask` on `img` — see
# utils.plots.plot_contours_preds for the exact rendering and the `w` / `downsize` arguments.
fig = plot_contours_preds(img, pred, mask, w=2, downsize=2)

# Fix the figure size to the `w` x `h` computed in the previous cell.
fig.update_layout(
    autosize=False,
    width=w,
    height=h,
)

fig.show()

In [None]:
# fig = plot_heatmap_preds(img, probs, mask, w=1, downsize=2)

# fig.update_layout(
#     autosize=False,
#     width=w,
#     height=h,
# )

# fig.show()

## Test viz

### Load image

In [None]:
# The sample submission provides the identifiers of the test images (`id` column).
df_test = pd.read_csv(DATA_PATH + "sample_submission.csv")

# Displayed as the cell output.
list(df_test['id'])

In [None]:
# Test image to visualize and the fold whose prediction is loaded.
name = '57512b7f1'
fold = 0

# DATA_PATH is concatenated without a separator elsewhere in the notebook, so the
# former f-string inserted a second one. os.path.join avoids that and matches how
# the training image path is built.
img = load_image(os.path.join(DATA_PATH, 'test_4', name + '.tiff'), full_size=False)

In [None]:
# Prediction of the chosen fold for the test image, loaded as float32 and binarized
# with THRESHOLD.
probs = np.load(log_folder + f'pred_{name}_{fold}.npy').astype(np.float32)
pred = (probs > THRESHOLD).astype(np.uint8)

## From submissions

In [None]:
sub = pd.read_csv(OUT_PATH + "submission_930.csv")
sub2 = pd.read_csv(OUT_PATH + "submission_0929.csv")

# For every test image, decode the mask of both submissions and print the score
# returned by `dice_scores_img` between the two.
for id_ in df_test['id']:
    # (width_pixels, height_pixels) of the image, taken from the metadata table.
    shape = df_info.loc[
        df_info['image_file'] == id_ + '.tiff', ['width_pixels', 'height_pixels']
    ].values[0]

    rle = sub.loc[sub['id'] == id_, 'predicted'].values
    rle_2 = sub2.loc[sub2['id'] == id_, 'predicted'].values

    sub_mask = enc2mask(rle, shape)
    sub_mask_2 = enc2mask(rle_2, shape)

    print(id_, dice_scores_img(sub_mask, sub_mask_2))

In [None]:
# (width_pixels, height_pixels) of the test image `name`, as a NumPy array.
shape = df_info[df_info['image_file'] == name + '.tiff'][['width_pixels', 'height_pixels']].values[0]

In [None]:
sub = pd.read_csv(OUT_PATH + "submission_930.csv")
rle = sub.loc[sub['id'] == name, 'predicted'].values

# Decode the submitted mask at the size given by `shape` ...
sub_mask = enc2mask(rle, shape)

# ... then shrink it to a quarter of that size along both axes. Nearest-neighbour
# interpolation is requested for the resize.
sub_mask = cv2.resize(sub_mask, tuple(shape // 4), interpolation=cv2.INTER_NEAREST)

In [None]:
# NOTE(review): `sub` is rebound here to the second submission.
sub = pd.read_csv(OUT_PATH + "submission_0929.csv")
rle = sub.loc[sub['id'] == name, 'predicted'].values

# Decode the submitted mask at the size given by `shape` ...
sub_mask2 = enc2mask(rle, shape)

# ... then shrink it to a quarter of that size along both axes. Nearest-neighbour
# interpolation is requested for the resize.
sub_mask2 = cv2.resize(sub_mask2, tuple(shape // 4), interpolation=cv2.INTER_NEAREST)

### Plot

In [None]:
# Figure width in pixels; the height follows from the prediction's height / width ratio.
w = 1000
h = int(w * pred.shape[0] / pred.shape[1])

In [None]:
# fig = plot_contours_preds(img, pred, mask=None, w=2, downsize=4)
# The second submission is passed in the `mask` slot, so the figure compares the two
# submissions with each other rather than a prediction with a ground truth.
fig = plot_contours_preds(img, sub_mask, mask=sub_mask2, w=2, downsize=2)

# Fix the figure size to the `w` x `h` computed in the previous cell.
fig.update_layout(
    autosize=False,
    width=w,
    height=h,
)

fig.show()

# Post-processing

In [None]:
def post_process_mask(probs, threshold_max=0.5, threshold_prob=0.4, threshold_comp=0.3, plot=True):
    """
    Filters the connected components of a probability map.
    The map is binarized at threshold_comp and split into 8-connected components.
    A component is kept only if its highest probability is above threshold_max;
    the pixels of a kept component are then binarized at threshold_prob.

    Args:
        probs (np array): Probability map.
        threshold_max (float, optional): Value the maximum of a component must exceed. Defaults to 0.5.
        threshold_prob (float, optional): Threshold applied inside kept components. Defaults to 0.4.
        threshold_comp (float, optional): Threshold used to extract the components. Defaults to 0.3.
        plot (bool, optional): Whether to plot the histogram of the component maxima. Defaults to True.

    Returns:
        np uint8 array: Post-processed binary mask, with the shape of probs.
    """
    binary = (probs > threshold_comp).astype(np.uint8)
    n_labels, labels = cv2.connectedComponents(binary, connectivity=8)

    processed_mask = np.zeros(binary.shape, np.uint8)

    removed = 0
    maxs = []
    for label in tqdm(range(1, n_labels)):  # label 0 is skipped
        region = labels == label
        region_probs = probs[region]

        peak = region_probs.max()
        maxs.append(peak)

        if not peak > threshold_max:  # reject component
            removed += 1
            continue

        processed_mask[region] = region_probs > threshold_prob

    print(f'Removed {removed} components.')

    if plot:
        # Distribution of the component maxima, with the rejection threshold marked.
        plt.figure(figsize=(15, 5))
        sns.histplot(maxs, bins=50)
        plt.axvline(threshold_max, color="salmon")
        plt.show()

    return processed_mask

In [None]:
# Settings for `post_process_mask`. In the visible cells they are only consumed by
# the commented-out evaluation loop that follows.
# NOTE: THRESHOLD_MAX, THRESHOLD_PROB and PLOT are assigned again in the vote blend
# section further down, so their values depend on which cell ran last.
THRESHOLD_MAX = 0.9
THRESHOLD_COMP = 0.4
THRESHOLD_PROB = 0.2

PLOT = False

In [None]:
# scores_before = []
# scores_after = []

# for idx, pred in enumerate(preds):
#     mask_name = pred.split('/')[-1].split('_')[1][:-4]
#     print(f'\n  -> Mask {mask_name}')
    
#     rle = df_mask[df_mask['id'] == mask_name]['encoding']
#     img = load_image(os.path.join(TIFF_PATH_4, mask_name + ".tiff"), full_size=False)
#     mask = enc2mask(rle, (img.shape[1], img.shape[0]))
    
#     probs = np.load(pred)
#     pred_mask = (probs > 0.4).astype(np.uint8)

#     pred_pp = post_process_mask(
#         probs, 
#         threshold_comp=THRESHOLD_COMP, 
#         threshold_max=THRESHOLD_MAX, 
#         threshold_prob=THRESHOLD_PROB,
#         plot=PLOT,
#     )
    
#     scores_before.append(dice_scores_img(pred_mask, mask))
#     scores_after.append(dice_scores_img(pred_pp, mask))
    
#     print(f'Score before PP : {scores_before[-1] :.4f}')
#     print(f'Score after PP :  {scores_after[-1] :.4f}')

In [None]:
# print(f'CV before PP : {np.mean(scores_before) :.4f}')
# print(f'CV after PP :  {np.mean(scores_after) :.4f}')

# Glomeruli vote blend

In [None]:
def get_disjoined_components(mask, k=3, iterations=10):
    """
    Labels the 8-connected components of a mask.
    When iterations is non-zero, the mask is eroded before the labelling and the
    label image is dilated back by the same amount, both with a k x k kernel of ones.
    NOTE(review): presumably meant to separate touching objects; where dilated labels
    meet, cv2.dilate should keep the larger label — confirm.

    Args:
        mask (np uint8 array): Binary mask.
        k (int, optional): Side of the square kernel. Defaults to 3.
        iterations (int, optional): Number of erosions / dilations, 0 to disable. Defaults to 10.

    Returns:
        int: Number of components, as returned by cv2.connectedComponents.
        np array: Label image. Cast to uint16 when iterations is non-zero.
    """
    if not iterations:
        num_components, components = cv2.connectedComponents(mask, connectivity=8)
        return num_components, components

    kernel = np.ones((k, k), np.uint8)

    eroded = cv2.erode(mask, kernel=kernel, iterations=iterations)
    num_components, components = cv2.connectedComponents(eroded, connectivity=8)

    # The label image is cast to uint16 before the dilation.
    components = cv2.dilate(components.astype(np.uint16), kernel=kernel, iterations=iterations)

    return num_components, components

In [None]:
def get_disjoined_components_2(mask, k=3, iterations=10, plot=True):
    """
    Labels the components of a mask and tries to split the ones that are too big.
    The mask is eroded, labelled with 8-connectivity, and the labels are dilated back.
    Every component covering more than 50 ** 2 pixels is eroded 5 more times;
    if this splits it into several parts, the component is replaced by these parts.
    Pixels of a split component that are not recovered by the dilation of its parts
    are set to 0.

    The returned labels are contiguous (1 ... num_components - 1) so that callers can
    iterate with range(1, num_components).

    Args:
        mask (np uint8 array): Binary mask.
        k (int, optional): Side of the square kernel. Defaults to 3.
        iterations (int, optional): Number of erosions / dilations. Defaults to 10.
        plot (bool, optional): Whether to plot each split component. Defaults to True.

    Returns:
        int: Number of labels, counting the background label 0.
        np uint16 array: Label image.
    """
    kernel = np.ones((k, k), np.uint8)

    mask = cv2.erode(mask, kernel=kernel, iterations=iterations)
    num_components, components = cv2.connectedComponents(mask, connectivity=8)
    components = cv2.dilate(components.astype(np.uint16), kernel=kernel, iterations=iterations)

    x_coords = np.arange(mask.shape[0])
    y_coords = np.arange(mask.shape[1])

    # New labels are allocated after the existing ones so that they never collide.
    current_idx = num_components
    for c in tqdm(range(1, num_components)):
        component = (components == c)

        if component.sum() > 50 ** 2:  # too big!
            x_comp = x_coords[component.sum(1) > 0][[0, -1]]
            y_comp = y_coords[component.sum(0) > 0][[0, -1]]
            # Clamp the start of the crop: a negative start would wrap around.
            box = (max(x_comp[0] - 10, 0), x_comp[1] + 10, max(y_comp[0] - 10, 0), y_comp[1] + 10)

            component = cv2.erode(
                component.astype(np.uint8),
                kernel=kernel,
                iterations=iterations + 5
            )

            num_components_bis, components_bis = cv2.connectedComponents(component, connectivity=8)

            if num_components_bis > 1:
                components_bis = cv2.dilate(
                    components_bis.astype(np.uint16),
                    kernel=kernel,
                    iterations=iterations + 5
                )

                if plot:
                    # Component before (left) and after (right) the split.
                    plt.figure(figsize=(10, 5))
                    plt.subplot(1, 3, 1)
                    plt.imshow(components[box[0]: box[1], box[2]: box[3]])
                    plt.subplot(1, 3, 2)
                    plt.imshow(components_bis[box[0]: box[1], box[2]: box[3]])
                    plt.show()

                # Replace the component with its parts, shifted to unused labels.
                components[components == c] = 0
                components_bis[components_bis > 0] += current_idx
                current_idx += num_components_bis

                components += components_bis.astype(np.uint16)

    # Splitting leaves gaps in the labels: remap them to 1 ... n, keeping 0 as background.
    labels = np.unique(components)
    labels = labels[labels > 0]
    lookup = np.zeros(int(components.max()) + 1, dtype=components.dtype)
    lookup[labels] = np.arange(1, len(labels) + 1)
    components = lookup[components]

    num_components = len(labels) + 1

    return num_components, components

In [None]:
import joblib
from numba import jit

@jit(nopython=True)
def get_component_votes(probs, component, threshold_max, x0, x1, y0, y1):
    """
    Computes one vote per probability map for a component.
    Both the component and each map are cropped to [x0: x1, y0: y1] and multiplied;
    the vote is whether the maximum of that product exceeds threshold_max.
    NOTE(review): not called in the visible cells, glomeruli_vote computes its votes inline.

    Args:
        probs: Iterable of probability maps.
        component: Component mask. Assumed to be boolean or 0/1 - TODO confirm.
        threshold_max: Value the maximum must exceed.
        x0, x1, y0, y1: Bounds of the crop.

    Returns:
        list: One boolean per element of probs.
    """
    return [np.max(component[x0: x1, y0: y1] * prob[x0: x1, y0: y1]) > threshold_max for prob in probs]

def get_component_vote(prob, component, threshold_max):
    """
    Tells whether a probability map reaches a high enough value inside a component.

    Args:
        prob (np array): Probability map.
        component (np array): Index selecting the pixels of the component in prob.
        threshold_max (float): Value the maximum must exceed.

    Returns:
        bool: Whether the maximum of prob over the component is above threshold_max.
    """
    values = prob[component]
    return np.max(values) > threshold_max

In [None]:
def glomeruli_vote(probs, img, threshold_vote=0.5, threshold_prob=0.4, threshold_max=0.7, plot=True, k=3, iterations=10):
    """
    Blends several probability maps with a vote per component.
    Components are extracted from the union of the maps binarized at threshold_prob.
    A map votes for a component if its maximum inside it is above threshold_max.
    A component is kept if at least threshold_vote * len(probs) maps vote for it,
    and is then filled with the mean of the maps that voted for it.

    Args:
        probs (list of np arrays): Probability maps of identical 2D shape.
        img (np array): Image, only used for plotting.
        threshold_vote (float, optional): Proportion of votes needed to keep a component. Defaults to 0.5.
        threshold_prob (float, optional): Threshold used to build the union mask. Defaults to 0.4.
        threshold_max (float, optional): Value a map's maximum must exceed to vote. Defaults to 0.7.
        plot (bool, optional): Whether to plot every 50th component. Defaults to True.
        k (int, optional): Side of the square kernel. Defaults to 3.
        iterations (int, optional): Iterations used to disjoin components, 0 to disable. Defaults to 10.

    Returns:
        np float32 array: Blended probabilities, 0 outside of the kept components.
    """
    probs = np.array(probs)

    # Get all glomerulis
    mask_union = ((probs > threshold_prob).sum(0) > 0).astype(np.uint8)

    num_components, components = get_disjoined_components(
        mask_union, k=k, iterations=iterations
    )

    x_coords = np.arange(mask_union.shape[0])
    y_coords = np.arange(mask_union.shape[1])

    processed_mask = np.zeros(mask_union.shape, np.float32)

    # Kept components are dilated 5 times when iterations is non-zero. Work on a crop
    # padded by more than the dilation can reach, so the grown border is not cut off.
    border_iterations = 5
    pad = border_iterations * k

    for c in tqdm(range(1, num_components)):
        component = (components == c)

        # A label can be missing from the label image after its dilation; skip it
        # instead of failing on an empty bounding box.
        if not component.any():
            continue

        x0, x1 = x_coords[component.sum(1) > 0][[0, -1]]
        y0, y1 = y_coords[component.sum(0) > 0][[0, -1]]
        x1 += 1
        y1 += 1

        component_probs = [prob[x0: x1, y0: y1][component[x0: x1, y0: y1]] for prob in probs]

#         votes = [np.percentile(component_prob, 95) > threshold_max for component_prob in component_probs]
        votes = [np.max(component_prob) > threshold_max for component_prob in component_probs]
        vote = np.sum(votes) >= threshold_vote * len(probs)

        # `any(votes)` guards the mean below against an empty selection, which
        # happens when threshold_vote <= 0 and would fill the crop with NaNs.
        if vote and any(votes):
            if iterations:  # compensate for flaws at the border
                bx0, bx1 = max(x0 - pad, 0), min(x1 + pad, mask_union.shape[0])
                by0, by1 = max(y0 - pad, 0), min(y1 + pad, mask_union.shape[1])
                region = cv2.dilate(
                    component[bx0: bx1, by0: by1].astype(np.uint8),
                    kernel=np.ones((k, k), np.uint8),
                    iterations=border_iterations,
                )
            else:
                bx0, bx1, by0, by1 = x0, x1, y0, y1
                region = component[x0: x1, y0: y1]

            # Values are accumulated, so overlapping regions add up.
            processed_mask[bx0: bx1, by0: by1] += region * np.mean(
                [probs[i][bx0: bx1, by0: by1] for i, v in enumerate(votes) if v], 0
            )

        if plot and not c % 50:
            # Crop with a 10 pixels margin. The start is clamped: a negative start would wrap around.
            box = (max(x0 - 10, 0), x1 + 9, max(y0 - 10, 0), y1 + 9)

            glom = img[box[0]: box[1], box[2]: box[3]]
            glom_orig = mask_union[box[0]: box[1], box[2]: box[3]]
            glom_proc = components[box[0]: box[1], box[2]: box[3]] == c

            print(votes, '->', vote)
            plt.figure(figsize=(15, 5))
            plt.subplot(1, 3, 1)
            plt.imshow(glom)
            plt.subplot(1, 3, 2)
            plt.imshow(glom_orig)
            plt.subplot(1, 3, 3)
            plt.imshow(glom_proc)
            plt.show()

    return processed_mask

In [None]:
# Experiment folders whose predictions are blended in the cross-validation loop.
# Commented entries are excluded from the blend.
log_folders = [
    "../logs/2021-04-01/2/",
    "../logs/2021-04-02/3/",
    "../logs/2021-04-05/4/",
    "../logs/2021-04-04/1/",
    "../logs/2021-04-08/2/",
#     "../logs/2021-04-08/5/",
#     "../logs/2021-04-11/3/",
#     "../logs/2021-04-14/0/",
]

In [None]:
# Settings for `glomeruli_vote` and for the final binarization (THRESHOLD).
# NOTE: THRESHOLD_PROB, THRESHOLD_MAX, THRESHOLD and PLOT are also assigned in
# earlier cells; these values replace them once this cell has run.
THRESHOLD_VOTE = 0.5
THRESHOLD_PROB = 0.4
THRESHOLD_MAX = 0.5
THRESHOLD = 0.4

PLOT = False

In [None]:
# Per-image scores of the plain mean blend and of the vote blend.
scores_blend = []
scores_vote = []

# NOTE: `img`, `mask`, `probs`, `pred_blend` and `pred_vote` are rebound at every
# iteration; the plotting cell at the end of the notebook uses the values left by
# the last one.
for idx, mask_name in enumerate(df_mask['id']):
    print(f'\n  -> Mask {mask_name}')
    
    rle = df_mask[df_mask['id'] == mask_name]['encoding']
    img = load_image(os.path.join(TIFF_PATH_4, mask_name + ".tiff"), full_size=False)
    mask = enc2mask(rle, (img.shape[1], img.shape[0]))
    
    # One probability map per experiment folder.
    probs = [np.load(log_folder + f'pred_{mask_name}.npy').astype(np.float32) for log_folder in log_folders]
    # Baseline: mean of the maps, binarized with THRESHOLD.
    pred_blend = (np.mean(probs, 0) > THRESHOLD).astype(np.uint8)

    # iterations=0 turns off the erosion / dilation used to disjoin components.
    pred_vote = glomeruli_vote(
        probs,
        img, 
        threshold_vote=THRESHOLD_VOTE,
        threshold_prob=THRESHOLD_PROB,
        threshold_max=THRESHOLD_MAX,
        plot=PLOT,
        iterations=0,
        k=3,
    )
    
    scores_blend.append(dice_scores_img(pred_blend, mask))
    scores_vote.append(dice_scores_img(pred_vote > THRESHOLD, mask))
    
    print(f'Score with blending : {scores_blend[-1] :.4f}')
    print(f'Score with voting :  {scores_vote[-1] :.4f}')
    
    
#     break

In [None]:
print(f'CV with blending : {np.mean(scores_blend) :.4f}')
print(f'CV with voting   :  {np.mean(scores_vote) :.4f}')

In [None]:
# NOTE(review): this cell repeats the previous one verbatim — presumably kept to
# compare the output of two runs; confirm whether it can be dropped.
print(f'CV with blending : {np.mean(scores_blend) :.4f}')
print(f'CV with voting   :  {np.mean(scores_vote) :.4f}')

In [None]:
# Plots the image handled by the last iteration of the cross-validation loop:
# `img`, `mask`, `pred_blend` and `pred_vote` are the values it left behind.
w = 1000
h = int(w *  mask.shape[0] / mask.shape[1])

# The thresholded vote blend is passed in the ground-truth slot used by the earlier
# plotting cell, so the figure compares the two blends with each other.
fig = plot_contours_preds(img, pred_blend, pred_vote > THRESHOLD, w=2, downsize=2)

fig.update_layout(
    autosize=False,
    width=w,
    height=h,
)

fig.show()