**About** : This notebook performs a more in-depth analysis of model performance.

In [None]:
%load_ext autoreload
%autoreload 2

## Initialization

### Imports

In [None]:
import os
import sys
import cv2
import json
import glob
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from tqdm.notebook import tqdm

sys.path.append("../code/")

In [None]:
from params import *
from utils.rle import *

from data.dataset import load_image

from utils.metrics import dice_scores_img
from utils.plots import plot_heatmap_preds, plot_contours_preds

### Data

In [None]:
# Cut-off applied to the predicted maps to obtain binary predictions.
THRESHOLD = 0.5

In [None]:
# Tables shipped with the dataset; DATA_PATH comes from `params`.
# df_mask provides the `id` and `encoding` columns used further down.
df_info = pd.read_csv(DATA_PATH + "HuBMAP-20-dataset_information.csv")
df_mask = pd.read_csv(DATA_PATH + "train_2_fix.csv")
df_test = pd.read_csv(DATA_PATH + "sample_submission.csv")

### Experiment

In [None]:
# Folder of the experiment to analyse. The cells below read `config.json` and
# `pred_*.npy` from it, and cache the computed metrics in `metrics.json`.
log_folder = "../logs/2021-05-06/1/"  # b1

In [None]:
class Config:
    """
    Plain attribute container.

    Every keyword argument given to the constructor is stored as an
    attribute of the instance under the same name.
    """
    def __init__(self, **entries):
        vars(self).update(entries)

# Load the experiment configuration and expose its keys as attributes.
# The context manager makes sure the file handle is closed.
with open(log_folder + 'config.json', 'r') as f:
    config = Config(**json.load(f))

In [None]:
# Paths of the saved predictions of the experiment.
# glob returns paths in arbitrary order; sorting keeps the "first match"
# lookups below reproducible from one run to the next.
preds = sorted(glob.glob(log_folder + "pred_*.npy"))

### Centers

In [None]:
def get_centers_pred(pred, threshold=0.5, min_area=100):
    """
    Locates the connected components of a thresholded prediction map.

    Args:
        pred (np.ndarray [H x W]): Prediction map (probabilities or binary mask).
        threshold (float, optional): Pixels strictly above it are foreground. Defaults to 0.5.
        min_area (int, optional): Components with fewer pixels are dropped as artefacts. Defaults to 100.

    Returns:
        np.ndarray [N x 2]: (row, col) center of the bounding box, padded by one
            pixel and clipped to the image, of each kept component.
            Has shape (0, 2) when no component is kept.
        list [N]: Maximum value of `pred` inside each kept component.
    """
    mask = (pred > threshold).astype(np.uint8)
    _, components, stats = cv2.connectedComponentsWithStats(mask, connectivity=8)[:3]

    centers, maxs = [], []

    # The first row of stats is skipped (label 0), so labels start at 1.
    for label, (left, top, width, height, area) in enumerate(stats[1:], start=1):
        if area < min_area:  # remove artefacts
            continue

        x0, y0 = max(left - 1, 0), max(top - 1, 0)
        x1 = min(left + width + 1, mask.shape[1])
        y1 = min(top + height + 1, mask.shape[0])

        centers.append([(y1 + y0) / 2, (x0 + x1) / 2])

        # Compare labels on the crop only instead of on the whole image.
        comp = components[y0:y1, x0:x1] == label
        maxs.append((pred[y0:y1, x0:x1] * comp).max())

    # reshape keeps a well-formed (0, 2) array when nothing was detected,
    # where concatenating an empty list would raise.
    return np.array(centers, dtype=float).reshape(-1, 2), maxs

In [None]:
def get_centers_truth(json):
    """
    Computes the bounding-box center of every annotated polygon.

    Args:
        json (list of dicts): Annotations. Each entry stores its polygon under
            ['geometry']['coordinates']. The name shadows the `json` module
            inside this function only.

    Returns:
        np.ndarray [N x 2]: Centers, with the last axis reversed with respect
            to the stored coordinates (presumably (x, y) -> (row, col), TODO confirm).
            Has shape (0, 2) when there are no annotations.
    """
    centers = []
    for gt in json:
        coords = np.asarray(gt['geometry']['coordinates'])
        lower = coords.min(1)[:, ::-1]
        upper = coords.max(1)[:, ::-1]

        centers.append((lower + upper) / 2)

    # np.concatenate raises on an empty list.
    if not centers:
        return np.zeros((0, 2))

    return np.concatenate(centers)

In [None]:
def count_detected_glomerulis(centers, mask):
    """
    Sums the values of a mask at the given center locations.

    Args:
        centers (iterable of np.ndarray): Centers; each one is truncated to
            integers and used as an index into mask.
        mask (np.ndarray): Array to look the centers up in.

    Returns:
        NumPy scalar: Sum of the mask values found at the centers.
    """
    hits = []
    for center in centers:
        location = tuple(center.astype(int))
        hits.append(mask[location])
    return np.sum(hits)

In [None]:
def compute_glomeruli_level_stats(mask, gt_json, pred, reduce=1):
    """
    Computes object-level detection counts for one image.

    Args:
        mask (np.ndarray [H x W]): Ground-truth mask.
        gt_json (list of dicts): Ground-truth annotations, see get_centers_truth.
        pred (np.ndarray [H x W]): Prediction, thresholded at 0.5 to find components.
        reduce (int, optional): Factor the annotation coordinates are divided by
            (presumably the downscaling of mask/pred, TODO confirm). Defaults to 1.

    Returns:
        tp: Sum of the ground-truth mask values at the predicted centers,
            returned as given by count_detected_glomerulis.
        int: fn, number of annotations minus tp, clamped at 0.
        int: fp, number of predicted centers minus the sum of the prediction
            values at the annotated centers, clamped at 0.
    """
    pred_centers, _ = get_centers_pred(pred, threshold=0.5)
    truth_centers = get_centers_truth(gt_json) / reduce

    tp = count_detected_glomerulis(pred_centers, mask)
    truth_hits = count_detected_glomerulis(truth_centers, pred)

    # Cast to Python ints before subtracting: the counts are NumPy scalars,
    # possibly unsigned, and under NumPy 2 promotion rules `len(...) - count`
    # wraps around to a huge positive value instead of going negative, which
    # defeats the max(0, ...) clamp.
    fn = max(0, len(truth_centers) - int(tp))
    fp = max(0, len(pred_centers) - int(truth_hits))

    # tp is returned untouched so that downstream arithmetic is unchanged.
    return tp, fn, fp

In [None]:
def _safe_ratio(numerator, denominator):
    """Returns numerator / denominator as a float, or 0.0 if the denominator is zero."""
    return float(numerator) / float(denominator) if denominator else 0.0


# Per-image detection metrics, cached in the experiment folder so that the
# loop over all images only runs once.
metrics_path = log_folder + "metrics.json"

if not os.path.exists(metrics_path):
    metrics = {
        "recall": [],
        "precision": [],
        "f1": [],
        "acc": []
    }

    for mask_name in tqdm(df_mask['id'].values):
        with open(TIFF_PATH + mask_name + ".json", "r") as f:
            gt_json = json.load(f)

        # First prediction file whose path contains the image id.
        idx = next((i for i, path in enumerate(preds) if mask_name in path), None)
        if idx is None:
            raise FileNotFoundError(f"No prediction found for {mask_name} in {log_folder}")
        probs = np.load(preds[idx]).astype(np.float32)

        # 3D predictions are averaged over their first axis.
        if len(probs.shape) == 3:
            probs = probs.mean(0)

        pred = (probs > THRESHOLD).astype(np.uint8)

        rle = df_mask[df_mask['id'] == mask_name]['encoding']
        mask = enc2mask(rle, (pred.shape[1], pred.shape[0]))

        tp, fn, fp = compute_glomeruli_level_stats(mask, gt_json, pred, reduce=2)

        # Ratios with a zero denominator are reported as 0 instead of
        # raising or storing NaN/inf in the cache.
        recall = _safe_ratio(tp, tp + fn)
        prec = _safe_ratio(tp, tp + fp)
        f1 = _safe_ratio(2 * prec * recall, prec + recall)
        acc = _safe_ratio(tp, tp + fp + fn)

        metrics['recall'].append(recall)
        metrics['f1'].append(f1)
        metrics['precision'].append(prec)
        metrics['acc'].append(acc)

    # Closing the handle guarantees the cache is fully written to disk.
    with open(metrics_path, "w") as f:
        json.dump(metrics, f)
else:
    with open(metrics_path, "r") as f:
        metrics = json.load(f)

In [None]:
# One column per metric; the cell displays the mean of each column.
df = pd.DataFrame(metrics)
df.mean()

### Plot

In [None]:
# Displays the available image ids.
df_mask['id'].values

In [None]:
# Image to inspect.
mask_name = "8242609fa"
with open(TIFF_PATH + mask_name + ".json", "r") as f:
    gt_json = json.load(f)

# First prediction file whose path contains the image id.
idx = next((i for i, path in enumerate(preds) if mask_name in path), None)
if idx is None:
    raise FileNotFoundError(f"No prediction found for {mask_name} in {log_folder}")
probs = np.load(preds[idx]).astype(np.float32)

# 3D predictions are averaged over their first axis.
if len(probs.shape) == 3:
    probs = probs.mean(0)

pred = (probs > THRESHOLD).astype(np.uint8)

img = load_image(os.path.join(TIFF_PATH_2, mask_name + ".tiff"), full_size=False, reduce_factor=2)

# The ground-truth mask is decoded at the size of the loaded image.
rle = df_mask[df_mask['id'] == mask_name]['encoding']
mask = enc2mask(rle, (img.shape[1], img.shape[0]))

In [None]:
# tp, fn, fp = compute_glomeruli_level_stats(mask, gt_json, pred, reduce=2)

# print('Number of glomeruli :', len(gt_json))
# print('TP :', tp)
# print('FP :', fp)
# print('FN :', fn)

# recall = tp / (tp + fn)
# prec = tp / (tp + fp)
# f1 = 2 * prec * recall / (prec + recall)
# acc = tp / (tp + fp + fn)

# print(f'Accuracy : {acc :.4f}')
# print(f'Precision : {prec :.4f}')
# print(f'Recall : {recall :.4f}')
# print(f'F1 score : {f1 :.4f}')

### Viz

In [None]:
# Centers are computed on the probabilities (not the binary prediction) so that
# `maxs` holds the highest probability of each component. The notebook-wide
# THRESHOLD is passed explicitly to stay consistent with `pred`.
centers_pred, maxs = get_centers_pred(probs, threshold=THRESHOLD)
# NOTE(review): the division by 2 presumably mirrors reduce_factor=2 used to load the image - confirm.
centers_truth = get_centers_truth(gt_json) / 2

In [None]:
# Contours of the binary prediction drawn on the image.
downsize = 4
w = 1 if downsize != 4 else 2  # forwarded as `w` to plot_contours_preds
fig = plot_contours_preds(img, pred, mask=None, w=w, downsize=downsize)

# Disabled overlay; `centers` is not defined in the cells above.
# fig.add_trace(
#     go.Scatter(x=centers[:, 0] / downsize, y=centers[:, 1] / downsize, mode='markers', name="Truth Center")
# )

# Predicted centers are (row, col): the column goes on the x axis, the row on
# the y axis, both scaled by `downsize`. The hover text shows the highest
# probability of the component.
pred_center_trace = go.Scatter(
    x=centers_pred[:, 1] / downsize,
    y=centers_pred[:, 0] / downsize,
    mode='markers',
    name="Pred Center",
    text=[f"Confidence : {m:.2f}" for m in maxs],
    marker_color='rgba(10, 230, 10, .9)',
)
fig.add_trace(pred_center_trace)

# From here on `w` is the figure width; the height follows the aspect ratio of the mask.
w = 1000
h = int(w * mask.shape[0] / mask.shape[1])
fig.update_layout(autosize=False, width=w, height=h)

fig.show()