# Optimising for low false negatives in tissue detector

In [None]:
# determinism preamble
from repath.utils.seeds import set_seed

# single seed for the whole notebook, handed to the project's set_seed helper
global_seed = 123
set_seed(global_seed)

In [None]:
# paths preamble
from repath.utils.paths import project_root
# working directory of this experiment, below the project root
# NOTE(review): the directory is called "low_fp" although the notebook title is
# about low false *negatives* - confirm the name is intended before relying on it
experiment_root = project_root() / "experiments" / "tissue" / "low_fp"
# create the directory and any missing parents; do nothing if it already exists
experiment_root.mkdir(parents=True, exist_ok=True)

Our objective is to change the parameters and method of the background removal code so that it minimises false negatives. In other words, it should be unlikely to eliminate tissue that might be important.

In [None]:
# first let's load in the dataset
from repath.data.datasets.tissue import tissue

# dataset object used by every later cell (slide class, annotation loader, items)
tissue_dataset = tissue()
# show the dataset's `paths` attribute as the cell output
tissue_dataset.paths

Let's take a look at one of these slides with the annotations rendered on it

In [None]:
# we need some utility function first
from repath.utils.geometry import Size
from repath.utils.convert import to_frame_with_locations, np_to_pil

# level at which annotations are rendered; label_patches uses 2 ** labels_level
# as the scale factor when rendering
labels_level = 5
# name -> integer code; as_frame(replace_labels=True) uses the inverse mapping
labels = {"background": 0, "tissue": 1}

def dimensions_at_level(level, slide):
    """Return the shape of `slide` at `level`, extrapolating past the last stored level.

    Args:
        level: index of the requested level.
        slide: object exposing `dimensions`, a sequence with one entry per level
            where each entry offers `width`, `height` and `as_shape()`.

    Returns:
        The result of `as_shape()` for the requested level. When `level` is beyond
        the last entry of `slide.dimensions`, the last entry's width and height
        are floor-divided by two once per missing level.
    """
    level_count = len(slide.dimensions)
    if level < level_count:
        return slide.dimensions[level].as_shape()

    # the requested level is not stored: shrink the coarsest stored level instead
    missing_levels = level - (level_count - 1)
    coarsest = slide.dimensions[-1]
    divisor = 2 ** missing_levels
    extrapolated = Size(coarsest.width // divisor, coarsest.height // divisor)
    return extrapolated.as_shape()

def label_patches(slide_path, annotation_path, label, tags):
    """Render the annotations of one slide into a label image at `labels_level`.

    The parameters mirror one item of `tissue_dataset` so that an item can be
    unpacked straight into the call; `label` and `tags` are accepted but unused.

    Returns:
        Whatever `annotations.render` produces for the computed shape and scale.
    """
    # the slide is only opened to find out how large the label image must be
    with tissue_dataset.slide_cls(slide_path) as slide:
        target_shape = dimensions_at_level(labels_level, slide)

    # same power-of-two relation between levels that dimensions_at_level relies on
    downsample = 2 ** labels_level
    return tissue_dataset.load_annotations(annotation_path).render(target_shape, downsample)
    
def as_frame(labels_array, replace_labels=False):
    """Turn a label image into a data frame that has a "label" column.

    Args:
        labels_array: label image handed to `to_frame_with_locations`.
        replace_labels: when true, the integer codes in the "label" column are
            replaced by their names using the module-level `labels` mapping.

    Returns:
        The frame built by `to_frame_with_locations`.
    """
    label_frame = to_frame_with_locations(labels_array, "label")
    if not replace_labels:
        return label_frame

    # `labels` maps name -> code, the frame holds codes, so invert it first
    code_to_name = {code: name for name, code in labels.items()}
    label_frame['label'] = label_frame['label'].replace(code_to_name)
    return label_frame

def as_image(labels_array):
    """Convert a label array with `np_to_pil` (presumably into a PIL image)."""
    image = np_to_pil(labels_array)
    return image

In [None]:
import matplotlib.pyplot as plt
import matplotlib

# use the first item of the dataset as the example slide
slide_path, annotation_path, label, tags = tissue_dataset[0]

# human annotations rendered as a label image
labels_array = label_patches(slide_path, annotation_path, label, tags)
print(labels_array.shape)

# thumbnail of the same slide at the level the labels were rendered at
with tissue_dataset.slide_cls(slide_path) as slide:
    thumbnail = slide.get_thumbnail(labels_level)

# draw the thumbnail first and the half-transparent label image on top of it
cmap = matplotlib.colors.ListedColormap(["white", "yellow"])
plt.figure(figsize=(32, 24))
plt.imshow(thumbnail)
plt.imshow(labels_array, alpha=0.5, cmap=cmap)
plt.axis('off')
plt.show()

Create a list of pandas frames with the labels for each patch. The labels can be background or tissue. We are going to do this at level 5.

In [None]:
from repath.utils.geometry import Size
from repath.utils.convert import to_frame_with_locations

# one frame of human labels per dataset item, in dataset order
patches_df_human_labels = [as_frame(label_patches(*sample)) for sample in tissue_dataset]

Next, we want to run a bunch of different tissue detectors at level 5 and compare their results to these patches.

In [None]:
from repath.preprocess.tissue_detection import *
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

def label_with_background_subtractor(subtractor, slide_path, annotation_path, label, tags):
    """Label one slide with a tissue detector.

    The thumbnail is read at `labels_level`, the level `label_patches` renders the
    human annotations at. The level used to be hard-coded to 5 here, which only
    matched the human labels as long as `labels_level` was never changed.

    Args:
        subtractor: callable applied to the slide thumbnail.
        slide_path: slide to open with `tissue_dataset.slide_cls`.
        annotation_path, label, tags: unused; present so that a dataset item can
            be unpacked straight into the call.

    Returns:
        The value returned by `subtractor` for the thumbnail.
    """
    with tissue_dataset.slide_cls(slide_path) as slide:
        thumbnail = slide.get_thumbnail(labels_level)
    return subtractor(thumbnail)

def compute_results(df_computer_labels, slide_idx):
    """Pair the human labels of one slide with a detector's labels for it.

    Args:
        df_computer_labels: frame with a "label" column produced from the detector output.
        slide_idx: position of the slide in `patches_df_human_labels`.

    Returns:
        tuple (y_true, y_pred) of arrays; only the detector labels are cast to int.
    """
    human_frame = patches_df_human_labels[slide_idx]
    y_true = human_frame['label'].to_numpy()
    detector_labels = df_computer_labels['label'].to_numpy()
    y_pred = detector_labels.astype('int')
    return y_true, y_pred
        
# the tissue detectors compared in this notebook; the order is reused for the results
detectors = [TissueDetectorOTSU(), TissueDetectorGreyScale()]  
        
def test_subtractor(subtractor):
    """Run one tissue detector over every slide and pool its labels with the human ones.

    Slides are processed one at a time, so only the label arrays (not every
    intermediate frame) are kept until the end.

    Args:
        subtractor: callable that takes a slide thumbnail and returns a label array.

    Returns:
        tuple (y_true, y_pred): human and detector labels of all slides,
        concatenated in dataset order.

    Raises:
        ValueError: if a slide yields a different number of detector labels than
            human labels, or if the dataset is empty.
    """
    # numpy is not imported explicitly anywhere in this notebook, so do not rely
    # on it leaking in through a star-import
    import numpy as np

    y_true_parts, y_pred_parts = [], []
    for slide_idx, sample in enumerate(tissue_dataset):
        detector_frame = as_frame(label_with_background_subtractor(subtractor, *sample))
        y_true, y_pred = compute_results(detector_frame, slide_idx)
        if len(y_true) != len(y_pred):
            # fail here with the slide index instead of later inside the metrics code
            raise ValueError(
                f"slide {slide_idx}: {len(y_true)} human labels but {len(y_pred)} detector labels"
            )
        y_true_parts.append(y_true)
        y_pred_parts.append(y_pred)

    return np.concatenate(y_true_parts), np.concatenate(y_pred_parts)

# one (y_true, y_pred) pair per detector, in the order of `detectors`
results = list(map(test_subtractor, detectors))

In [None]:
from repath.preprocess.tissue_detection.tissue_metrics import calc_tissue_conf_mat, get_output_images, write_contours_to_file



So now, for each detector, we have the true and predicted labels for every patch, concatenated across all the slides. To get an idea of the overall false negative rate, let's build one big confusion matrix for each detector from these pooled labels. Then we can pull out the false negative rate and use that as an optimisation metric.

In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt     

def metrics(y_true, y_pred):
    """Confusion matrix, accuracy and false negative rate with tissue as the positive class.

    Args:
        y_true: reference labels, 1 for tissue and 0 for background.
        y_pred: predicted labels with the same coding.

    Returns:
        tuple (CM, ACC, FNR):
            CM  - 2x2 array laid out as [[TP, FN], [FP, TN]]
            ACC - (TP + TN) / number of samples
            FNR - FN / (TP + FN)
        ACC and FNR are NaN when their denominator is zero.
    """
    # Passing the labels explicitly puts tissue (1) in the first row/column and
    # keeps the matrix 2x2 even when only one class is present. Without it, a
    # single-class input gives a 1x1 matrix and the indexing below breaks.
    CM = confusion_matrix(y_true, y_pred, labels=[1, 0])

    print(CM)

    (TP, FN), (FP, TN) = CM

    # False negative rate
    positives = TP + FN
    FNR = FN / positives if positives else float('nan')

    # Overall accuracy
    sample_count = TP + FP + FN + TN
    ACC = (TP + TN) / sample_count if sample_count else float('nan')

    return CM, ACC, FNR

# (confusion matrix, accuracy, false negative rate) per detector
all_metrics = [metrics(y_true, y_pred) for y_true, y_pred in results]
all_metrics

In [None]:
from repath.utils.metrics import conf_mat_plot_heatmap

# Build the summary table in one go: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies it on every iteration.
result_columns = ['detector', 'accuracy', 'false negative rate']
summary_rows = []

for detector, (cm, acc, fnr) in zip(detectors, all_metrics):
    detector_name = type(detector).__name__
    summary_rows.append([detector_name, acc, fnr])
    # class names are given tissue-first, matching the layout produced by metrics()
    conf_mat_plot_heatmap(cm, ['tissue', 'background'], f'{detector_name} Confusion Matrics')

results_df = pd.DataFrame(summary_rows, columns=result_columns)
results_df

# Assess the results of the tissue game

In [1]:
# determinism preamble
from repath.utils.seeds import set_seed

# single seed for the whole notebook, handed to the project's set_seed helper
global_seed = 123
set_seed(global_seed)

# imports
import pandas as pd

In [2]:
# paths preamble
from repath.utils.paths import project_root
# directory the label CSV files of the patch game are read from in the cells below
experiment_root = project_root() / "experiments" / "tissue" / "patch_game_analysis"
# create the directory and any missing parents; do nothing if it already exists
experiment_root.mkdir(parents=True, exist_ok=True)

In [3]:
# load in the results for patches that are marked by everyone
multiple_human_labels = pd.read_csv(experiment_root / "multiple_labeller_patches.csv")
# display the frame: one column per labeller next to the `label` column
multiple_human_labels

Unnamed: 0,x,y,slide_path,label,patch_no,cf,gb,jf,mm,pk,sb,tp
0,101120,62208,icaird_blood/images/IC-EN-02125-01.isyntax,background,0,background,background,background,background,background,background,background
1,79744,90368,icaird_tissue/images/IC-EN-02079-01.isyntax,tissue,1,tissue,tissue,tissue,tissue,tissue,tissue,tissue
2,130304,42112,icaird_tissue/images_for_game/IC-CX-02343-01.i...,tissue,2,tissue,tissue,tissue,tissue,tissue,tissue,tissue
3,0,5504,icaird_tissue/images/IC-CX-00004-01.isyntax,tissue,3,background,background,background,background,background,background,background
4,128,21248,icaird_tissue/images/IC-CX-00004-01.isyntax,tissue,4,background,background,background,background,background,background,background
...,...,...,...,...,...,...,...,...,...,...,...,...
995,14848,2560,icaird_blood/images/IC-EN-02091-01.isyntax,background,995,background,background,background,background,background,background,background
996,36096,7040,icaird_tissue/images/IC-EN-02079-01.isyntax,background,996,background,background,background,background,background,background,background
997,78080,32384,icaird_blood/images/IC-EN-02175-01.isyntax,tissue,997,tissue,tissue,tissue,tissue,tissue,tissue,tissue
998,30208,72832,icaird_tissue/images/IC-EN-02079-01.isyntax,tissue,998,tissue,tissue,tissue,tissue,tissue,tissue,tissue


In [4]:
# load in the results for the patches that are marked by one person
human_patch_labels = pd.read_csv(experiment_root / "human_patch_labels.csv")

# cut out the rows that contain a NaN in any column
# (dropna's defaults already mean axis=0 / how='any'; spelling out both `how` and
# `thresh` raises a TypeError on current pandas because the two are mutually exclusive)
human_patch_labels = human_patch_labels.dropna()
human_patch_labels

Unnamed: 0,x,y,slide_path,label,patch_no,human_label
2000,52096,51072,icaird_blood/images/IC-EN-02132-01.isyntax,tissue,2000,tissue
2001,66816,78080,icaird_blood/images/IC-EN-02175-01.isyntax,background,2001,background
2002,16640,24576,icaird_blood/images/IC-EN-02091-01.isyntax,tissue,2002,tissue
2003,100608,60800,icaird_blood/images/IC-EN-02091-01.isyntax,tissue,2003,tissue
2004,36352,3200,icaird_tissue/images/IC-EN-02079-01.isyntax,background,2004,tissue
...,...,...,...,...,...,...
9995,67712,23040,icaird_tissue/images/IC-EN-02079-01.isyntax,background,9995,background
9996,68480,59648,icaird_blood/images/IC-EN-02192-01.isyntax,tissue,9996,tissue
9997,49536,21504,icaird_tissue/images/IC-EN-02092-01.isyntax,background,9997,background
9998,40960,4992,icaird_tissue/images/IC-EN-02079-01.isyntax,background,9998,background


In [5]:
# do the background subtraction for a slide
from repath.preprocess.tissue_detection import *
from repath.utils.convert import to_frame_with_locations
from repath.data.slides.isyntax import Slide

# the slide paths stored in the label CSVs are resolved relative to this directory
data_dir = project_root() / 'data'

def as_frame(labels_array, replace_labels=False):
    """Turn a label image into a data frame that has a "label" column.

    Args:
        labels_array: label image handed to `to_frame_with_locations`.
        replace_labels: when true, values of the "label" column are replaced using
            the inverse of a module-level `labels` mapping.

    Returns:
        The frame built by `to_frame_with_locations`.
    """
    label_frame = to_frame_with_locations(labels_array, "label")
    if not replace_labels:
        return label_frame

    # NOTE(review): no `labels` mapping is defined in the visible part of this
    # notebook, so this branch would raise NameError - confirm before using it
    code_to_name = {code: name for name, code in labels.items()}
    label_frame['label'] = label_frame['label'].replace(code_to_name)
    return label_frame

def label_with_background_subtractor(subtractor, slide_path, level=5):
    """Label one slide with a tissue detector and name the resulting classes.

    Args:
        subtractor: callable applied to the slide thumbnail; truthy output values
            are treated as tissue, falsy ones as background.
        slide_path: slide path relative to `data_dir`.
        level: thumbnail level handed to `slide.get_thumbnail`. Defaults to 5,
            the value that used to be hard-coded.

    Returns:
        tuple (slide_path, df) where `df` is the frame from `as_frame` with its
        "label" column holding the strings 'tissue' / 'background'.
    """
    print(f"labelling: {slide_path}")

    # load in the slide and pass its thumbnail into the background subtractor
    with Slide(data_dir / slide_path) as slide:
        thumbnail = slide.get_thumbnail(level)
    labels_array = subtractor(thumbnail)
    df = as_frame(labels_array)

    # true == tissue and false == background
    # The column is rebuilt in one step: writing strings into the existing boolean
    # column through masks is an incompatible-dtype assignment, which pandas has
    # deprecated, and the second mask depended on the result of the first.
    is_tissue = df['label'].astype(bool)
    df['label'] = is_tissue.map({True: 'tissue', False: 'background'})

    return slide_path, df

def apply_subtractor(subtractor, slide_paths):
    """Label every slide in `slide_paths` with one detector.

    Returns:
        dict keyed by the slide path returned from
        `label_with_background_subtractor`, holding that slide's labelled frame.
    """
    labelled_frames = {}
    for path in slide_paths:
        key, frame = label_with_background_subtractor(subtractor, path)
        labelled_frames[key] = frame
    return labelled_frames

In [6]:
# first let's load in the dataset
# from repath.data.datasets.tissuegame import tissue_game
# tissue_dataset = tissue_game()

# every distinct slide referenced by the single-labeller patches
slide_paths = human_patch_labels['slide_path'].unique().tolist()

# alternative locations for the slides that are missing below; they are defined
# here but not added to slide_paths (the line that did so is commented out)
corrected_paths = [
    'icaird_blood/images_valid/IC-EN-02264-01.isyntax',
    'icaird_blood/images_test/IC-EN-02262-01.isyntax',
    'icaird_blood/images_valid/IC-EN-02152-01.isyntax',
    'icaird_blood/images_valid/IC-EN-02254-01.isyntax',
]
# slide_paths = slide_paths + corrected_paths

# split the slide paths into those present on disk and those that are not
path_is_present = [(data_dir / sp).exists() for sp in slide_paths]
slide_paths_exist = [sp for sp, present in zip(slide_paths, path_is_present) if present]
slide_paths_missing = [sp for sp, present in zip(slide_paths, path_is_present) if not present]
print(f"missing slide paths {slide_paths_missing}")

# run every detector over the slides that exist: one {slide path: frame} dict per detector
subtractors = [TissueDetectorOTSU(), TissueDetectorGreyScale()]
list_of_path_to_df_dicts = [
    apply_subtractor(subtractor, slide_paths_exist)
    for subtractor in subtractors
]
# list_of_path_to_df_dicts #  this is a list with a dict for each subtractor

missing slide paths ['icaird_blood/images/IC-EN-02264-01.isyntax', 'icaird_blood/images/IC-EN-02262-01.isyntax', 'icaird_blood/images/IC-EN-02152-01.isyntax', 'icaird_blood/images/IC-EN-02254-01.isyntax']
labelling: icaird_blood/images/IC-EN-02132-01.isyntax
labelling: icaird_blood/images/IC-EN-02175-01.isyntax
labelling: icaird_blood/images/IC-EN-02091-01.isyntax
labelling: icaird_tissue/images/IC-EN-02079-01.isyntax
labelling: icaird_blood/images/IC-EN-02104-01.isyntax
labelling: icaird_blood/images/IC-EN-02263-01.isyntax
labelling: icaird_tissue/images/IC-EN-02092-01.isyntax
labelling: icaird_blood/images/IC-EN-02131-01.isyntax
labelling: icaird_blood/images/IC-EN-02191-01.isyntax
labelling: icaird_tissue/images_for_game/IC-CX-01561-01.isyntax
labelling: icaird_blood/images/IC-EN-02146-01.isyntax
labelling: icaird_blood/images/IC-EN-02126-01.isyntax
labelling: icaird_tissue/images/IC-CX-00005-01.isyntax
labelling: icaird_blood/images/IC-EN-02121-01.isyntax
labelling: icaird_blood/im

In [9]:
from sklearn.metrics import classification_report

# for each subtractor
# Patch coordinates are turned into cells of the detector output by integer division.
# NOTE(review): the detector frames come from a level-5 thumbnail, while 128 is
# 2 ** 7; together with accuracies of ~0.50 this looks like a scale mismatch
# between patch coordinates and thumbnail pixels - confirm against the slide
# reader and the patch size used in the game.
# NOTE(review): the target is the `label` column, not `human_label` - confirm that
# this is the column the detectors are meant to be scored against.
patch_to_cell = 128
patch_cells = human_patch_labels.assign(
    column=(human_patch_labels['x'] // patch_to_cell).astype('int64'),
    row=(human_patch_labels['y'] // patch_to_cell).astype('int64'),
)

# for each subtractor
for idx, path_to_df_dict in enumerate(list_of_path_to_df_dicts):
    print(f"Subtractor {idx}")
    targets = []
    predictions = []
    slide_missing = 0  # patches whose slide was not labelled by the subtractor
    unmatched = 0      # patches whose cell does not exist in the subtractor output

    # One join per slide instead of one full scan of the slide frame per patch.
    for slide_path, slide_patches in patch_cells.groupby('slide_path', sort=False):
        if slide_path not in path_to_df_dict:
            slide_missing += len(slide_patches)
            continue

        predicted = (
            path_to_df_dict[slide_path][['column', 'row', 'label']]
            .rename(columns={'label': 'predicted'})
        )
        # left join keeps every patch; many_to_one checks each cell is predicted once
        matched = slide_patches.merge(
            predicted, on=['column', 'row'], how='left', validate='many_to_one'
        )
        found = matched['predicted'].notna()
        unmatched += int((~found).sum())

        # plain scalars on both sides (the old loop collected whole Series as predictions)
        targets.extend(matched.loc[found, 'label'].tolist())
        predictions.extend(matched.loc[found, 'predicted'].tolist())

    total = len(targets)
    correct = sum(target == prediction for target, prediction in zip(targets, predictions))
    accuracy = correct / total if total else float('nan')

    print(f"{len(targets)}, {len(predictions)}")
    print(f"correct: {correct} / {len(targets)} = {accuracy}")
    if slide_missing or unmatched:
        print(f"skipped: {slide_missing} on unlabelled slides, {unmatched} outside the detector output")
    

Subtractor 0
6491, 6491
correct: 3241 / 6491 = 0.4993067323987059
Subtractor 1
6491, 6491
correct: 3251 / 6491 = 0.5008473270682483
