In [None]:
import os
from py.helpers import BARRET_ROOT

# Folder holding the LANS_001-923 slides.
LANS_DIR = os.path.join(BARRET_ROOT, 'LANS_001-923')

# The three sub-folders of the biopsy-level P53 experiment that are scanned below.
LANS_BIOP_DIRS = [
    os.path.join(BARRET_ROOT, 'p53_experiment_luuk_biopsy level_no HE', subfolder)
    for subfolder in (
        'P53_score_high_consensus',
        'RBE_nummers Hans',
        'RBE_nummers Onno',
    )
]

"""
'ASAP-Templates',
 'BOLERO',
 'CLASSIFIED',
 'EMR',
 'GE_spectrum_normaal_en_maligne',
 'KERATIN-ASL-carcinoom',
 'LANS isyntax',                                empty
 'LANS-remaining -presegmentatie',              segmentation 68 HE
 'LANS-Tissue level',                           segmentation 42 HE
 'LANS_001-923',                            865 HE + P53 from old scanner, 96 HE + P53 from new scanner, RL, where are the labels?
 'LANS_DATA_unclassified',                      Unclassified T and RL numbers?
 'LANS_missing_download',                       I guess we don't know the block numbers so don't know if classified or not
 'low consensus cases',                         Low consensus from the LANS dir, with excel sheet
 'Maastricht-OUD en NIEUW',                     RBE & RM
 'Nieuwe ASL p53 + HE voor luuk',               12 HE-P53 recolored tiff pairs, rASL, some additional problem cases
 'p53 Barrett ASL 21-11-22',                    Luuk's working folder (with patch datasets etc)
 'p53-Luuk -ASL',                               60 HE-P53 recolored tiff pairs, RM
 'p53_experiment_luuk_biopsy level_no HE',  in high_consensus 294 RL P53 with XML for biopsy level label, in RBE Hans and Onno: 94 + 94 RBE P53 with XML, total 488
 'p53_HE_sequential',                           5 HE-P53 adjacent tiff pairs with XML, ASL, RBE
 'p53_rating_reader_study_no HE',               P53 + XML but no results yet?
 'patch_dataset_Luuk',                          HE-P53 pairs of patches
 'RBE-00309_HE_sme_HE_20210310_184954.tiff',
 'RBE_to be CLASSIFIED_BY_SYBREN',
 'RL-0109',
 'SKMS casus e-learning',
 'training set 1 LANS paths VSI_TIF',
 'training set 2 LANS paths VSI_TIF',
 'VLE-OCT serie 1_51'
"""

# For every biopsy-level directory, collect the sorted case names (file name up
# to the first dot) for which both a .tiff and an .xml file are present.
casenames = {}
for d in LANS_BIOP_DIRS:
    entries = os.listdir(d)
    tiffs = [entry for entry in entries if entry.endswith('.tiff')]
    xmls = [entry for entry in entries if entry.endswith('.xml')]
    print(f'{d}: {len(tiffs)} tiffs, {len(xmls)} xmls')

    # Reduce each file name to the part before its first dot.
    xml_names = {entry.split('.')[0] for entry in xmls}
    tiff_names = {entry.split('.')[0] for entry in tiffs}

    # Keep only the names that occur as both slide and annotation.
    both = xml_names & tiff_names
    print(f'Both: {len(both)}')

    casenames[d] = sorted(both)


In [None]:
from wholeslidedata.annotation.wholeslideannotation import WholeSlideAnnotation
from wholeslidedata.image.wholeslideimage import WholeSlideImage
from wholeslidedata.annotation.types import PolygonAnnotation as Polygon
from matplotlib import pyplot as plt
import numpy as np
import os
from tqdm import tqdm

import cv2

from py.helpers import get_outlines, get_area, get_patch, get_sub_areas, patch_empty, concat_one

In [None]:
# Spacing handed to get_area in the visualisation cells below
# (presumably micrometres per pixel -- TODO confirm against py.helpers).
spacing = 2.0

In [None]:
def get_all_annotated_cases(dir):
    """Map every case in a directory to its slide and annotation file paths.

    The case name is the file name up to its first dot. Every entry of the
    directory gets a key, so entries that are neither ``.tiff`` nor ``.xml``
    show up with both values left as ``None``.

    Args:
        dir: Path of the directory to scan (not recursive).

    Returns:
        dict: ``{case: {"wsi": <path to .tiff or None>,
        "wsa": <path to .xml or None>}}``.
    """
    # List the directory once so both passes see the same entries.
    entries = os.listdir(dir)
    filepaths = {name.split('.')[0]: {"wsi": None, "wsa": None} for name in entries}

    for name in entries:
        # Match on the real extension: a substring test would also accept
        # names such as "x.tiff.bak" or "x.tiff.xml" as the slide.
        if name.endswith(".tiff"):
            typ = "wsi"
        elif name.endswith(".xml"):
            typ = "wsa"
        else:
            continue

        filepaths[name.split('.')[0]][typ] = os.path.join(dir, name)
    return filepaths

In [None]:
# Per directory: {case name: {"wsi": path or None, "wsa": path or None}}.
cases = {
    biopsy_dir: get_all_annotated_cases(biopsy_dir)
    for biopsy_dir in LANS_BIOP_DIRS
}

In [None]:
# Register the OpenSlide binaries with the DLL search path.
# os.add_dll_directory only exists on Windows, so skip the call elsewhere
# instead of failing with AttributeError.
# NOTE(review): this cell runs after the wholeslidedata imports above; if those
# imports already load OpenSlide, the registration may come too late -- confirm.
OPENSLIDE_BIN_DIR = r'C:\Program Files\openslide-win64\bin'
if hasattr(os, "add_dll_directory"):
    os.add_dll_directory(OPENSLIDE_BIN_DIR)

In [None]:
def plot_sub_areas(wsi, sub_areas, area_labels=None, save_path="", spacing=2.0, figsize_factor=2, show_emptiness=False):
    """Draw a grid of patches read from a whole slide image.

    Args:
        wsi: Slide object; each patch is read with
            ``wsi.get_patch(*sub_area, spacing)``.
        sub_areas: List of rows, each row a list of areas. The number of
            columns is taken from the first row; shorter rows leave their
            remaining cells blank.
        area_labels: Optional flat, row-major list of titles, one per grid
            cell. ``None`` or an empty sequence means no titles.
        save_path: If non-empty, the figure is written there and closed.
        spacing: Spacing forwarded to ``wsi.get_patch``.
        figsize_factor: Figure size per grid cell, applied to width and height.
        show_emptiness: If true, print each patch's mean intensity on the
            patch, in red when the mean exceeds 223.
    """
    if area_labels is None:
        area_labels = []

    nrows = len(sub_areas)
    ncols = len(sub_areas[0])
    # squeeze=False always yields a 2-D axes array, so single-row,
    # single-column and 1x1 grids are all indexed the same way.
    fig, axes = plt.subplots(
        nrows, ncols,
        figsize=(ncols*figsize_factor, nrows*figsize_factor),
        squeeze=False,
    )
    for i in range(nrows):
        for j in range(ncols):
            cell = axes[i, j]

            if j < len(sub_areas[i]):
                sub_area = sub_areas[i][j]
                sub_patch = wsi.get_patch(*sub_area, spacing)

                if show_emptiness:
                    color = "red" if sub_patch.mean() > 223 else "black"
                    cell.text(105,128, f"{sub_patch.mean():.2f}", c=color)
                if len(area_labels) > 0:
                    # Row-major position in the flat label list.
                    cell.set_title(area_labels[i*ncols+j])
                cell.imshow(sub_patch)

            cell.axis("off")
    if save_path:
        plt.savefig(save_path, bbox_inches="tight")
        plt.close(fig)

# Visual check: sub areas of the first biopsy of the first case in the first directory.
d = LANS_BIOP_DIRS[0]
casename = casenames[d][0]
casepaths = cases[d][casename]
outlines = get_outlines(WholeSlideAnnotation(casepaths["wsa"])) # biopsy outlines
area = get_area(outlines[0], spacing) # biopsy area
# NOTE(review): get_sub_areas is called with its own defaults -- confirm they match `spacing`.
sub_areas = get_sub_areas(area) # sub areas (patches) of biopsy area
wsi = WholeSlideImage(casepaths["wsi"]) # whole slide image
# Read the patches at the same spacing the area was computed at, rather than
# relying on plot_sub_areas' default staying equal to the global value.
plot_sub_areas(wsi, sub_areas, spacing=spacing, show_emptiness=True)

# def save_all_sub_areas_plots(spacing, root=ROOT):
#     for casename, case in tqdm(get_all_annotated_cases(root).items()):
#         for coupe, paths in case.items():
#             outlines = get_outlines(WholeSlideAnnotation(paths["wsa"]))
#             for biopsy_nr, outline in enumerate(outlines):
#                 plot_sub_areas(
#                     WholeSlideImage(paths["wsi"]), 
#                     get_sub_areas(get_area(outline, spacing), spacing=spacing), 
#                     save_path=os.path.join(ROOT, "visualisation", f"sub_areas_{casename}_{biopsy_nr}_{coupe}.png"),
#                     spacing=spacing)

# save_all_sub_areas_plots(2, root=ROOT_ADJACENT)

In [None]:
# Overview of all biopsies of one case, laid out in two rows and titled with
# their annotation labels.
d = LANS_BIOP_DIRS[0]
casename = casenames[d][2]
print("Dir: ", d)
print("Case: ", casename)
casepaths = cases[d][casename]
wsa = WholeSlideAnnotation(casepaths["wsa"])

labels = [a.label.name for a in wsa.annotations] # biopsy labels

outlines = get_outlines(wsa) # biopsy outlines
areas = [get_area(outline, spacing) for outline in outlines] # biopsy areas
# Split into two rows; the first row gets the extra item when the count is odd.
half_areas_len = int(np.ceil(len(areas)/2))
areas = [areas[:half_areas_len], areas[half_areas_len:]]
wsi = WholeSlideImage(casepaths["wsi"]) # whole slide image
# Read the patches at the same spacing the areas were computed at.
plot_sub_areas(wsi, areas, spacing=spacing, figsize_factor=5, area_labels=labels)

In [None]:
# Collect one record per annotated biopsy, with its size at spacing 0.25.
# Note: this rebinds the global `spacing` used by the cells above.
spacing = 0.25

biopsies = {}

for d in LANS_BIOP_DIRS:
    print("Dir: ", d)
    for casename in tqdm(casenames[d]):
        casepaths = cases[d][casename]
        wsa = WholeSlideAnnotation(casepaths["wsa"])

        labels = [a.label.name for a in wsa.annotations] # biopsy labels

        outlines = get_outlines(wsa) # biopsy outlines

        for b, label in enumerate(labels):
            outline = outlines[b]
            area = get_area(outline, spacing)

            # NOTE(review): the key is case name + biopsy index only; if a case
            # name occurs in more than one directory, the later record
            # overwrites the earlier one -- confirm names are unique.
            biopsies[f"{casename}_b{b}"] = {
                # basename works with any path separator, unlike splitting on "\\".
                "dir": os.path.basename(d),
                "casename": casename,
                # NOTE(review): plot_sub_areas unpacks an area straight into
                # wsi.get_patch(*area, spacing); if that signature is
                # (x, y, width, height, spacing), these two are swapped -- confirm.
                "height": int(area[2]),
                "width": int(area[3]),
                "label": label,
            }


Largest biopsy: 21376 x 19328	wildtype	413,155,328 pixels

In [None]:
import pandas as pd

# One row per biopsy, largest first.
df = pd.DataFrame.from_dict(biopsies, orient="index")
df["pixels"] = df["height"] * df["width"]

# Sort by pixels
df = df.sort_values(by=["pixels"], ascending=False)

# Bar chart of the number of biopsies per label.
# Labels and counts both come from the same value_counts() Series, so each bar
# is guaranteed to carry its own label; unique() returns labels in order of
# appearance, which need not match the frequency order of value_counts().
label_counts = df["label"].value_counts()
plt.figure(figsize=(5,3))
plt.bar(label_counts.index, label_counts.values)
plt.xticks(rotation=90)

# Display counts on top of bars
for i, v in enumerate(label_counts.values):
    plt.text(i-0.2, v+1, str(v))
plt.show()

# Display rows with label none
none_labels = df[df["label"] == "none"]
# Sort by casename
none_labels = none_labels.sort_values(by=["casename"])
# Only unique casenames
none_labels = none_labels.drop_duplicates(subset=["casename"])
display(none_labels)