In [None]:
from wholeslidedata.annotation.wholeslideannotation import WholeSlideAnnotation
from wholeslidedata.image.wholeslideimage import WholeSlideImage
from wholeslidedata.annotation.types import PolygonAnnotation as Polygon
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

import cv2

from py.helpers import get_outlines, get_area, get_patch, get_sub_areas, patch_empty, concat_one, BARRET_ROOT
import os

# OpenSlide's DLLs are not on the default DLL search path on Windows, so their folder is
# registered explicitly. `os.add_dll_directory` only exists on Windows; the guard keeps
# this cell from raising AttributeError on other platforms.
if hasattr(os, "add_dll_directory"):
    os.add_dll_directory(r'C:\Program Files\openslide-win64\bin') # for openslide

# Data locations, all derived from BARRET_ROOT (imported from py.helpers).
LANS_DIR = os.path.join(BARRET_ROOT, 'LANS_001-923')
LANS_BIOP_ROOT = os.path.join(BARRET_ROOT, 'p53_experiment_luuk_biopsy level_no HE')
LANS_BIOP_DIR = os.path.join(LANS_BIOP_ROOT, 'P53_score_high_consensus')

In [None]:
d = LANS_BIOP_DIR

# Split the directory listing into slide images and annotation files.
tiffs = [name for name in os.listdir(d) if name.endswith('.tiff')]
xmls = [name for name in os.listdir(d) if name.endswith('.xml')]
print(f'{d}: {len(tiffs)} tiffs, {len(xmls)} xmls')

# The case name is everything before the first dot of the file name.
xml_names = [name.partition('.')[0] for name in xmls]
tiff_names = [name.partition('.')[0] for name in tiffs]

# Keep only the cases that have both an image and an annotation file.
both = set(tiff_names) & set(xml_names)
print(f'Both: {len(both)}')

casenames = sorted(both)

In [None]:
def get_all_annotated_cases(dir):
    """Index the slide image and annotation file of every case in a directory.

    The case name of a file is the part of its name before the first dot.

    Args:
        dir: Path of the directory to scan (not recursive).

    Returns:
        dict mapping case name -> {"wsi": path or None, "wsa": path or None}, where
        "wsi" is the path of the case's ``.tiff`` file and "wsa" the path of its
        ``.xml`` file. Every file in the directory registers its case name, so files
        with other extensions produce an entry whose values are both None.
    """
    filepaths = {}
    for f in os.listdir(dir):
        case = f.split('.')[0]
        entry = filepaths.setdefault(case, {"wsi": None, "wsa": None})

        # Match on the actual extension. A substring test such as `".tiff" in f` would
        # also accept names like "x.tiff.bak" and could overwrite the real slide path.
        if f.endswith(".tiff"):
            typ = "wsi"
        elif f.endswith(".xml"):
            typ = "wsa"
        else:
            continue

        entry[typ] = os.path.join(dir, f)
    return filepaths

In [None]:
# Map every case name found in the biopsy directory to its slide ("wsi") and
# annotation ("wsa") file paths.
cases = get_all_annotated_cases(LANS_BIOP_DIR)

In [None]:
def plot_sub_areas(wsi, sub_areas, area_labels=[], save_path="", spacing=2.0, figsize_factor=2, show_emptiness=False):
    """Plot a grid of patches read from a whole slide image.

    Args:
        wsi: Object with a ``get_patch(*area, spacing)`` method that returns an image array.
        sub_areas: Sequence of rows; each row is a sequence of areas. Every area is
            unpacked into ``wsi.get_patch(*area, spacing)``. Rows may differ in length;
            the missing cells are left blank.
        area_labels: Optional flat sequence of titles, one per area in row-major order.
            Areas without a corresponding label get no title. (The default list is
            never mutated.)
        save_path: When non-empty, the figure is saved to this path and then closed.
        spacing: Spacing passed on to ``wsi.get_patch``.
        figsize_factor: Figure size per grid cell.
        show_emptiness: When True, print each patch's mean value on the patch, in red
            when the mean exceeds 223 and in black otherwise.
    """
    nrows = len(sub_areas)
    ncols = max((len(row) for row in sub_areas), default=0)
    if nrows == 0 or ncols == 0:
        # Nothing to draw; plt.subplots cannot build an empty grid.
        return

    # squeeze=False always yields a 2-D array of axes. With the default squeezing, a
    # single-row or single-cell grid comes back as a 1-D array or a bare Axes, and
    # indexing it by (row, column) fails.
    fig, ax = plt.subplots(nrows, ncols, figsize=(ncols*figsize_factor, nrows*figsize_factor), squeeze=False)

    label_index = 0  # position of the current area in the flat area_labels sequence
    for i, row in enumerate(sub_areas):
        for j in range(ncols):
            cell = ax[i, j]

            if j < len(row):
                sub_patch = wsi.get_patch(*row[j], spacing)

                if show_emptiness:
                    patch_mean = sub_patch.mean()
                    color = "red" if patch_mean > 223 else "black"
                    cell.text(105, 128, f"{patch_mean:.2f}", c=color)
                if label_index < len(area_labels):
                    cell.set_title(area_labels[label_index])
                label_index += 1
                cell.imshow(sub_patch)

            cell.axis("off")
    if save_path:
        fig.savefig(save_path, bbox_inches="tight")
        plt.close(fig)


# def save_all_sub_areas_plots(spacing, root=ROOT):
#     for casename, case in tqdm(get_all_annotated_cases(root).items()):
#         for coupe, paths in case.items():
#             outlines = get_outlines(WholeSlideAnnotation(paths["wsa"]))
#             for biopsy_nr, outline in enumerate(outlines):
#                 plot_sub_areas(
#                     WholeSlideImage(paths["wsi"]), 
#                     get_sub_areas(get_area(outline, spacing), spacing=spacing), 
#                     save_path=os.path.join(ROOT, "visualisation", f"sub_areas_{casename}_{biopsy_nr}_{coupe}.png"),
#                     spacing=spacing)

# save_all_sub_areas_plots(2, root=ROOT_ADJACENT)

Example of patching

In [None]:
# Patching demo: take the first biopsy of the first case, split its area into
# sub areas and plot every sub area as a patch.
spacing = 2.0

d = LANS_BIOP_DIR
casename = casenames[0]
print(casename)
casepaths = cases[casename]
outlines = get_outlines(WholeSlideAnnotation(casepaths["wsa"])) # biopsy outlines
area = get_area(outlines[0], spacing) # biopsy area
# NOTE(review): get_sub_areas is called without a spacing argument here, while get_area
# receives `spacing` — confirm that the helper's default matches.
sub_areas = get_sub_areas(area) # sub areas (patches) of biopsy area
wsi = WholeSlideImage(casepaths["wsi"]) # whole slide image
# No spacing is passed, so plot_sub_areas uses its default of 2.0 (equal to `spacing` above).
plot_sub_areas(wsi, sub_areas, show_emptiness=True)

Example of showing biopsies

In [None]:
spacing = 32.0

d = LANS_BIOP_DIR
casename = casenames[5]
print("Dir: ", d)
print("Case: ", casename)
casepaths = cases[casename]
wsa = WholeSlideAnnotation(casepaths["wsa"])

# One label name per annotation in the file.
labels = [annotation.label.name for annotation in wsa.annotations]

# Outline of every biopsy, and the area derived from each outline at this spacing.
outlines = get_outlines(wsa)
areas = [get_area(o, spacing) for o in outlines]

# Suffix every label with the (width, height) of its biopsy area.
labels = [f"{name} ({int(a[2])}, {int(a[3])})" for name, a in zip(labels, areas)]

# Lay the biopsies out in two rows; the first row gets the extra one when the count is odd.
half_areas_len = (len(areas) + 1) // 2
areas = [list(areas[:half_areas_len]), list(areas[half_areas_len:])]
wsi = WholeSlideImage(casepaths["wsi"])
plot_sub_areas(wsi, areas, figsize_factor=5, area_labels=labels, spacing=spacing)

Gather data on biopsies

In [None]:
# Switch between exploring the annotations (False) and exporting the dataset (True).
# Cells guarded by `if not CREATE_DATASET` only run while this flag is False.
# NOTE(review): the "Create dataset" cell further down sets this flag to True itself.
CREATE_DATASET = False

In [None]:
# spacing = 0.5
spacing  = 32.0

if not CREATE_DATASET:
    # One record per biopsy, keyed by "<casename>_b<annotation index>".
    biopsies = {}

    d = LANS_BIOP_DIR
    print("Dir: ", d)
    for casename in tqdm(casenames):
        casepaths = cases[casename]
        wsa = WholeSlideAnnotation(casepaths["wsa"])

        labels = [a.label.name for a in wsa.annotations] # biopsy labels

        outlines = get_outlines(wsa) # biopsy outlines

        # Labels and outlines are paired by position; a missing outline raises IndexError
        # rather than silently mislabelling a biopsy.
        for b, label in enumerate(labels):
            outline = outlines[b]
            area = get_area(outline, spacing)

            biopsies[f"{casename}_b{b}"] = {
                # Last path component, independent of the path separator in use.
                "dir": os.path.basename(d),
                "casename": casename,
                # `area` is unpacked into wsi.get_patch(x, y, width, height, spacing)
                # elsewhere in this notebook, so index 2 is the width and index 3 the height.
                "height": int(area[3]),
                "width": int(area[2]),
                "label": label,
            }


Analyze biopsy distribution

Largest biopsy: 21376 x 19328 pixels (label: wildtype), i.e. 413,155,328 pixels at a spacing of 0.25 µm/pixel.

The same biopsy at a spacing of 1.0 µm/pixel: 5344 x 4832 pixels, i.e. 25,822,208 pixels.

In [None]:
if not CREATE_DATASET:
    df = pd.DataFrame.from_dict(biopsies, orient="index")
    df["pixels"] = df["height"] * df["width"]

    # Sort by pixels
    df = df.sort_values(by=["pixels"], ascending=False)

    display(df)

    # Bar chart of the number of biopsies per label.
    # Tick labels and bar heights both come from the same value_counts() Series:
    # unique() lists labels in order of appearance while value_counts() sorts by count,
    # so mixing the two can put a count under the wrong label.
    label_counts = df["label"].value_counts()
    plt.figure(figsize=(5,3))
    plt.bar(label_counts.index, label_counts.values)
    plt.xticks(rotation=90)

    # Display counts on top of bars
    for i, v in enumerate(label_counts):
        plt.text(i-0.2, v+1, str(v))
    plt.show()

    def get_first_letters(s):
        """Return every letter before the first non-alpha char. For example: 'RL1' -> 'RL'"""
        for i, c in enumerate(s):
            if not c.isalpha():
                return s[:i]
        return s

    # Count how many RL or RBE numbers there are (in the casename), to make a barchart of that
    df["RL VS RBE"] = df["casename"].apply(get_first_letters)
    prefix_counts = df["RL VS RBE"].value_counts()
    plt.figure(figsize=(5,3))
    plt.bar(prefix_counts.index, prefix_counts.values)
    plt.xticks(rotation=90)

    # Display counts on top of bars
    for i, v in enumerate(prefix_counts):
        plt.text(i-0.2, v+1, str(v))
    plt.show()


    # Make a stacked barchart of RL VS RBE and label.
    # Only the helper "count" column is aggregated, so the unrelated string columns
    # (dir, casename) are not summed along with it.
    df["count"] = 1
    df_stacked = df.groupby(["RL VS RBE", "label"])["count"].sum().unstack(fill_value=0)
    df_stacked.plot.bar(stacked=True, figsize=(10,5))
    plt.xticks(rotation=90)

    # Display counts in the middle of each segment of the bars. `bottom` is the height
    # already stacked in the same bar, so segments of other bars never contribute.
    for bar_index, segments in enumerate(df_stacked.values):
        bottom = 0
        for label_index, v in enumerate(segments):
            # Shift the text a little per label so neighbouring numbers overlap less.
            x = bar_index + label_index * 0.1
            plt.text(x-0.15, bottom + v/2, str(int(v)), ha="center", va="center")
            bottom += v
    plt.show()


    # Make the same chart but now with percentages (each bar is normalised to sum to 1)
    df_stacked = df_stacked.div(df_stacked.sum(axis=1), axis=0)
    df_stacked.plot.bar(stacked=True, figsize=(10,5))
    plt.xticks(rotation=90)

    # Display percentages in the middle of each segment of the bars
    for bar_index, segments in enumerate(df_stacked.values):
        bottom = 0
        for v in segments:
            plt.text(bar_index, bottom + v / 2, f"{v*100:.2f}%", ha="center", va="center")
            bottom += v
    plt.show()


    # # Display rows with label none (They're all solved now :) )
    # none_labels = df[df["label"] == "none"]
    # # Sort by casename
    # none_labels = none_labels.sort_values(by=["casename"])
    # # Only unique casenames
    # none_labels = none_labels.drop_duplicates(subset=["casename"])
    # display(none_labels)

In [None]:
if not CREATE_DATASET:
    # Exclusive upper bound applied to both the height and the width of a biopsy.
    max_size = 128

    # Select biopsies whose height and width are both below max_size
    small_biopsies = df[(df["height"] < max_size) & (df["width"] < max_size)]

    display(small_biopsies)

    # Same label bar chart as before, but now for the small biopsies only.
    # The counts are computed once and reused for the ticks, the bars and the texts.
    small_label_counts = small_biopsies["label"].value_counts()
    plt.figure(figsize=(5,3))
    plt.bar(small_label_counts.index, small_label_counts.values)
    plt.xticks(rotation=90)

    # Display counts on top of bars
    for i, v in enumerate(small_label_counts):
        plt.text(i-0.2, v+1, str(v))
    plt.show()

Create dataset

In [None]:
spacing = 32.0

# NOTE: this cell switches the notebook into dataset-creation mode; the cells guarded by
# `if CREATE_DATASET` below depend on the flag being set here.
CREATE_DATASET = True
if CREATE_DATASET:
    destination = os.path.join(LANS_BIOP_ROOT, "dataset_fullsize")
    dest_biop = os.path.join(destination, f"biopsies_s{spacing}")
    os.makedirs(dest_biop, exist_ok=True)

    biopsy_records = {}  # biopsy name -> metadata row for the label csv
    errors = []          # (path, reason) for every slide or biopsy that could not be exported

    d = LANS_BIOP_DIR
    print("Dir: ", d)
    for casename in tqdm(casenames):
        casepaths = cases[casename]
        wsa = WholeSlideAnnotation(casepaths["wsa"])

        labels = [a.label.name for a in wsa.annotations] # biopsy labels

        outlines = get_outlines(wsa) # biopsy outlines
        try:
            wsi = WholeSlideImage(casepaths["wsi"]) # whole slide image
        except Exception as e:
            # Deliberately best effort: remember the slide that failed to open and go on.
            errors.append((casepaths["wsi"], e))
            continue

        for b, label in enumerate(labels):
            outline = outlines[b]
            area = get_area(outline, spacing)

            biopsy_name = f"{casename}_b{b}"
            biopsy_path = os.path.join(dest_biop, f"{biopsy_name}.png")

            # Biopsies exported by an earlier run are not written again.
            if not os.path.exists(biopsy_path):
                biopsy = wsi.get_patch(*area, spacing)

                # cv2.imwrite interprets 3-channel arrays as BGR, while the patch is RGB
                # (it is shown unconverted with matplotlib's imshow in plot_sub_areas).
                # Convert first so the saved PNG does not have red and blue swapped.
                # NOTE(review): PNGs exported before this fix are skipped above; delete
                # them to re-export with the corrected channel order.
                if biopsy.ndim == 3 and biopsy.shape[2] == 3:
                    biopsy = cv2.cvtColor(biopsy, cv2.COLOR_RGB2BGR)

                # imwrite reports failure through its return value instead of raising.
                if not cv2.imwrite(biopsy_path, biopsy):
                    errors.append((biopsy_path, "cv2.imwrite failed"))
                    continue  # do not list a biopsy that has no image on disk

            biopsy_records[biopsy_name] = {
                "dir": d,
                "casename": casename,
                # `area` is unpacked into wsi.get_patch(x, y, width, height, spacing),
                # so index 2 is the width and index 3 the height.
                "height": int(area[3]),
                "width": int(area[2]),
                "label": label,
            }

    print("Errors: ", len(errors))
    display(errors)

    biopsy_df = pd.DataFrame.from_dict(biopsy_records, orient="index")
    biopsy_df["pixels"] = biopsy_df["height"] * biopsy_df["width"]

    # Save to csv
    biopsy_df.to_csv(os.path.join(destination, "biopsy_labels.csv"))

In [None]:
# How many case names contain "BIG"?
print("BIG: ", sum("BIG" in name for name in casenames))

Errors: 19 — every single one of them is a BIG tiff.

So the failures are clearly caused by something specific to the BIG tiffs.

# Prepare data

In [None]:
if CREATE_DATASET:
    import shutil  # only needed by this cell, for copying the exported PNGs

    print("spacing: ", spacing)

    dataset_root = os.path.join(LANS_BIOP_ROOT, "dataset_fullsize")

    # Use the saved csv to make a prepared dataset: one folder containing all biopsies
    # named by row number (0.png, 1.png, ...), and a csv with the labels.
    df = pd.read_csv(os.path.join(dataset_root, "biopsy_labels.csv"), index_col=0)

    # Every biopsy in biopsies_s<spacing> is copied to biopsies_s<spacing>_anon under the
    # number of its row in the csv (NOT its index, which is the biopsy name).
    source = os.path.join(dataset_root, f"biopsies_s{spacing}")
    destination = os.path.join(dataset_root, f"biopsies_s{spacing}_anon")
    os.makedirs(destination, exist_ok=True)

    # Move the biopsy names into an "index" column so the new integer index is the row number.
    df_clean = df.copy().rename_axis("index").reset_index()
    missing = []
    for i, row in tqdm(df_clean.iterrows(), total=len(df_clean)):
        biopsy_name = row["index"]
        source_path = os.path.join(source, f"{biopsy_name}.png")
        destination_path = os.path.join(destination, f"{i}.png")

        # Stay best effort when an image is missing, but report it instead of relying on
        # the output of a shell command.
        if not os.path.isfile(source_path):
            missing.append(source_path)
            continue
        # shutil works on every platform and needs no shell quoting of the paths.
        shutil.copyfile(source_path, destination_path)

    if missing:
        print(f"Skipped {len(missing)} biopsies without an exported PNG:")
        display(missing)

    anon_csv_path = os.path.join(dataset_root, "biopsy_labels_anon.csv")
    if not os.path.exists(anon_csv_path):
        # Drop all identifying columns, keeping only the row number (index) and the label
        df_clean = df_clean.drop(columns=["dir", "casename", "height", "width", "pixels", "index"])

        # Map labels to numbers according to the following mapping:
        mapping = {
            "wildtype": 0,
            "overexpression": 1,
            "nullmutation": 2,
            "doubleclones": 3,
        }

        # Fail with a clear message when a label has no number; otherwise map() would
        # produce NaN and the integer cast below would fail with an opaque error.
        unmapped = df_clean.loc[~df_clean["label"].isin(list(mapping)), "label"].unique().tolist()
        if unmapped:
            raise ValueError(f"Labels without a numeric mapping: {unmapped}")

        df_clean["label"] = df_clean["label"].map(mapping)
        # It should be an integer
        df_clean["label"] = df_clean["label"].astype(int)

        # Name the index "id"; it is written to the csv as the id column, matching the
        # numbered file names above.
        df_clean.index.name = "id"
        df_clean.to_csv(anon_csv_path)
    else:
        print("biopsy_labels_anon.csv already exists. It's reusable for different spacings, so you don't have to create it again.")