In [1]:
from wholeslidedata.annotation.wholeslideannotation import WholeSlideAnnotation
from wholeslidedata.image.wholeslideimage import WholeSlideImage
from wholeslidedata.annotation.types import PolygonAnnotation as Polygon
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

import cv2

from py.helpers import get_outlines, get_area, get_patch, get_sub_areas, patch_empty, concat_one, BARRET_ROOT
import os

os.add_dll_directory(r'C:\Program Files\openslide-win64\bin') # for openslide

LANS_DIR = os.path.join(BARRET_ROOT, 'LANS_001-923')
LANS_BIOP_ROOT = os.path.join(BARRET_ROOT, 'p53_experiment_luuk_biopsy-level_no-HE')
LANS_BIOP_DIR = os.path.join(LANS_BIOP_ROOT, 'P53_score_high_consensus')
BOLERO_DIR = os.path.join(BARRET_ROOT, 'BOLERO', 'P53 Bolero')

In [2]:
# Load labels for the initial high consensus dataset
first_dataset = pd.read_csv(os.path.join(LANS_BIOP_ROOT, 'dataset_fullsize', 'biopsies_s1.0_data', 'biopsy_labels_s1.0.csv'))
first_dataset.drop('dir', axis=1, inplace=True)
display(first_dataset.head())
first_dataset_case_nrs = first_dataset['casename'].unique().tolist()
print(first_dataset_case_nrs[:5])
print(len(first_dataset_case_nrs))

Unnamed: 0.1,Unnamed: 0,casename,height,width,label,pixels
0,RBE-00101-P53-I_b0,RBE-00101-P53-I,2675,1694,nullmutation,4531450
1,RBE-00101-P53-I_b1,RBE-00101-P53-I,3824,2568,nullmutation,9820032
2,RBE-00102-P53-I_b0,RBE-00102-P53-I,3664,2000,nullmutation,7328000
3,RBE-00102-P53-I_b1,RBE-00102-P53-I,4463,1996,nullmutation,8908148
4,RBE-00102-P53-I_b2,RBE-00102-P53-I,3680,2224,nullmutation,8184320


['RBE-00101-P53-I', 'RBE-00102-P53-I', 'RBE-00103-P53-II', 'RBE-00104-P53-III', 'RBE-00105-P53-II']
419


In [3]:
# Load labels for pathxl dataset
labels = pd.read_csv(os.path.join(LANS_BIOP_ROOT, 'pathxl_study_slide_labels.csv'))
display(labels.head())
pathxl_case_nrs = labels['case_nr'].to_list()
print(pathxl_case_nrs[:5])
print(len(pathxl_case_nrs))

Unnamed: 0,case_nr,label,nr_biopsies,nr_raters,concordance %,biopsies
0,RL-0013-1-II-P53,Null,5,4,75,"(NM75,WT100,WT100,WT100,WT100)"
1,RL-0027-I-P53,Double clones,4,4,75,"(WT100,WT100,WT100,DC75)"
2,RL-0036-III-P53,Overexpression,2,4,100,"(OE100,OE100)"
3,RL-0042-III-P53_1,Double clones,4,4,75,"(WT75,DC75,OE100,OE100)"
4,RL-0042-IV-P53_1,Overexpression,4,4,75,"(OE75,WT100,WT100,WT100)"


['RL-0013-1-II-P53', 'RL-0027-I-P53', 'RL-0036-III-P53', 'RL-0042-III-P53_1', 'RL-0042-IV-P53_1']
163


In [20]:
# Check the overlap between the two datasets
overlap = set(first_dataset_case_nrs).intersection(set(pathxl_case_nrs))
print(len(overlap))

0


In [4]:
# Get row for case_nr RL-0345-P53-1
case_nr = 'RL-0345-P53-1'
case_row = labels[labels['case_nr'] == case_nr]
display(case_row)

Unnamed: 0,case_nr,label,nr_biopsies,nr_raters,concordance %,biopsies
28,RL-0345-P53-1,Double clones,2,4,75,"(NM100,DC75)"


In [5]:
# folders = os.listdir(LANS_BIOP_ROOT)
# folders = [f for f in folders if os.path.isdir(os.path.join(LANS_BIOP_ROOT, f))]
folders = [
    "2nd Opinion Required",
    os.path.join("final_test_set_(low_consensus_second_look)", "not_ready", "duo_28_hans"),
    os.path.join("final_test_set_(low_consensus_second_look)", "not_ready", "duo_28_sybren"),
    os.path.join("final_test_set_(low_consensus_second_look)", "not_ready", "pathxl"),
    # "p53 dubbel ISYNTAX", # Only isyntax
    "P53_niet_duidelijk",
    "P53_score_high_consensus",
    os.path.join("P53_score_high_consensus", "not ready"),
    "P53_score_te_lage_consensus",
    "P53_te_weinig_beoordelingen",
    "te corrigeren_nummers Hans",
    "te corrigeren_nummers Sybren",
    # "Uitzoeken onno ASL", # Only isyntax
    os.path.join("voor Onno", "2 obversers necessary"),
    os.path.join("voor Onno", "concordance_high_new"),
    os.path.join("voor Onno", "observer group necessary"),
    # Upper folders
    os.path.join("..", "low consensus cases")
]

In [21]:
# Make a matrix of folders x folders with the number of cases in common like the string "common / total"
common_cases = np.zeros((len(folders), len(folders)), dtype=object)
tiffs_per_folder = {}
for i, folder1 in enumerate(folders):
    folder1_cases = [f for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder1)) if f.endswith('.tiff') or f.endswith('.isyntax')]
    tiffs_per_folder[folder1] = len(folder1_cases)
    for j, folder2 in enumerate(folders):
        if i == j:
            common_cases[i, j] = 'X'
            continue
        folder2_cases = [f for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder2)) if f.endswith('.tiff') or f.endswith('.isyntax')]
        common_cases[i, j] = f"{len(set(folder1_cases) & set(folder2_cases))} / {len(folder1_cases)}"
folders_display = [f.split(os.sep)[-1].replace('_',' ')+f" ({tiffs_per_folder[f]})" for f in folders]
common_cases_df = pd.DataFrame(common_cases, index=folders_display, columns=folders_display)
display(common_cases_df)

Unnamed: 0,2nd Opinion Required (32),duo 28 hans (28),duo 28 sybren (28),pathxl (161),P53 niet duidelijk (4),P53 score high consensus (442),not ready (20),P53 score te lage consensus (38),P53 te weinig beoordelingen (20),te corrigeren nummers Hans (58),te corrigeren nummers Sybren (44),2 obversers necessary (16),concordance high new (26),observer group necessary (139),low consensus cases (212)
2nd Opinion Required (32),X,0 / 32,0 / 32,1 / 32,0 / 32,1 / 32,1 / 32,1 / 32,0 / 32,1 / 32,1 / 32,0 / 32,0 / 32,0 / 32,0 / 32
duo 28 hans (28),0 / 28,X,28 / 28,9 / 28,0 / 28,17 / 28,0 / 28,0 / 28,0 / 28,0 / 28,0 / 28,14 / 28,0 / 28,0 / 28,2 / 28
duo 28 sybren (28),0 / 28,28 / 28,X,9 / 28,0 / 28,17 / 28,0 / 28,0 / 28,0 / 28,0 / 28,0 / 28,14 / 28,0 / 28,0 / 28,2 / 28
pathxl (161),1 / 161,9 / 161,9 / 161,X,0 / 161,0 / 161,1 / 161,1 / 161,0 / 161,1 / 161,1 / 161,6 / 161,0 / 161,116 / 161,125 / 161
P53 niet duidelijk (4),0 / 4,0 / 4,0 / 4,0 / 4,X,0 / 4,0 / 4,0 / 4,0 / 4,0 / 4,0 / 4,0 / 4,0 / 4,0 / 4,0 / 4
P53 score high consensus (442),1 / 442,17 / 442,17 / 442,0 / 442,0 / 442,X,0 / 442,21 / 442,20 / 442,41 / 442,27 / 442,8 / 442,0 / 442,0 / 442,2 / 442
not ready (20),1 / 20,0 / 20,0 / 20,1 / 20,0 / 20,0 / 20,X,17 / 20,0 / 20,17 / 20,17 / 20,0 / 20,0 / 20,0 / 20,0 / 20
P53 score te lage consensus (38),1 / 38,0 / 38,0 / 38,1 / 38,0 / 38,21 / 38,17 / 38,X,0 / 38,38 / 38,29 / 38,0 / 38,0 / 38,0 / 38,0 / 38
P53 te weinig beoordelingen (20),0 / 20,0 / 20,0 / 20,0 / 20,0 / 20,20 / 20,0 / 20,0 / 20,X,20 / 20,15 / 20,0 / 20,0 / 20,0 / 20,0 / 20
te corrigeren nummers Hans (58),1 / 58,0 / 58,0 / 58,1 / 58,0 / 58,41 / 58,17 / 58,38 / 58,20 / 58,X,44 / 58,0 / 58,0 / 58,0 / 58,0 / 58


In [7]:

high_consensus = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'P53_score_high_consensus')) if f.endswith('.tiff')]
print(len(high_consensus))
pathxl = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'pathxl')) if f.endswith('.tiff')]
print(len(pathxl))

# Find duplicates throughout the folders
from collections import defaultdict
duplicates = defaultdict(list)
folders_with_duplicates = defaultdict(int)
for folder in folders:
    if folder == 'P53_score_high_consensus':
        continue
    for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder)):
        f = f.replace('.tiff','')
        if f in high_consensus:
            duplicates[f].append(folder)
            folders_with_duplicates[folder] += 1
for folder in folders_with_duplicates:
    folders_with_duplicates[folder] = f"{folders_with_duplicates[folder]} / {len([f for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder)) if f.endswith('.tiff')])}"

442
161


In [8]:
# Which case number are missing from pathxl
display(set(pathxl_case_nrs) - set(pathxl))
display(set(pathxl) - set(pathxl_case_nrs))

{'RL934_I_p53_sme', 'RL938_II_p53_sme', 'RL940_I_p53_sme', 'RL960_II_p53_sme'}

{'RL-0535-I-P53_1', 'RL-0595-III-P53'}

In [14]:
# Do the same for first dataset case numbers and the high consensus folder
display(set(first_dataset_case_nrs) - set(high_consensus))
display(set(high_consensus) - set(first_dataset_case_nrs))

set()

{'RBET18-01089_P53-III_BIG',
 'RBET18-02039_P53-II_BIG',
 'RBET18-02039_P53-I_BIG',
 'RBET18-02110_P53-IX_BIG',
 'RBET18-02110_P53-VII_BIG',
 'RBET18-02110_P53-X_BIG',
 'RBET18-02313_P53-II_BIG',
 'RBET18-02323_P53-II_BIG',
 'RBET18-02323_P53-I_BIG',
 'RBET18-02665_P53-I_BIG',
 'RBET18-02666_P53-I_BIG',
 'RBET18-02903_P53-I_BIG',
 'RBET18-04030_P53-III_BIG',
 'RBET18-04863_P53-VI_BIG',
 'RBET18-06938_P53-XI_BIG',
 'RBET18-50101_P53-I_BIG',
 'RBET18-50136_P53-I_BIG',
 'RBET18-50151_P53-II_BIG',
 'RBET18-50151_P53-I_BIG',
 'RL052_01_01_01_P53',
 'RL094_01_02_01_P53',
 'RL118_01_01_01_P53_2',
 'RL177_01_02_01_P53'}

In [18]:
hans_28 = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'duo_28_hans')) if f.endswith('.tiff')]
sybren_28 = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'duo_28_sybren')) if f.endswith('.tiff')]

# Check these two folders in the same way
display(set(hans_28) - set(sybren_28))
display(set(sybren_28) - set(hans_28))

# Check hans_28 against high consensus and pathxl in the same way
display(len(set(hans_28) - set(high_consensus)))
display(len(set(hans_28) - set(pathxl)))

set()

set()

11

19

In [23]:
# Test for all XMLs in the duo 28 folders whether they are the same (just as raw files)
hans_28_xmls = [f.replace('.xml','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'duo_28_hans')) if f.endswith('.xml')]
sybren_28_xmls = [f.replace('.xml','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'duo_28_sybren')) if f.endswith('.xml')]

for i, xml in enumerate(hans_28_xmls):
    with open(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'duo_28_hans', f"{xml}.xml"), 'r') as f:
        hans_28_xml = f.read()
    with open(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'duo_28_sybren', f"{xml}.xml"), 'r') as f:
        sybren_28_xml = f.read()
    if hans_28_xml != sybren_28_xml:
        print(f"XML {xml} is different")
        # Print the differences
        hans_28_xml = hans_28_xml.split('\n')
        sybren_28_xml = sybren_28_xml.split('\n')
        for i, (h, s) in enumerate(zip(hans_28_xml, sybren_28_xml)):
            if h != s:
                print(f"Line {i}:")
                print(f"Hans: {h}")
                print(f"Sybren: {s}")
                print()

XML RL-0024-IV-P53 is different
Line 27:
Hans: <Annotation Color="#000000" Name="Annotation 2" PartOfGroup="WildType" Type="Rectangle">
Sybren: <Annotation Color="#000000" Name="Annotation 2" PartOfGroup="exclude" Type="Rectangle">

XML RL-0103-II-P53 is different
Line 75:
Hans: </Annotations></ASAP_Annotations>
Sybren: <Annotation Color="#000000" Name="Annotation 8" PartOfGroup="roiNullMutation" Type="Polygon">

Line 76:
Hans: 
Sybren: 	<Coordinates>

XML RL-0130-IV-P53 is different
Line 13:
Hans: 		<Coordinate Order="0" X="133324" Y="586.781"/>
Sybren: 		<Coordinate Order="0" X="123628" Y="17450.8"/>

Line 14:
Hans: 		<Coordinate Order="1" X="124172" Y="586.781"/>
Sybren: 		<Coordinate Order="1" X="123628" Y="122.781"/>

Line 15:
Hans: 		<Coordinate Order="2" X="124172" Y="17450.8"/>
Sybren: 		<Coordinate Order="2" X="133324" Y="122.781"/>

Line 37:
Hans: 		<Coordinate Order="0" X="38340.8" Y="49421.8"/>
Sybren: 		<Coordinate Order="0" X="36664.3" Y="50702.1"/>

Line 38:
Hans: 		<Coo

In [13]:
# DOESN'T WORK, FUCKING LIBVIPS
# # Add vips library to path (C:\Users\lverheijen\AppData\Local\miniconda3\envs\wsi\Lib\site-packages)
# import sys
# # sys.path.append(r'H:\libvips-8.15.2')
# sys.path.append(r"H:\vips-dev-8.15\bin")
# os.add_dll_directory(r"H:\vips-dev-8.15\bin")

# dll_name = 'libvips-42.dll'
# print(dll_name in os.listdir(r"H:\vips-dev-8.15\bin"))

# import pyvips

# # Function to convert mrxs to tiff
# def mrxs_to_tiff(mrxs_path, tiff_path):
#     slide = pyvips.Image.new_from_file(mrxs_path, level=0)
#     slide.tiffsave(tiff_path, tile=True, compression='lzw', pyramid=True)
#     print(f"Converted {mrxs_path} to {tiff_path}")

# # Convert all mrxs to tiff in pathxl folder
# pathxl_folder = os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'pathxl')
# for f in os.listdir(pathxl_folder):
#     if f.endswith('.mrxs'):
#         mrxs_to_tiff(os.path.join(pathxl_folder, f), os.path.join(pathxl_folder, f.replace('.mrxs','.tiff')))
# print('Done')

True


OSError: cannot load library 'libvips-42.dll': error 0x7e.  Additionally, ctypes.util.find_library() did not manage to locate a library called 'libvips-42.dll'

In [None]:
display(folders_with_duplicates)

In [None]:
print(len(duplicates.keys()))
display(duplicates)