In [1]:
from wholeslidedata.annotation.wholeslideannotation import WholeSlideAnnotation
from wholeslidedata.image.wholeslideimage import WholeSlideImage
from wholeslidedata.annotation.types import PolygonAnnotation as Polygon
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

import cv2

from py.helpers import get_outlines, get_area, get_patch, get_sub_areas, patch_empty, concat_one, BARRET_ROOT
import os

# Register the OpenSlide binary folder as a DLL search directory (needed for
# openslide). `os.add_dll_directory` is only available on Windows, so the call
# is guarded to keep this cell runnable on other platforms.
if hasattr(os, 'add_dll_directory'):
    os.add_dll_directory(r'C:\Program Files\openslide-win64\bin') # for openslide

# Data locations, all derived from BARRET_ROOT (imported from py.helpers).
LANS_DIR = os.path.join(BARRET_ROOT, 'LANS_001-923')
# Root of the biopsy-level p53 data; label CSVs and the review sub-folders live below it.
LANS_BIOP_ROOT = os.path.join(BARRET_ROOT, 'p53_biopsy-level_no-HE_Luuk', '_Luuk')
# Folders whose .tiff / .xml files are listed by the cells below.
LANS_BIOP_DIR = os.path.join(LANS_BIOP_ROOT, 'Slidescape_ASAP')
PATHXL_DIR = os.path.join(LANS_BIOP_ROOT, 'PATHXL')
BOLERO_DIR = os.path.join(BARRET_ROOT, 'BOLERO', 'P53 Bolero')

In [2]:
# Read the biopsy-level labels of the initial high-consensus dataset and
# discard the 'dir' column.
first_dataset_csv = os.path.join(
    LANS_BIOP_ROOT,
    'used_data_thesis',
    'dataset_fullsize',
    'biopsies_s1.0_data',
    'biopsy_labels_s1.0.csv',
)
first_dataset = pd.read_csv(first_dataset_csv).drop(columns='dir')
display(first_dataset.head())

# Distinct case (slide) names, in order of first appearance.
first_dataset_case_nrs = pd.unique(first_dataset['casename']).tolist()
print(first_dataset_case_nrs[:5])
print(len(first_dataset_case_nrs))

# Show every biopsy row whose casename starts with 'RL013'.
is_rl013 = first_dataset['casename'].str.startswith('RL013')
first_dataset.loc[is_rl013]

Unnamed: 0.1,Unnamed: 0,casename,height,width,label,pixels
0,RBE-00101-P53-I_b0,RBE-00101-P53-I,2675,1694,nullmutation,4531450
1,RBE-00101-P53-I_b1,RBE-00101-P53-I,3824,2568,nullmutation,9820032
2,RBE-00102-P53-I_b0,RBE-00102-P53-I,3664,2000,nullmutation,7328000
3,RBE-00102-P53-I_b1,RBE-00102-P53-I,4463,1996,nullmutation,8908148
4,RBE-00102-P53-I_b2,RBE-00102-P53-I,3680,2224,nullmutation,8184320


['RBE-00101-P53-I', 'RBE-00102-P53-I', 'RBE-00103-P53-II', 'RBE-00104-P53-III', 'RBE-00105-P53-II']
419


Unnamed: 0.1,Unnamed: 0,casename,height,width,label,pixels
312,RL013_01_01_01_P53_b0,RL013_01_01_01_P53,2729,2614,wildtype,7133606
313,RL013_01_01_01_P53_b1,RL013_01_01_01_P53,3299,2815,wildtype,9286685
314,RL013_01_01_01_P53_b2,RL013_01_01_01_P53,3517,2282,wildtype,8025794
315,RL013_01_01_01_P53_b3,RL013_01_01_01_P53,3566,3072,wildtype,10954752
316,RL013_01_02_01_P53_b0,RL013_01_02_01_P53,1704,1656,nullmutation,2821824
317,RL013_01_02_01_P53_b1,RL013_01_02_01_P53,2951,3308,wildtype,9761908
318,RL013_01_02_01_P53_b2,RL013_01_02_01_P53,1511,1261,wildtype,1905371
319,RL013_01_02_01_P53_b3,RL013_01_02_01_P53,1560,2725,wildtype,4251000
320,RL013_01_02_01_P53_b4,RL013_01_02_01_P53,1509,1678,nullmutation,2532102
321,RL013_01_03_01_P53_b0,RL013_01_03_01_P53,1547,1744,wildtype,2697968


In [4]:
# Names (without extension) of the .tiff files found in LANS_BIOP_DIR.
first_dataset_files = []
for tiff_name in os.listdir(LANS_BIOP_DIR):
    if tiff_name.endswith('.tiff'):
        first_dataset_files.append(tiff_name.replace('.tiff',''))

# Compare the labelled case names with the files on disk, in both directions.
labelled_without_tiff = set(first_dataset_case_nrs) - set(first_dataset_files)
tiff_without_label = set(first_dataset_files) - set(first_dataset_case_nrs)

# initial actually used 30 files from PATHXL
print("initial more than files:", len(labelled_without_tiff))
print(labelled_without_tiff)
# The 19 BIG tiffs got an error
print("files more than initial:", len(tiff_without_label))
print(tiff_without_label)

initial more than files: 30
{'RL-0072-III-P53', 'RL187_01_01_01_P53', 'RL106_01_04_01_P53', 'RL-0103-II-P53', 'RL470_RT20-60498_P53_IV-1', 'RL-0361-IV-P53', 'RL028_01_04_01_P53', 'RL-0167-VII-P53', 'RL-0172-I-P53', 'RL-0226-I-P53', 'RL015_01_03_01_P53', 'RL854_rT22-051208_I1_P53', 'RL162_01_01_01_P53', 'RL465_RT20-50267_P53-I', 'RL-0560-II-P53', 'RL168_01_02_01_P53', 'RL-0668-III-P53', 'RL106_01_03_01_P53', 'RL-0321-III-P53', 'RL-0474-I-P53', 'RL188_01_01_01_P53', 'RL855_RT22-051174_III1s_P53', 'RL-0129-VI-P53', 'RL-0361-I-P53_1', 'RL013_01_02_01_P53', 'RL-0361-III-P53_1', 'RL174_01_02_01_P53', 'RL055_01_02_01_P53', 'RL141_01_01_01_P53', 'RL112_01_04_01_P53'}
files more than initial: 19
{'RBET18-02110_P53-IX_BIG', 'RBET18-50136_P53-I_BIG', 'RBET18-04863_P53-VI_BIG', 'RBET18-02666_P53-I_BIG', 'RBET18-02110_P53-X_BIG', 'RBET18-02039_P53-I_BIG', 'RBET18-50151_P53-I_BIG', 'RBET18-02665_P53-I_BIG', 'RBET18-06938_P53-XI_BIG', 'RBET18-50101_P53-I_BIG', 'RBET18-50151_P53-II_BIG', 'RBET18-02903

In [5]:
# Slide-level labels of the PATHXL study; the CSV is read from one level
# above LANS_BIOP_ROOT.
pathxl_labels_csv = os.path.join(LANS_BIOP_ROOT, '..', 'pathxl_study_slide_labels.csv')
labels = pd.read_csv(pathxl_labels_csv)
display(labels.head())

# All case numbers listed in the PATHXL label table.
pathxl_case_nrs = labels['case_nr'].tolist()
print(pathxl_case_nrs[:5])
print(len(pathxl_case_nrs))

Unnamed: 0,case_nr,label,nr_biopsies,nr_raters,concordance %,biopsies
0,RL-0013-1-II-P53,Null,5,4,75,"(NM75,WT100,WT100,WT100,WT100)"
1,RL-0027-I-P53,Double clones,4,4,75,"(WT100,WT100,WT100,DC75)"
2,RL-0036-III-P53,Overexpression,2,4,100,"(OE100,OE100)"
3,RL-0042-III-P53_1,Double clones,4,4,75,"(WT75,DC75,OE100,OE100)"
4,RL-0042-IV-P53_1,Overexpression,4,4,75,"(OE75,WT100,WT100,WT100)"


['RL-0013-1-II-P53', 'RL-0027-I-P53', 'RL-0036-III-P53', 'RL-0042-III-P53_1', 'RL-0042-IV-P53_1']
163


In [7]:
# Names (without extension) of the .tiff files found in PATHXL_DIR.
pathxl_files = []
for tiff_name in os.listdir(PATHXL_DIR):
    if tiff_name.endswith('.tiff'):
        pathxl_files.append(tiff_name.replace('.tiff',''))

# Compare the PATHXL label table with the files on disk, in both directions.
pathxl_label_without_tiff = set(pathxl_case_nrs) - set(pathxl_files)
pathxl_tiff_without_label = set(pathxl_files) - set(pathxl_case_nrs)

# There are a few names that are formatted differently
print("pathxl more than files:", len(pathxl_label_without_tiff))
print(pathxl_label_without_tiff)
# More files have been added
print("files more than pathxl:", len(pathxl_tiff_without_label))
print(pathxl_tiff_without_label)

pathxl more than files: 5
{'RL934_I_p53_sme', 'RL960_II_p53_sme', 'RL938_II_p53_sme', 'RL-0345-P53-1', 'RL940_I_p53_sme'}
files more than pathxl: 39
{'RL-0072-III-P53', 'RL187_01_01_01_P53', 'RL-0535-I-P53_2', 'RL106_01_04_01_P53', 'RL-0103-II-P53', 'RL-0053-I-P53', 'RL-0361-IV-P53', 'RL028_01_04_01_P53', 'RL-0167-VII-P53', 'RL-0172-I-P53', 'RL-0226-I-P53', 'RL015_01_03_01_P53', 'RL854_rT22-051208_I1_P53', 'RL162_01_01_01_P53', 'RL465_RT20-50267_P53-I', 'RL-0375-I-P53', 'RL-0560-II-P53', 'RL-0668-III-P53', 'RL168_01_02_01_P53', 'RL-0595-III-P53', 'RL106_01_03_01_P53', 'RL-0321-III-P53', 'RL-0474-I-P53', 'RL118_01_02_01_P53 (2)', 'RL188_01_01_01_P53', 'RL-0909-I-P53', 'RL-0045-I-P53', 'RL-0912-V-P53', 'RL-0129-VI-P53', 'RL-0361-I-P53_1', 'RL855_RT22-051174_III1s_P53', 'RL-0012-II-P53', 'RL-0361-III-P53_1', 'RL-0345-I-P53', 'RL174_01_02_01_P53', 'RL-0918-I-P53', 'RL055_01_02_01_P53', 'RL141_01_01_01_P53', 'RL112_01_04_01_P53'}


In [8]:
# Case names that occur in both label tables (exact string match only).
overlap = set(first_dataset_case_nrs) & set(pathxl_case_nrs)
print(len(overlap))

# The count is 0, but cases 013 and 470 are spelled differently in the two
# tables and do in fact overlap.
# In addition, ~30 cases used in the first dataset ended up on PATHXL later.

0


In [13]:
# Every case name that appears in either label table.
joint_case_nrs = list(set(first_dataset_case_nrs) | set(pathxl_case_nrs))

# Names (without extension) of the .xml files found in LANS_BIOP_DIR and
# PATHXL_DIR together, plus the same list with the '*BIG*' names left out.
joint_xml_names = [name for name in os.listdir(LANS_BIOP_DIR) + os.listdir(PATHXL_DIR) if name.endswith('.xml')]
joint_files = [name.replace('.xml','') for name in joint_xml_names]
joint_files_without_BIG = [name for name in joint_files if 'BIG' not in name]

# Compare the joint labels with the .xml files, in both directions.
joint_labels_without_xml = set(joint_case_nrs) - set(joint_files)
xml_without_joint_label = set(joint_files) - set(joint_case_nrs)
non_big_xml_without_joint_label = set(joint_files_without_BIG) - set(joint_case_nrs)

print("joint labels more than files:", len(joint_labels_without_xml))
print(joint_labels_without_xml)
print("files more than joint labels:", len(xml_without_joint_label))
print(xml_without_joint_label)
print("files without BIG more than joint labels:", len(non_big_xml_without_joint_label))
print(non_big_xml_without_joint_label)

joint labels more than files: 3
{'RL470_RT20-60498_P53_IV-1', 'RL013_01_02_01_P53', 'RL-0345-P53-1'}
files more than joint labels: 36
{'RBET18-02110_P53-IX_BIG', 'rT24-019061_p53_sme', 'RBET18-50136_P53-I_BIG', 'rT24-014159_III_p53_sme', 'RL-0535-I-P53_2', 'RBET18-04863_P53-VI_BIG', 'RBET18-02666_P53-I_BIG', 'RBET18-02110_P53-X_BIG', 'RL-0053-I-P53', 'RBET18-02039_P53-I_BIG', 'rT24-019074_p53_sme', 'RBET18-50151_P53-I_BIG', 'rT24-11302_II_p53_sme', 'RBET18-02665_P53-I_BIG', 'RL-0375-I-P53', 'RBET18-06938_P53-XI_BIG', 'RL-0595-III-P53', 'RBET18-50101_P53-I_BIG', 'RL-0909-I-P53', 'rT24-11302_I_p53_sme', 'RBET18-50151_P53-II_BIG', 'RBET18-02903_P53-I_BIG', 'RBET18-02323_P53-II_BIG', 'RBET18-04030_P53-III_BIG', 'RL-0045-I-P53', 'rT24-018523_p53_sme', 'RBET18-02039_P53-II_BIG', 'RL-0912-V-P53', 'RL-0012-II-P53', 'RBET18-02323_P53-I_BIG', 'rT24-010548_p53_sme', 'RL-0918-I-P53', 'RL-0345-I-P53', 'RBET18-01089_P53-III_BIG', 'RBET18-02110_P53-VII_BIG', 'RBET18-02313_P53-II_BIG'}
files without B

In [12]:
# Load the combined slide + biopsy label table (semicolon-separated CSV) from
# LANS_BIOP_ROOT.
new_labels = pd.read_csv(os.path.join(LANS_BIOP_ROOT, 'p53_slide+biopsy.csv'), sep=';')
# Remove '.xml' from the case_nr. regex=False makes this a literal replacement
# on every pandas version; with regex matching the '.' would match any
# character (e.g. 'Axml' would be stripped as well).
new_labels['case_nr'] = new_labels['case_nr'].str.replace('.xml', '', regex=False)
display(new_labels.head())

# Names (without extension) of the .xml files found in LANS_BIOP_DIR and
# PATHXL_DIR together.
joint_files = [f.replace('.xml','') for f in os.listdir(LANS_BIOP_DIR) + os.listdir(PATHXL_DIR) if f.endswith('.xml')]

# Compare the new labels with the .xml files, in both directions; each
# difference is computed once and reused for the count and the listing.
new_label_cases = set(new_labels['case_nr'])
joint_file_set = set(joint_files)
new_without_file = new_label_cases - joint_file_set
file_without_new = joint_file_set - new_label_cases

print("new more than files:", len(new_without_file))
print(new_without_file)
# Make txt file from the above
# with open('missing_files.txt', 'w') as f:
#     for item in new_without_file:
#         f.write("%s\n" % item)
print("files more than new:", len(file_without_new))
print(file_without_new)

Unnamed: 0,case_nr,label,nr_biopsies,nr_raters,concordance %,biopsies,Study
0,RL-0013-1-II-P53,Null,5,4,75,"(NM75,WT100,WT100,WT100,WT100)",
1,RL-0027-I-P53,Double clones,4,4,75,"(WT100,WT100,WT100,DC75)",
2,RL-0036-III-P53,Overexpression,2,4,100,"(OE100,OE100)",
3,RL-0042-III-P53_1,Double clones,4,4,75,"(WT75,DC75,OE100,OE100)",
4,RL-0042-II-P53_1,Null,3,4,50,"(WT100,WT100,NM50)",


new more than files: 0
set()
files more than new: 293
{'RL093_01_03_01_P53 (2)', 'RL061_01_01_01_P53_2', 'RL187_01_03_01_P53', 'RL057_01_01_01_P53', 'RL170_01_01_01_P53', 'RL089_01_02_01_P53', 'RL034_01_01_01_P53_1', 'RL097_01_03_01_P53', 'RL075_01_02_01_P53_1', 'RL062_01_01_01_P53_1', 'RL064_01_02_01_P53', 'RL187_01_05_01_P53', 'RL047_01_06_01_P53', 'RL160_01_03_01_P53', 'RL129_01_04_01_P53', 'RL154_01_01_01_P53', 'RL151_01_01_01_P53', 'RL136_01_01_01_P53', 'RL100_01_01_01_P53_2', 'RL156_01_01_01_P53', 'RL138_01_01_01_P53', 'RL191_01_01_01_P53', 'RL059_01_02_01_P53', 'RL057_01_03_01_P53', 'RL206_T18-60464_AMC_P53_I-1', 'RL-0024-IV-P53', 'RL007_01_02_01_P53', 'RL029_01_07_01_P53', 'RL033_01_03_01_P53', 'RL029_01_04_01_P53', 'RL134_01_02_01_P53', 'RL762_T22-050282_II1_P53', 'RL178_01_04_01_P53', 'RL134_01_03_01_P53', 'RL117_01_01_01_P53', 'RL175_01_02_01_P53', 'RL050_01_02_01_P53_2', 'RL033_01_02_01_P53', 'RL089_01_01_01_P53_1', 'RL148_01_01_01_P53', 'RL143_01_01_01_P53', 'RL006_01_01_0

# Messy Folders:

In [None]:
# Hand-picked folders (paths relative to LANS_BIOP_ROOT) to compare against
# each other. The list is written out explicitly rather than discovered with
# os.listdir, so folders that hold only isyntax files can be left out.
folders = [
    "2nd Opinion Required",
    *[
        os.path.join("final_test_set_(low_consensus_second_look)", "not_ready", sub_folder)
        for sub_folder in ("duo_28_hans", "duo_28_sybren", "pathxl")
    ],
    # "p53 dubbel ISYNTAX" is skipped: only isyntax
    "P53_niet_duidelijk",
    "P53_score_high_consensus",
    os.path.join("P53_score_high_consensus", "not ready"),
    "P53_score_te_lage_consensus",
    "P53_te_weinig_beoordelingen",
    "te corrigeren_nummers Hans",
    "te corrigeren_nummers Sybren",
    # "Uitzoeken onno ASL" is skipped: only isyntax
    *[
        os.path.join("voor Onno", sub_folder)
        for sub_folder in ("2 obversers necessary", "concordance_high_new", "observer group necessary")
    ],
    # One level above LANS_BIOP_ROOT
    os.path.join("..", "low consensus cases"),
]

In [None]:
# Build a folders x folders matrix whose cell (i, j) is the string
# "common / total": the number of slide files folder i shares with folder j,
# over the number of slide files in folder i. The diagonal is marked 'X'.

# List each folder exactly once (the directory listing does not depend on the
# other folder of the pair, so there is no need to repeat it per pair).
slide_extensions = ('.tiff', '.isyntax')
cases_per_folder = {
    folder: [f for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder)) if f.endswith(slide_extensions)]
    for folder in folders
}
case_sets_per_folder = {folder: set(cases) for folder, cases in cases_per_folder.items()}
# Number of slide files per folder (also used in the row/column labels).
tiffs_per_folder = {folder: len(cases) for folder, cases in cases_per_folder.items()}

common_cases = np.full((len(folders), len(folders)), 'X', dtype=object)
for i, folder1 in enumerate(folders):
    for j, folder2 in enumerate(folders):
        if i == j:
            continue  # diagonal keeps its 'X' marker
        n_common = len(case_sets_per_folder[folder1] & case_sets_per_folder[folder2])
        common_cases[i, j] = f"{n_common} / {tiffs_per_folder[folder1]}"

# Label each row/column with the last path component and the slide count.
folders_display = [f.split(os.sep)[-1].replace('_',' ')+f" ({tiffs_per_folder[f]})" for f in folders]
common_cases_df = pd.DataFrame(common_cases, index=folders_display, columns=folders_display)
display(common_cases_df)

In [None]:

from collections import defaultdict

# Case names (without extension) of the .tiff files in the high-consensus
# folder and in the 'pathxl' second-look folder.
high_consensus = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'P53_score_high_consensus')) if f.endswith('.tiff')]
print(len(high_consensus))
pathxl = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'pathxl')) if f.endswith('.tiff')]
print(len(pathxl))

# Find high-consensus cases that also occur in any of the other folders.
#   duplicates:              case name -> folders that contain it as well
#   folders_with_duplicates: folder -> "<duplicated .tiff files> / <.tiff files in folder>"
# Only .tiff files are considered, so the duplicate count and the total are
# taken from the same listing; entries without a .tiff extension (for example a
# sub-directory named after a case) can no longer inflate the count.
high_consensus_lookup = set(high_consensus)  # constant-time membership test
duplicates = defaultdict(list)
folders_with_duplicates = defaultdict(int)
tiff_count_per_folder = {}
for folder in folders:
    if folder == 'P53_score_high_consensus':
        continue
    folder_cases = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder)) if f.endswith('.tiff')]
    tiff_count_per_folder[folder] = len(folder_cases)
    for case_name in folder_cases:
        if case_name in high_consensus_lookup:
            duplicates[case_name].append(folder)
            folders_with_duplicates[folder] += 1

# Turn the raw counts into "duplicates / total" strings for display.
for folder, n_duplicates in list(folders_with_duplicates.items()):
    folders_with_duplicates[folder] = f"{n_duplicates} / {tiff_count_per_folder[folder]}"

In [None]:
# Labelled PATHXL case numbers without a .tiff in the 'pathxl' folder,
# followed by the .tiff names in that folder that have no label.
pathxl_labelled_set = set(pathxl_case_nrs)
pathxl_folder_set = set(pathxl)
display(pathxl_labelled_set - pathxl_folder_set)
display(pathxl_folder_set - pathxl_labelled_set)

In [None]:
# First-dataset case numbers without a .tiff in the high-consensus folder,
# followed by the .tiff names in that folder that are not in the first dataset.
first_dataset_case_set = set(first_dataset_case_nrs)
high_consensus_folder_set = set(high_consensus)
display(first_dataset_case_set - high_consensus_folder_set)
display(high_consensus_folder_set - first_dataset_case_set)

In [None]:
# Case names (without extension) of the .tiff files in the two 'duo_28' folders.
duo_28_parent = os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready')
hans_28 = [
    tiff_name.replace('.tiff','')
    for tiff_name in os.listdir(os.path.join(duo_28_parent, 'duo_28_hans'))
    if tiff_name.endswith('.tiff')
]
sybren_28 = [
    tiff_name.replace('.tiff','')
    for tiff_name in os.listdir(os.path.join(duo_28_parent, 'duo_28_sybren'))
    if tiff_name.endswith('.tiff')
]

# Cases present in only one of the two folders.
hans_28_set, sybren_28_set = set(hans_28), set(sybren_28)
display(hans_28_set - sybren_28_set)
display(sybren_28_set - hans_28_set)

# Number of hans_28 cases that are absent from the high-consensus folder and
# from the pathxl folder, respectively.
display(len(hans_28_set - set(high_consensus)))
display(len(hans_28_set - set(pathxl)))

In [None]:
from itertools import zip_longest

# Compare the raw text of every XML in duo_28_hans with the XML of the same
# name in duo_28_sybren and print the lines that differ.
duo_28_parent_dir = os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready')
hans_28_dir = os.path.join(duo_28_parent_dir, 'duo_28_hans')
sybren_28_dir = os.path.join(duo_28_parent_dir, 'duo_28_sybren')
hans_28_xmls = [f.replace('.xml','') for f in os.listdir(hans_28_dir) if f.endswith('.xml')]
sybren_28_xmls = [f.replace('.xml','') for f in os.listdir(sybren_28_dir) if f.endswith('.xml')]

# XMLs that exist in only one folder cannot be compared; report them instead
# of failing with FileNotFoundError when opening the missing counterpart.
sybren_28_xml_set = set(sybren_28_xmls)
for xml in sorted(sybren_28_xml_set - set(hans_28_xmls)):
    print(f"XML {xml} only exists in duo_28_sybren")

for xml in hans_28_xmls:
    if xml not in sybren_28_xml_set:
        print(f"XML {xml} only exists in duo_28_hans")
        continue
    with open(os.path.join(hans_28_dir, f"{xml}.xml"), 'r') as f:
        hans_28_xml = f.read()
    with open(os.path.join(sybren_28_dir, f"{xml}.xml"), 'r') as f:
        sybren_28_xml = f.read()
    if hans_28_xml != sybren_28_xml:
        print(f"XML {xml} is different")
        # Print the differences line by line. zip_longest keeps going until the
        # longer file is exhausted, so extra trailing lines are reported too
        # (a plain zip would silently stop at the shorter file).
        hans_lines = hans_28_xml.split('\n')
        sybren_lines = sybren_28_xml.split('\n')
        for line_nr, (h, s) in enumerate(zip_longest(hans_lines, sybren_lines, fillvalue='<no line>')):
            if h != s:
                print(f"Line {line_nr}:")
                print(f"Hans: {h}")
                print(f"Sybren: {s}")
                print()

In [None]:
# DOESN'T WORK (libvips problem) - the cell below is kept disabled for reference only
# # Add vips library to path (C:\Users\lverheijen\AppData\Local\miniconda3\envs\wsi\Lib\site-packages)
# import sys
# # sys.path.append(r'H:\libvips-8.15.2')
# sys.path.append(r"H:\vips-dev-8.15\bin")
# os.add_dll_directory(r"H:\vips-dev-8.15\bin")

# dll_name = 'libvips-42.dll'
# print(dll_name in os.listdir(r"H:\vips-dev-8.15\bin"))

# import pyvips

# # Function to convert mrxs to tiff
# def mrxs_to_tiff(mrxs_path, tiff_path):
#     slide = pyvips.Image.new_from_file(mrxs_path, level=0)
#     slide.tiffsave(tiff_path, tile=True, compression='lzw', pyramid=True)
#     print(f"Converted {mrxs_path} to {tiff_path}")

# # Convert all mrxs to tiff in pathxl folder
# pathxl_folder = os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'pathxl')
# for f in os.listdir(pathxl_folder):
#     if f.endswith('.mrxs'):
#         mrxs_to_tiff(os.path.join(pathxl_folder, f), os.path.join(pathxl_folder, f.replace('.mrxs','.tiff')))
# print('Done')

In [None]:
# Per folder: "<entries matching a high-consensus case> / <.tiff files in the folder>".
display(folders_with_duplicates)

In [None]:
# Number of high-consensus cases that also occur in at least one other
# folder, followed by the case -> folders mapping itself.
n_duplicated_cases = len(duplicates)
print(n_duplicated_cases)
display(duplicates)