In [None]:
from wholeslidedata.annotation.wholeslideannotation import WholeSlideAnnotation
from wholeslidedata.image.wholeslideimage import WholeSlideImage
from wholeslidedata.annotation.types import PolygonAnnotation as Polygon
from matplotlib import pyplot as plt
import numpy as np
import os
import pandas as pd
from tqdm import tqdm

import cv2

from py.helpers import get_outlines, get_area, get_patch, get_sub_areas, patch_empty, concat_one, BARRET_ROOT
import os

# Register the OpenSlide binaries so the native DLLs can be resolved.
# os.add_dll_directory only exists on Windows builds of Python; guarding the
# call keeps this cell from raising AttributeError on other platforms.
if hasattr(os, 'add_dll_directory'):
    os.add_dll_directory(r'C:\Program Files\openslide-win64\bin') # for openslide

# Data locations used throughout the notebook, all derived from BARRET_ROOT
# (imported from py.helpers).
LANS_DIR = os.path.join(BARRET_ROOT, 'LANS_001-923')
LANS_BIOP_ROOT = os.path.join(BARRET_ROOT, 'p53_biopsy-level_no-HE_Luuk', '_Luuk')
LANS_BIOP_DIR = os.path.join(LANS_BIOP_ROOT, 'Slidescape_ASAP')
BOLERO_DIR = os.path.join(BARRET_ROOT, 'BOLERO', 'P53 Bolero')
PATHXL_DIR = os.path.join(LANS_BIOP_ROOT, 'PATHXL cases')
PATHXL_DIR2 = os.path.join(LANS_BIOP_ROOT, 'PATHXL slides')

In [None]:
# Read the label table of the initial high consensus dataset.
first_dataset_csv = os.path.join(
    LANS_BIOP_ROOT,
    'used_data_thesis',
    'dataset_fullsize',
    'biopsies_s1.0_data',
    'biopsy_labels_s1.0.csv',
)
# The 'dir' column is discarded immediately after loading.
first_dataset = pd.read_csv(first_dataset_csv).drop(columns='dir')
display(first_dataset.head())

# Distinct values of the 'casename' column, in order of first appearance.
unique_casenames = first_dataset['casename'].unique()
first_dataset_case_nrs = unique_casenames.tolist()
print(first_dataset_case_nrs[:5])
print(len(first_dataset_case_nrs))

In [None]:
# Collect the slide names (file name with '.tiff' removed) found in LANS_BIOP_DIR.
first_dataset_files = [
    entry.replace('.tiff','')
    for entry in os.listdir(LANS_BIOP_DIR)
    if entry.endswith('.tiff')
]

# Two-way comparison between the labelled case names and the slides on disk.
labelled_cases = set(first_dataset_case_nrs)
slides_on_disk = set(first_dataset_files)
labelled_without_slide = labelled_cases - slides_on_disk
slides_without_label = slides_on_disk - labelled_cases

print("initial more than files:", len(labelled_without_slide))
print(labelled_without_slide)
print("files more than initial:", len(slides_without_label))
print(slides_without_label)

In [None]:
# Read the slide-level labels of the pathxl study; the CSV sits one directory
# above LANS_BIOP_ROOT.
pathxl_labels_csv = os.path.join(LANS_BIOP_ROOT, '..', 'pathxl_study_slide_labels.csv')
labels = pd.read_csv(pathxl_labels_csv)
display(labels.head())

# Every value of the 'case_nr' column (duplicates, if any, are kept).
pathxl_case_nrs = labels['case_nr'].tolist()
print(pathxl_case_nrs[:5])
print(len(pathxl_case_nrs))

In [None]:
# Collect the slide names (file name with '.tiff' removed) found in PATHXL_DIR.
pathxl_files = []
for entry in os.listdir(PATHXL_DIR):
    if entry.endswith('.tiff'):
        pathxl_files.append(entry.replace('.tiff',''))

# Two-way comparison between the pathxl case numbers and the slides on disk.
pathxl_labelled = set(pathxl_case_nrs)
pathxl_on_disk = set(pathxl_files)
pathxl_labelled_without_slide = pathxl_labelled - pathxl_on_disk
pathxl_slides_without_label = pathxl_on_disk - pathxl_labelled

print("pathxl more than files:", len(pathxl_labelled_without_slide))
print(pathxl_labelled_without_slide)
print("files more than pathxl:", len(pathxl_slides_without_label))
print(pathxl_slides_without_label)

In [None]:
# Collect the slide names (file name with '.tiff' removed) found in PATHXL_DIR2.
pathxl_files2 = [
    entry.replace('.tiff','')
    for entry in os.listdir(PATHXL_DIR2)
    if entry.endswith('.tiff')
]

# Print the same report twice: first against the first dataset's case names,
# then against the pathxl case numbers. Each report shows the intersection and
# the slides that only exist in PATHXL_DIR2.
for reference_case_nrs in (first_dataset_case_nrs, pathxl_case_nrs):
    shared_with_reference = set(pathxl_files2) & set(reference_case_nrs)
    only_in_pathxl2 = set(pathxl_files2) - set(reference_case_nrs)
    print("intersection:", len(shared_with_reference))
    print(shared_with_reference)
    print("pathxl2 more than files:", len(only_in_pathxl2))
    print(only_in_pathxl2)

In [None]:
# Case names that appear in both the first dataset and the pathxl labels.
overlap = set(first_dataset_case_nrs) & set(pathxl_case_nrs)
print(len(overlap))

# Caveat: the count is 0, but cases 013 and 470 are spelled differently in the
# two lists and do in fact overlap.
# In addition, ~30 cases used in the first dataset ended up on PATHXL later.

In [None]:
# Show the label row(s) whose 'case_nr' equals RL-0345-P53-1.
case_nr = 'RL-0345-P53-1'
is_requested_case = labels['case_nr'].eq(case_nr)
case_row = labels.loc[is_requested_case]
display(case_row)

# Messy Folders:

In [None]:
# Hand-picked list of folders (relative to LANS_BIOP_ROOT) to inspect.
# Earlier approach, kept for reference: take every sub-directory automatically.
# folders = os.listdir(LANS_BIOP_ROOT)
# folders = [f for f in folders if os.path.isdir(os.path.join(LANS_BIOP_ROOT, f))]
second_look_not_ready = os.path.join("final_test_set_(low_consensus_second_look)", "not_ready")
voor_onno = "voor Onno"

folders = [
    "2nd Opinion Required",
    *(os.path.join(second_look_not_ready, sub) for sub in ("duo_28_hans", "duo_28_sybren", "pathxl")),
    # "p53 dubbel ISYNTAX", # Only isyntax
    "P53_niet_duidelijk",
    "P53_score_high_consensus",
    os.path.join("P53_score_high_consensus", "not ready"),
    "P53_score_te_lage_consensus",
    "P53_te_weinig_beoordelingen",
    "te corrigeren_nummers Hans",
    "te corrigeren_nummers Sybren",
    # "Uitzoeken onno ASL", # Only isyntax
    *(
        os.path.join(voor_onno, sub)
        for sub in ("2 obversers necessary", "concordance_high_new", "observer group necessary")
    ),
    # Upper folders
    os.path.join("..", "low consensus cases"),
]

In [None]:
# Build a folders x folders table whose cell (row, column) is the string
# "common / total": the number of slide files (.tiff or .isyntax) the two
# folders share, over the number of slide files in the row folder.
# The diagonal is marked with 'X'.
slide_extensions = ('.tiff', '.isyntax')

# List every folder exactly once up front. Listing inside the pair loop would
# cost len(folders)**2 directory scans for the same information.
slides_per_folder = {
    folder: [
        f for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder))
        if f.endswith(slide_extensions)
    ]
    for folder in folders
}
slide_sets = {folder: set(names) for folder, names in slides_per_folder.items()}
# Despite the name, this counts .tiff and .isyntax files together.
tiffs_per_folder = {folder: len(names) for folder, names in slides_per_folder.items()}

common_cases = np.zeros((len(folders), len(folders)), dtype=object)
for i, folder1 in enumerate(folders):
    for j, folder2 in enumerate(folders):
        if i == j:
            common_cases[i, j] = 'X'
            continue
        n_common = len(slide_sets[folder1] & slide_sets[folder2])
        common_cases[i, j] = f"{n_common} / {tiffs_per_folder[folder1]}"

# Row/column labels: last path component, underscores shown as spaces, with the
# folder's slide count in parentheses.
folders_display = [f.split(os.sep)[-1].replace('_',' ')+f" ({tiffs_per_folder[f]})" for f in folders]
common_cases_df = pd.DataFrame(common_cases, index=folders_display, columns=folders_display)
display(common_cases_df)

In [None]:
from collections import defaultdict

# Slide names (file name with '.tiff' removed) in the high consensus folder
# and in the not_ready/pathxl folder.
high_consensus = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'P53_score_high_consensus')) if f.endswith('.tiff')]
print(len(high_consensus))
pathxl = [f.replace('.tiff','') for f in os.listdir(os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'pathxl')) if f.endswith('.tiff')]
print(len(pathxl))

# Find high consensus slides that also exist as .tiff in any other folder.
#   duplicates:              slide name -> list of folders that also contain it
#   folders_with_duplicates: folder -> "<duplicates> / <.tiff files in folder>"
high_consensus_names = set(high_consensus)  # set: constant-time membership tests
duplicates = defaultdict(list)
folders_with_duplicates = defaultdict(int)
tiff_count_per_folder = {}
for folder in folders:
    if folder == 'P53_score_high_consensus':
        continue
    # Only .tiff files are considered. Without this filter any other entry whose
    # name equals a slide name (for example a directory named after the slide)
    # would be counted as a duplicate, while the denominator below counts
    # .tiff files only.
    tiff_names = [f for f in os.listdir(os.path.join(LANS_BIOP_ROOT, folder)) if f.endswith('.tiff')]
    tiff_count_per_folder[folder] = len(tiff_names)
    for tiff_name in tiff_names:
        slide_name = tiff_name.replace('.tiff','')
        if slide_name in high_consensus_names:
            duplicates[slide_name].append(folder)
            folders_with_duplicates[folder] += 1

# Turn the raw counts into "duplicates / total" strings for display.
for folder in folders_with_duplicates:
    folders_with_duplicates[folder] = f"{folders_with_duplicates[folder]} / {tiff_count_per_folder[folder]}"

In [None]:
# Mismatch between the pathxl case numbers from the label file and the .tiff
# names listed from the not_ready/pathxl folder, in both directions.
pathxl_label_cases = set(pathxl_case_nrs)
pathxl_folder_cases = set(pathxl)
display(pathxl_label_cases - pathxl_folder_cases)
display(pathxl_folder_cases - pathxl_label_cases)

In [None]:
# Same two-way check between the first dataset's case names and the .tiff
# names listed from the high consensus folder.
first_dataset_cases = set(first_dataset_case_nrs)
high_consensus_cases = set(high_consensus)
display(first_dataset_cases - high_consensus_cases)
display(high_consensus_cases - first_dataset_cases)

In [None]:
# Slide names (file name with '.tiff' removed) in the two duo_28 folders.
not_ready_dir = os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready')
hans_28_dir = os.path.join(not_ready_dir, 'duo_28_hans')
sybren_28_dir = os.path.join(not_ready_dir, 'duo_28_sybren')

hans_28 = [name.replace('.tiff','') for name in os.listdir(hans_28_dir) if name.endswith('.tiff')]
sybren_28 = [name.replace('.tiff','') for name in os.listdir(sybren_28_dir) if name.endswith('.tiff')]

# Slides present in only one of the two folders.
hans_28_cases = set(hans_28)
sybren_28_cases = set(sybren_28)
display(hans_28_cases - sybren_28_cases)
display(sybren_28_cases - hans_28_cases)

# Number of hans_28 slides that are absent from high consensus / from pathxl.
display(len(hans_28_cases - set(high_consensus)))
display(len(hans_28_cases - set(pathxl)))

In [None]:
# Test for all XMLs in the duo 28 folders whether they are the same (just as raw files)
from itertools import zip_longest

duo_28_parent = os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready')
hans_28_dir = os.path.join(duo_28_parent, 'duo_28_hans')
sybren_28_dir = os.path.join(duo_28_parent, 'duo_28_sybren')

# File stems of the XMLs; splitext removes only the trailing '.xml', so
# f"{stem}.xml" always reconstructs the real file name.
hans_28_xmls = [os.path.splitext(f)[0] for f in os.listdir(hans_28_dir) if f.endswith('.xml')]
sybren_28_xmls = [os.path.splitext(f)[0] for f in os.listdir(sybren_28_dir) if f.endswith('.xml')]

# Report XMLs that exist in only one folder instead of failing on open() or
# silently ignoring them.
only_in_hans = sorted(set(hans_28_xmls) - set(sybren_28_xmls))
only_in_sybren = sorted(set(sybren_28_xmls) - set(hans_28_xmls))
if only_in_hans:
    print(f"XMLs only in duo_28_hans: {only_in_hans}")
if only_in_sybren:
    print(f"XMLs only in duo_28_sybren: {only_in_sybren}")

sybren_28_xml_names = set(sybren_28_xmls)
for xml in hans_28_xmls:
    if xml not in sybren_28_xml_names:
        continue  # already reported above
    with open(os.path.join(hans_28_dir, f"{xml}.xml"), 'r') as f:
        hans_28_xml = f.read()
    with open(os.path.join(sybren_28_dir, f"{xml}.xml"), 'r') as f:
        sybren_28_xml = f.read()
    if hans_28_xml != sybren_28_xml:
        print(f"XML {xml} is different")
        # Print the differences line by line. zip_longest (rather than zip)
        # keeps going past the end of the shorter file, so extra trailing
        # lines in either file are reported too.
        hans_lines = hans_28_xml.split('\n')
        sybren_lines = sybren_28_xml.split('\n')
        for line_nr, (h, s) in enumerate(zip_longest(hans_lines, sybren_lines)):
            if h != s:
                print(f"Line {line_nr}:")
                print(f"Hans: {'<no such line>' if h is None else h}")
                print(f"Sybren: {'<no such line>' if s is None else s}")
                print()

In [None]:
# DOESN'T WORK (libvips problem): the mrxs -> tiff conversion below is disabled and kept for reference only.
# # Add vips library to path (C:\Users\lverheijen\AppData\Local\miniconda3\envs\wsi\Lib\site-packages)
# import sys
# # sys.path.append(r'H:\libvips-8.15.2')
# sys.path.append(r"H:\vips-dev-8.15\bin")
# os.add_dll_directory(r"H:\vips-dev-8.15\bin")

# dll_name = 'libvips-42.dll'
# print(dll_name in os.listdir(r"H:\vips-dev-8.15\bin"))

# import pyvips

# # Function to convert mrxs to tiff
# def mrxs_to_tiff(mrxs_path, tiff_path):
#     slide = pyvips.Image.new_from_file(mrxs_path, level=0)
#     slide.tiffsave(tiff_path, tile=True, compression='lzw', pyramid=True)
#     print(f"Converted {mrxs_path} to {tiff_path}")

# # Convert all mrxs to tiff in pathxl folder
# pathxl_folder = os.path.join(LANS_BIOP_ROOT, 'final_test_set_(low_consensus_second_look)', 'not_ready', 'pathxl')
# for f in os.listdir(pathxl_folder):
#     if f.endswith('.mrxs'):
#         mrxs_to_tiff(os.path.join(pathxl_folder, f), os.path.join(pathxl_folder, f.replace('.mrxs','.tiff')))
# print('Done')

In [None]:
# Show, per folder, the "<duplicates> / <number of .tiff files>" string that the
# duplicate-search cell stored in folders_with_duplicates.
display(folders_with_duplicates)

In [None]:
# Number of slide names that were found in at least one other folder, followed
# by the full mapping of slide name -> folders containing it.
n_duplicated_slides = len(duplicates)
print(n_duplicated_slides)
display(duplicates)