# 1) We start by importing the modules

In [1]:
import os
import numpy as np
import pandas as pd
import prismtoolbox as ptb
from pathlib import Path
from tqdm import tqdm
import openslide
import cv2
import random
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mplpatches
from skimage.util import view_as_windows
import sys
sys.path.append('../')
from utils.utils import wpx_ratio, bpx_ratio

# 2) We define the location of raw images

In [None]:
# Folder holding the raw slide files; the extraction loops of this notebook list it with os.listdir.
raw_data_folder = "../data/raw/images"

# 3) We import csv data

In [None]:
# Slide-level labels; this notebook reads its `file_name` and `presence_of_lesion` columns.
lesions_df = pd.read_csv("../data/raw/presence_of_lesion.csv")
lesions_df

In [None]:
# Bounding-box annotations of the training slides; this notebook reads its `filename`, `x1`, `x2`, `y1` and `y2` columns.
train_df = pd.read_csv("../data/raw/train.csv")
train_df

# 4) We identify slides with no lesions and no bounding boxes

In [None]:
# Reduce every annotated file name of train.csv to "<first '_' field>.tif", the
# form compared against `file_name` in presence_of_lesion.csv.
train_slides = [f"{file.split('_')[0]}.tif" for file in train_df.filename]
# Each condition is built separately because `&` binds tighter than `==`:
# written as `a == 0 & ~b`, Python evaluates `a == (0 & ~b)` and the
# bounding-box filter is silently ignored.
no_lesion = lesions_df.presence_of_lesion == 0
no_bounding_box = ~lesions_df.file_name.isin(train_slides)
negative_slides = lesions_df[no_lesion & no_bounding_box].file_name.values
negative_slides

### Due to some problems in our code, not all of these slides were considered, and some slides with lesions were treated as negatives. To reproduce the results of the submission exactly, please use the following slides as negatives:

In [None]:
# Replace the computed list by the slides actually used as negatives for the
# submission: each listed file name is reduced to "<first '_' field>.tif" and
# duplicates are removed.
df = pd.read_csv("../data/processed/slides_considered_as_negative.csv")
negative_slides = np.unique([name.split("_")[0] + ".tif" for name in df.filename])
negative_slides

# 5) We extract patches from training data with no lesions that will be labelled as negatives

In [None]:
# Output locations for the negative training slides.
directory_contours_train = "../data/processed/train_negatives/contours"
directory_visualize_train = "../data/processed/train_negatives/contoured_images"
directory_patches_train = "../data/processed/train_negatives/patches"
directory_patches_as_jpg_train = "../data/processed/train_negatives/patches_as_jpg_1000"
directory_stitch_train = "../data/processed/train_negatives/stitch_images"

# Create every output folder (and missing parents); existing folders are left untouched.
for output_directory in (
    directory_contours_train,
    directory_visualize_train,
    directory_patches_train,
    directory_patches_as_jpg_train,
    directory_stitch_train,
):
    Path(output_directory).mkdir(parents=True, exist_ok=True)

### To speed up processing, we randomly selected 1000 patches to save from each slide. The slides were contoured and the patches were curated with a high threshold to discard as many artifacts as possible.

In [None]:
# Seeded NumPy generator; it draws the patch indices that are saved for each negative slide.
rng = np.random.default_rng(1)

In [None]:
# For every raw slide listed in `negative_slides`: detect the tissue, save the
# contours and a contoured overview, tile the tissue into 512x512 level-0
# patches, save a random draw of 1000 of them as JPEG, then save a stitched overview.
for file in tqdm(os.listdir(raw_data_folder)):
    filename = file.partition(".tif")[0]
    stitch_path = os.path.join(directory_stitch_train, f"{filename}.jpg")
    # The stitched overview is the last file written for a slide, so an
    # existing one means the slide has already been processed.
    if os.path.exists(stitch_path):
        continue
    # negative_slides stores "<first '_' field>.tif" keys.
    if f"{filename.split('_')[0]}.tif" not in negative_slides:
        continue

    WSI_object = ptb.WSI(slide_path=os.path.join(raw_data_folder, file), engine="openslide")

    # Extract tissue contours
    params_detect_tissue = dict(seg_level=6, window_avg=30, window_eng=3, thresh=110, area_min=1.5e3)
    WSI_object.detect_tissue(**params_detect_tissue)
    WSI_object.save_tissue_contours(directory_contours_train)

    img = WSI_object.visualize(vis_level=6, number_contours=True)
    img.save(os.path.join(directory_visualize_train, f"{filename}.jpg"))

    params_patches = dict(
        patch_size=512,
        patch_level=0,
        step_size=512,
        mode="contours",
        contours_mode="four_pt_hard",
        rgb_threshs=(2, 180),
        percentages=(0.6, 0.7),
    )
    WSI_object.extract_patches(**params_patches)
    #WSI_object.save_patches(directory_patches_train)

    # NOTE(review): Generator.choice samples with replacement by default, so the
    # same index can be drawn more than once and 1000 indices are drawn even when
    # fewer coordinates exist - confirm this is intended before changing it, since
    # any change alters the draw used for the submission.
    selected_idx = rng.choice(len(WSI_object.coords), 1000)
    WSI_object.save_patches(os.path.join(directory_patches_as_jpg_train, file), selected_idx=selected_idx, file_format="jpg")

    img = WSI_object.stitch(vis_level=6)
    img.save(stitch_path)

# 6) We extract patches from all validation slides

In [None]:
# Output locations for the validation slides.
directory_contours_val = "../data/processed/val/contours"
# Fixed: the path was "../dataprocessed/..." (missing "/"), which placed the
# contoured overviews outside the ../data/processed tree used by every other output.
directory_visualize_val = "../data/processed/val/contoured_images"
directory_patches_val = "../data/processed/val/patches"
directory_patches_as_jpg_val = "../data/processed/val/patches_as_jpg_full"
directory_stitch_val = "../data/processed/val/stitch_images"

# Create every output folder (and missing parents); existing folders are left untouched.
Path(directory_contours_val).mkdir(parents=True, exist_ok=True)
Path(directory_visualize_val).mkdir(parents=True, exist_ok=True)
Path(directory_patches_val).mkdir(parents=True, exist_ok=True)
Path(directory_patches_as_jpg_val).mkdir(parents=True, exist_ok=True)
Path(directory_stitch_val).mkdir(parents=True, exist_ok=True)

In [None]:
# Validation annotations; the extraction loop of this section uses its `filename` column to select the slides to process.
val_df = pd.read_csv("../data/raw/validation.csv")
val_df

### The thresholds were changed to ensure that we retrieved as much of the tissue as possible with minimal artifacts

In [None]:
# For every raw slide listed in validation.csv: detect the tissue, save the
# contours and a contoured overview, tile the tissue into 512x512 level-0
# patches, save all of them (default format and JPEG), then save a stitched overview.
for file in tqdm(os.listdir(raw_data_folder)):
    # Fixed: the skip test used `filename`, a variable left over from the
    # negative-slide loop, so it checked the wrong file. It now tests the
    # path that this loop actually writes as its final step.
    stitch_path = os.path.join(directory_stitch_val, file.replace(".tif", ".jpg"))
    if file not in val_df.filename.values or os.path.exists(stitch_path):
        continue
    WSI_object = ptb.WSI(slide_path=os.path.join(raw_data_folder, file), engine="openslide")
    # Extract tissue contours
    params_detect_tissue = {"seg_level": 5, "window_avg": 30, "window_eng": 5, "thresh": 180, "area_min": 6e3}

    WSI_object.detect_tissue(**params_detect_tissue)

    WSI_object.save_tissue_contours(directory_contours_val)

    img = WSI_object.visualize(vis_level=6, number_contours=True)

    img.save(os.path.join(directory_visualize_val, file.replace(".tif", ".jpg")))

    params_patches = {"patch_size": 512, "patch_level": 0, "step_size": 512, "mode": "contours", "contours_mode": "four_pt_hard",
                      "rgb_threshs":(2, 240), "percentages": (0.6, 0.7)}

    WSI_object.extract_patches(**params_patches)

    WSI_object.save_patches(directory_patches_val)

    WSI_object.save_patches(os.path.join(directory_patches_as_jpg_val, file), file_format="jpg")

    img = WSI_object.stitch(vis_level=6)

    # Fixed: `directory_stitch` is not defined anywhere; the validation
    # stitch folder is `directory_stitch_val`.
    img.save(stitch_path)

# 7) We extract positive patches

In [None]:
# Thresholds on the fraction of white / black pixels in a patch read from the
# slide: a patch is kept only when both ratios are below their threshold.
white_perc = 0.9
black_perc = 0.1
vis_scale = 0.008
patch_size = (512, 512)

# Seed both the standard-library and the NumPy global generators.
seed = 2024
random.seed(seed)
np.random.seed(seed)

In [None]:
# Load the annotation tables, derive patient identifiers, remove the bounding
# boxes flagged as wrong and report how train and validation patients overlap.
df_train = pd.read_csv('../data/raw/train.csv')
df_valid = pd.read_csv('../data/raw/validation.csv')
df_lesions = pd.read_csv('../data/raw/presence_of_lesion.csv')
print(df_train.shape, df_valid.shape, df_lesions.shape)

# The patient identifier is the part of the file name before the first "_"
# (before the first "." for presence_of_lesion.csv).
df_train['patient'] = df_train['filename'].apply(lambda x: x.split('_')[0])
df_train['width']=df_train['x2']-df_train['x1']
df_train['height']=df_train['y2']-df_train['y1']
df_valid['patient'] = df_valid['filename'].apply(lambda x: x.split('_')[0])
df_lesions['patient'] = df_lesions['file_name'].apply(lambda x: x.split('.')[0])
# Patient count before any filtering; used later as the denominator of the split ratios.
all_patients = len(df_train["patient"].unique())
# Pre-allocated with one (NaN) row per directory entry; unused rows are removed by dropna below.
df_wrong = pd.DataFrame(columns=["patient", "filename", "x1", "x2", "y1", "y2"], index=range(len(os.listdir('../data/raw/Boundig Box IDs'))))

# Each '.jpeg' name is expected to split on "_" into exactly six fields:
# patient, slide suffix, x1, x2, y1 and "y2.<extension>".
for i,e in enumerate([e for e in os.listdir('../data/raw/Boundig Box IDs') if '.jpeg' in e]):
    try:
        p, l, x1,x2, y1, y2 = e.split('_')
        df_wrong.iloc[i] = [p, p+'_'+l+'.tif', x1,x2, y1, y2.split('.')[0]]
    except Exception as exp: 
        # Names that do not match the expected pattern are reported; their row stays NaN and is dropped below.
        print(e, e.split('_'), exp)
df_wrong = df_wrong.dropna()
display(df_wrong)
# Persisted so that the cross-validation section can reload the same list.
df_wrong.to_csv('../data/raw/wrong_bbox.csv', index=False)
print('raw data:')
print("train patients", len(df_train['patient'].unique()), "train slides",len(df_train['filename'].unique()),)
print("valid patients", len(df_valid['patient'].unique()), "valid slides", len(df_valid['filename'].unique()))

# Collect the index labels of the training boxes flagged as wrong.
indices = []
for p in df_wrong.filename.unique():
    for x1,x2, y1, y2 in df_wrong.loc[df_wrong["filename"]==p][["x1", "x2", "y1", "y2"]].values:
        # A box is matched on file name and x1 only; x2, y1 and y2 are not compared.
        selected_row = df_train.loc[(df_train["filename"]==p) & (df_train["x1"]==int(x1))]
        if len(selected_row) > 0:
            # .item() requires exactly one matching row and raises ValueError if several match.
            indices.append(selected_row.index.item())
df_train = df_train.drop(indices)

print("\neliminate wrong Bboxes:")
print("train patients", len(df_train['patient'].unique()), "train slides",len(df_train['filename'].unique()),)
print("valid patients", len(df_valid['patient'].unique()), "valid slides", len(df_valid['filename'].unique()))
# Patients present in both train.csv and validation.csv.
intersection_tr_vl = set(df_train['patient'].unique())&set(df_valid['patient'].unique())

print("\nintersection between train and validation:", len(intersection_tr_vl))
print("train no label in presence_of_lesion:",set(df_train['patient'].unique())-set(df_lesions['patient'].unique()))
print("valid no label in presence_of_lesion:",set(df_valid['patient'].unique())-set(df_lesions['patient'].unique()))

In [None]:
# Attach the slide-level lesion label to every bounding box, then move the
# patients that also appear in the validation set into a separate test frame.
train_cols = ['patient','filename', 'x1', 'x2', 'y1', 'y2', 'max_x', 'max_y', "width", "height", 'presence_of_lesion']
# Default (inner) merge: boxes whose patient has no row in df_lesions are dropped.
df_train = df_train.merge(df_lesions, on=["patient"])[train_cols]
shared_with_validation = df_train['patient'].isin(intersection_tr_vl)
df_test = df_train.loc[shared_with_validation]
df_train = df_train.loc[~shared_with_validation]

print('\nafter merging with presence_of_lesion:')
n_train_patients = len(df_train['patient'].unique())
n_test_patients = len(df_test['patient'].unique())
print("train patients", n_train_patients, n_train_patients/all_patients, "train slides",len(df_train['filename'].unique()),)
print("test patients", n_test_patients, n_test_patients/all_patients,  "test slides",len(df_test['filename'].unique()),)

# Slide inventories: everything on disk versus the slides referenced by each split.
all_slides = os.listdir("../data/raw/images")
train_slides, test_slides, valid_slides = (
    frame['filename'].unique().tolist() for frame in (df_train, df_test, df_valid)
)
annotated_slides = train_slides + test_slides + valid_slides

for caption, slide_list in (
    ('\ndata slides:', all_slides),
    ('train slides:', train_slides),
    ('test slides:', test_slides),
    ('valid slides:', valid_slides),
    ('annotated slides:', annotated_slides),
):
    print(caption, len(slide_list))

In [None]:
# Pool the train and test bounding boxes and keep only the rows whose slide-level label is 1.
data = pd.concat([df_train, df_test], axis=0)
data = data.loc[data['presence_of_lesion']==1]
data

In [None]:
# Cut every annotated bounding box of `data` into 512x512 positive patches.
# Boxes at least one patch wide and high are stretched to the next multiple of
# the patch size and tiled; smaller boxes are resized to a single patch.
# NOTE(review): patches are written to "positive_patches_bis", whereas the
# dataframe-building cell of section 8 reads "positive_patches" - confirm which
# folder is the intended one.
Path("../data/processed/positive_patches_bis").mkdir(parents=True, exist_ok=True)
non_valid_slides = []  # slides for which tiling or saving raised
removed_patches = []   # patches rejected by the white/black pixel thresholds
bw_vals = []           # white-pixel ratio of each rejected patch
s=0                    # number of patches written
for slide_name in tqdm(data['filename'].unique()):
    slide_stem = slide_name.split('.')[0]
    directory_positive_patches = "../data/processed/positive_patches_bis"+'/'+slide_stem
    Path(directory_positive_patches).mkdir(parents=True, exist_ok=True)
    # Filter the boxes of this slide once instead of once per coordinate.
    slide_boxes = data.loc[data['filename']==slide_name, ['x1', 'x2', 'y1', 'y2']]

    # The context manager closes the slide handle even if reading fails;
    # the handle was previously left open for every slide.
    with openslide.OpenSlide('../data/raw/images/'+slide_name) as slide:
        for x1, x2, y1, y2 in slide_boxes.itertuples(index=False, name=None):
            image = slide.read_region((x1,y1), level=0, size=(x2-x1, y2-y1)).convert('RGB')
            image = np.array(image)
            try:
                if image.shape[0]>=patch_size[0] and image.shape[1]>=patch_size[1]:
                    image_height, image_width, _ = image.shape
                    # Round each side up to the next multiple of the patch size so the box tiles exactly.
                    new_height = ((image_height + patch_size[0] - 1) // patch_size[0]) * patch_size[0]
                    new_width = ((image_width + patch_size[1] - 1) // patch_size[1]) * patch_size[1]
                    image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LANCZOS4)
                    patches = view_as_windows(image, (patch_size[0], patch_size[1], 3), step=patch_size[1])
                    # `row`/`col` replace the former `i`/`j`, which shadowed the outer loop index.
                    for row in range(patches.shape[0]):
                        for col in range(patches.shape[1]):
                            patch = patches[row, col, 0]
                            w_val = wpx_ratio(patch)
                            b_val = bpx_ratio(patch)
                            if w_val<white_perc  and b_val<black_perc:
                                plt.imsave(f'{directory_positive_patches}/{slide_stem}_x1_{x1}_x2_{x2}_y1_{y1}_y2_{y2}_i_{patch_size[0]*row}_j_{patch_size[1]*col}.jpg', arr=patch)
                                s+=1
                            else:
                                removed_patches.append(patch)
                                bw_vals.append(w_val)
                else:
                    image = cv2.resize(image, (patch_size[0], patch_size[1]), interpolation=cv2.INTER_LANCZOS4)
                    plt.imsave(f'{directory_positive_patches}/{slide_stem}_x1_{x1}_x2_{x2}_y1_{y1}_y2_{y2}_resized.jpg', arr=image)
                    s+=1
            except Exception as e:
                # Best effort: record the slide and continue with the next box.
                non_valid_slides.append(slide_name)
                print(slide_name, e)

# 8) Create training dataframes for classification model without cross-validation

In [None]:
# Build the table of positive patches from the patch file names.
# NOTE(review): this cell reads "positive_patches", whereas the extraction cell
# of section 7 writes to "positive_patches_bis" - confirm which folder is the
# intended one.
cols = ["patient", "filename", "x1", "y1", "x2", "y2", "i", "j", "image", "label", "path"]
positive_root = '../data/processed/positive_patches'
rows = []
for slide in tqdm(os.listdir(positive_root)):
    path = f'{positive_root}/{slide}/'
    for patch in os.listdir(path):
        if '.ipynb' in patch:
            continue
        l = patch.split('_')
        # Patch names carry alternating "<label>_<value>" fields after the two
        # slide fields ("..._x1_<v>_x2_<v>_y1_<v>_y2_<v>_..."). Values are looked
        # up by label: the former positional unpacking (l[3], l[5], l[7], l[9]
        # assigned to x1, y1, x2, y2) stored x2 in column "y1" and y1 in column "x2".
        fields = dict(zip(l[2::2], l[3::2]))
        if "resized" in patch:
            # Boxes smaller than one patch were saved whole, without tile offsets.
            i, j = -1, -1
        else:
            i, j = fields['i'], fields['j'].split('.')[0]
        rows.append([l[0], l[0]+"_"+l[1]+'.tif', fields['x1'], fields['y1'], fields['x2'], fields['y2'], i, j, patch, 1, path])
# dtype=object keeps the values as parsed (strings, plus the integer label and
# -1 offsets); building from a list removes the former pre-allocation of 10**4 rows.
pos_df = pd.DataFrame(rows, columns=cols, dtype=object)
pos_df

In [None]:
# Build the table of negative patches: at most `thresh` patches per slide
# folder, drawn with the seeded `random` module when a folder holds more.
thresh = 1000
cols = ["patient", "filename", "x1", "y1", "x2", "y2", "image", "label", "path"]
negatives_root = "../data/processed/train_negatives/patches_as_jpg_1000/"
# Pre-allocate `thresh` rows per slide folder; rows left empty are removed by dropna below.
neg_df = pd.DataFrame(columns=cols, index=range(0, len(os.listdir(negatives_root))*thresh))

k=0
for slide in tqdm(os.listdir(negatives_root)):
    available = os.listdir(negatives_root+slide)
    liste = available if len(available)<thresh else random.sample(available, k=thresh)
    path = negatives_root+slide+'/'
    for patch in liste:
        if '.ipynb' in patch:
            continue
        l = patch.split('_')
        # Third and fourth "_" fields are the patch origin; the opposite corner is 512 pixels further.
        x1, y1 = l[2], l[3].split('.')[0]
        neg_df.loc[k] = [l[0], l[0]+"_"+l[1]+'.tif', x1, y1, int(x1)+512, int(y1)+512, patch, 0, path]
        k+=1
neg_df = neg_df.dropna()

In [None]:
# Reload the validation annotations and derive the patient identifier (part of the file name before the first "_").
df_valid = pd.read_csv('../data/raw/validation.csv')
df_valid['patient'] = df_valid['filename'].apply(lambda x: x.split('_')[0])

In [None]:
# Sanity check: evaluates to True when the validation set contains exactly 131 distinct patients.
len(df_valid['patient'].unique())==131

In [None]:
# Stack the positive and negative patch tables, then split them by slide file name:
# patches of slides listed in df_train go to training, those of slides listed in df_test to testing.
# Patches whose slide appears in neither frame are left out of both sets.
final_df = pd.concat([pos_df, neg_df], ignore_index=True)
train_patches = final_df.loc[final_df['filename'].isin(df_train['filename'])]
test_patches = final_df.loc[final_df['filename'].isin(df_test['filename'])]

In [None]:
# Patients present in both patch sets; an empty set means the split has no patient overlap.
print(set(train_patches['patient'].values) & set(test_patches['patient'].values))

In [None]:
# Slides present in both patch sets; an empty set means the split has no slide overlap.
print(set(train_patches['filename'].values) & set(test_patches['filename'].values))

In [None]:
# Persist both patch tables; the cross-validation section reloads these two files.
train_patches.to_csv('../data/processed/dataframe_training_selected.csv', index=False)
test_patches.to_csv('../data/processed/dataframe_testing_selected.csv', index=False)

In [None]:
# Count the validation patches on disk; the total sizes the table allocated in the next cell.
k=0
for slide in tqdm(df_valid['filename'].unique()):
    try:
        liste = os.listdir("../data/processed/val/patches_as_jpg_full/"+slide)
        path = "../data/processed/val/patches_as_jpg_full/"+slide+'/'
        # Entries whose name contains '.ipynb' are not counted.
        k += sum(1 for entry in liste if '.ipynb' not in entry)
    except Exception as e:
        # A slide whose patch folder cannot be listed is reported and skipped.
        print(e, slide)

In [None]:
# Build the table of validation patches (patient, slide, image name, folder).
cols = ["patient", "filename", "image", "path"]
# `k` is the patch count computed by the previous cell; rows left empty are removed by dropna below.
val_df = pd.DataFrame(columns=cols, index=range(k+1))

k=0
for slide in tqdm(df_valid['filename'].unique()):
    try:
        liste = os.listdir("../data/processed/val/patches_as_jpg_full/"+slide)
        path = "../data/processed/val/patches_as_jpg_full/"+slide+'/'
        for patch in liste:
            if '.ipynb' in patch:
                continue
            l = patch.split('_')
            # First "_" field is the patient; the first two fields name the slide.
            val_df.loc[k] = [l[0], l[0]+"_"+l[1]+'.tif', patch, path]
            k+=1
    except Exception as e:
        # A slide whose folder cannot be listed or parsed is reported and skipped.
        print(e, slide)
val_df = val_df.dropna()

In [None]:
# Persist the validation patch table and display it.
val_df.to_csv('../data/processed/dataframe_validation_selected.csv', index=False)
val_df

# 9) 5-Folds Cross-validation

In [3]:
from sklearn.model_selection import StratifiedKFold

# Re-seed both global generators; `seed` is also the default random_state of create_stratified_folds.
seed=2024
random.seed(seed)
np.random.seed(seed)

In [4]:
n_splits = 5
def create_stratified_folds(df, target_column, n_splits, seed=seed):
    """Assign every row of ``df`` to one of ``n_splits`` stratified folds.

    Args:
        df: DataFrame to split; it is modified in place.
        target_column: Name of the column whose class proportions are
            preserved in each fold.
        n_splits: Number of folds.
        seed: ``random_state`` passed to ``StratifiedKFold``.

    Returns:
        The same ``df`` object with an added integer ``fold`` column holding,
        for each row, the index of the fold in which the row is held out.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    df['fold'] = -1
    fold_column = df.columns.get_loc('fold')
    for fold, (_, val_idx) in enumerate(skf.split(df, df[target_column])):
        # skf.split yields row positions, so they are written with .iloc.
        # The former .loc treated them as index labels, which is only
        # equivalent when df has a default 0..n-1 index.
        df.iloc[val_idx, fold_column] = fold
    return df

In [7]:
# Rebuild the cleaned bounding-box table, assign each box to one of the
# stratified folds and write one train/test patch table per fold.
train_patches = pd.read_csv('../data/processed/dataframe_training_selected.csv')
test_patches = pd.read_csv('../data/processed/dataframe_testing_selected.csv')
# All selected patches, shuffled (the shuffle draws from NumPy's global generator).
final_df = pd.concat([train_patches, test_patches], ignore_index=True).sample(frac=1)

df_train = pd.read_csv('../data/raw/train.csv')
df_lesions = pd.read_csv('../data/raw/presence_of_lesion.csv')
print(df_train.shape, df_lesions.shape)

# The patient identifier is the part of the file name before the first "_"
# (before the first "." for presence_of_lesion.csv).
df_train['patient'] = df_train['filename'].apply(lambda x: x.split('_')[0])
df_train['width']=df_train['x2']-df_train['x1']
df_train['height']=df_train['y2']-df_train['y1']

df_lesions['patient'] = df_lesions['file_name'].apply(lambda x: x.split('.')[0])
all_patients = len(df_train["patient"].unique())
df_wrong = pd.read_csv('../data/raw/wrong_bbox.csv')
print('raw data:')
print("train patients", len(df_train['patient'].unique()), "train slides",len(df_train['filename'].unique()),)

# Collect the index labels of the training boxes flagged as wrong (matched on file name and x1).
indices = []
for p in df_wrong.filename.unique():
    for x1,x2, y1, y2 in df_wrong.loc[df_wrong["filename"]==p][["x1", "x2", "y1", "y2"]].values:
        selected_row = df_train.loc[(df_train["filename"]==p) & (df_train["x1"]==int(x1))]
        # Guard added for consistency with the same loop in section 7: a flagged
        # box with no match in train.csv is skipped instead of making
        # .index.item() raise ValueError on an empty selection.
        if len(selected_row) > 0:
            indices.append(selected_row.index.item())
df_train = df_train.drop(indices)

print("\neliminate wrong Bboxes:")
print("train patients", len(df_train['patient'].unique()), "train slides",len(df_train['filename'].unique()),)
print("train no label in presence_of_lesion:",set(df_train['patient'].unique())-set(df_lesions['patient'].unique()))

train_cols = ['patient','filename', 'x1', 'x2', 'y1', 'y2', 'max_x', 'max_y', "width", "height", 'presence_of_lesion']
# Default (inner) merge: boxes whose patient has no row in df_lesions are dropped.
df_train = pd.merge(df_train, df_lesions, on=["patient"])[train_cols]


# NOTE(review): folds are assigned per bounding-box row while patches are
# selected per slide file name, so a slide with several boxes can land in both
# the train and the test set of a fold. The recorded output (train + test
# patient counts exceeding the 185 patients) is consistent with such overlap -
# confirm whether patient-level grouping was intended.
df_with_folds = create_stratified_folds(df_train, n_splits=n_splits, target_column='presence_of_lesion')
Path("../data/processed/crossval_sets").mkdir(parents=True, exist_ok=True)
for fold in range(n_splits):
    print('fold:', fold)
    df_train_cv = df_with_folds[df_with_folds['fold'] != fold]
    df_test_cv = df_with_folds[df_with_folds['fold'] == fold]
    train_slides = df_train_cv['filename'].unique().tolist()
    test_slides = df_test_cv['filename'].unique().tolist()

    print("train patients", len(df_train_cv['patient'].unique()),', train slides:', len(train_slides))
    print("test patients", len(df_test_cv['patient'].unique()),', test slides:', len(test_slides))
    # A patch belongs to a split when its slide file name appears among that split's boxes.
    train_patches = final_df.loc[final_df['filename'].isin(df_train_cv['filename'])]
    test_patches = final_df.loc[final_df['filename'].isin(df_test_cv['filename'])]
    print('train patches:', len(train_patches), 'test patches:', len(test_patches))
    train_patches.to_csv('../data/processed/crossval_sets/dataframe_training_Fold' + str(fold) + '.csv', index=False)
    test_patches.to_csv('../data/processed/crossval_sets/dataframe_testing_Fold' + str(fold) + '.csv', index=False)
    print('\n')
    print('\n')

(927, 7) (2318, 2)
raw data:
train patients 189 train slides 251

eliminate wrong Bboxes:
train patients 185 train slides 247
train no label in presence_of_lesion: {'j2SwPa04fq', 'cXmgCgA2fB'}
fold: 0
train patients 170 , train slides: 230
test patients 103 , test slides: 121
train patches: 69779 test patches: 39643


fold: 1
train patients 178 , train slides: 235
test patients 97 , test slides: 111
train patches: 72300 test patches: 36389


fold: 2
train patients 168 , train slides: 225
test patients 99 , test slides: 113
train patches: 66908 test patches: 35667


fold: 3
train patients 179 , train slides: 234
test patients 91 , test slides: 106
train patches: 70413 test patches: 35270


fold: 4
train patients 172 , train slides: 226
test patients 96 , test slides: 120
train patches: 69624 test patches: 42513


