In [None]:
/media/microcrispr8/DATA 1/oscc-tcia

In [None]:
import openslide
import os
import numpy as np
import pandas as pd
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk
from scipy.ndimage import binary_fill_holes
import h5py
import cv2
import shutil

# Function to open the slide
def open_slide(filename):
    """Open a whole-slide image with OpenSlide.

    Returns the ``openslide.OpenSlide`` handle on success. If the file is
    missing or OpenSlide rejects it, the error is printed and ``None`` is
    returned instead of raising.
    """
    handled_errors = (
        openslide.OpenSlideError,
        openslide.OpenSlideUnsupportedFormatError,
        FileNotFoundError,
    )
    try:
        return openslide.OpenSlide(filename)
    except handled_errors as e:
        print(f"Error opening slide: {e}")
    return None

# Function to get the zoom level with the highest resolution
def get_highest_resolution_level(slide):
    """Return the index of the pyramid level with the largest pixel area.

    Every level in ``range(slide.level_count)`` is inspected through
    ``slide.level_dimensions``. Ties go to the lowest level index, and 0 is
    returned when there are no levels or no level has a positive area.
    """
    areas = []
    for level in range(slide.level_count):
        width, height = slide.level_dimensions[level]
        areas.append(width * height)

    if not areas:
        return 0

    largest = max(areas)
    # list.index() yields the first occurrence, i.e. the lowest level on ties.
    return areas.index(largest) if largest > 0 else 0

# Function to determine if a tile should be kept based on tissue content
def keep_tile(tile, tissue_threshold):
    """Decide whether a tile is kept, using an edge-based coverage estimate.

    Canny edges are computed per channel and OR-ed into one mask. The mask
    is closed and then dilated with a radius-10 disk, and its holes are
    filled. The tile is kept when the mean of that boolean mask (the covered
    fraction of the tile) is at least ``tissue_threshold``.
    """
    edge_mask = np.zeros(tile.shape[:2], dtype=bool)
    # Move the channel axis to the front so each iteration yields one 2-D plane.
    for plane in np.moveaxis(tile, 2, 0):
        edge_mask = edge_mask | canny(plane)

    footprint = disk(10)
    covered = binary_fill_holes(
        binary_dilation(binary_closing(edge_mask, footprint), footprint)
    )
    return covered.mean() >= tissue_threshold

# Function to save coordinates and tile data to an HDF5 file
def save_coordinates_and_tile(coords_dataset, tiles_dataset, col, row, tile):
    """Append one tile and its grid position to the two resizable datasets.

    Both datasets are grown by one entry along axis 0; the new last entry of
    ``coords_dataset`` receives ``[col, row]`` and the new last entry of
    ``tiles_dataset`` receives the tile pixels.
    """
    # Grow both datasets first, then fill the freshly added last slots.
    grown_coords_len = coords_dataset.shape[0] + 1
    coords_dataset.resize(grown_coords_len, axis=0)
    grown_tiles_len = tiles_dataset.shape[0] + 1
    tiles_dataset.resize(grown_tiles_len, axis=0)

    grid_position = [col, row]
    coords_dataset[-1] = grid_position
    tiles_dataset[-1] = tile

# Function to save the tile image as a PNG
def save_tile(tile, filename, col, row, slide_folder):
    """Write one tile as ``<slide>_tile_<row>_<col>.png`` in ``slide_folder``.

    ``<slide>`` is the basename of ``filename`` with every '.svs' and '.ndpi'
    substring removed. The tile is converted from RGB to BGR channel order
    before it is handed to ``cv2.imwrite``.
    """
    base_name = os.path.basename(filename)
    for extension in ('.svs', '.ndpi'):
        base_name = base_name.replace(extension, '')

    save_path = os.path.join(slide_folder, f"{base_name}_tile_{row}_{col}.png")
    print(f"Saving tile: {save_path}")

    bgr_tile = cv2.cvtColor(tile, cv2.COLOR_RGB2BGR)
    cv2.imwrite(save_path, bgr_tile)

# Function to process the slide and generate tiles
def process_slide(filename, tile_size, tissue_threshold, output_base_directory, relative_path, slide_info_csv):
    """Tile one slide and store every tile that passes the tissue filter.

    For each kept tile a PNG is written under
    ``<output_base_directory>/patches256/<relative_path>/<slide name>/`` and
    its grid coordinates and pixels are appended to
    ``<output_base_directory>/h5/<relative_path>/<slide name>.h5``. After the
    slide is finished, one summary row is appended to ``slide_info_csv``
    (no header is written here; the caller creates the file with its header).

    A slide whose HDF5 file already exists is skipped. The HDF5 file is
    written under a temporary ``.part`` name and renamed only once the whole
    slide has been tiled, so a failed or interrupted run is not mistaken for
    a completed slide on the next run. Errors are printed, not raised.

    Args:
        filename: Path of the slide file to process.
        tile_size: Edge length, in pixels, of the square tiles.
        tissue_threshold: Minimum coverage passed to ``keep_tile``.
        output_base_directory: Root folder for the ``h5`` and ``patches256`` trees.
        relative_path: Sub-path (relative to the input root) mirrored in the output.
        slide_info_csv: CSV file that receives one summary row per slide.
    """
    slide = None
    partial_path = None  # set while an unfinished HDF5 file may exist on disk
    try:
        slide = open_slide(filename)
        print("Processing slide :", filename)
        if slide is None:
            print(f"Failed to open slide: {filename}")
            return

        slide_name = os.path.splitext(os.path.basename(filename))[0]

        h5_save_folder = os.path.join(output_base_directory, 'h5', relative_path)
        patches_save_folder = os.path.join(output_base_directory, 'patches256', relative_path, slide_name)

        os.makedirs(h5_save_folder, exist_ok=True)
        os.makedirs(patches_save_folder, exist_ok=True)

        save_path = os.path.join(h5_save_folder, f"{slide_name}.h5")

        if os.path.exists(save_path):
            print(f"HDF5 file already exists: {save_path}, skipping this slide.")
            return

        # Get the highest resolution level
        highest_res_level = get_highest_resolution_level(slide)
        wsi_width, wsi_height = slide.level_dimensions[highest_res_level]
        # read_region() takes its location in level-0 coordinates, so tile
        # origins computed at the chosen level are scaled by its downsample.
        downsample = slide.level_downsamples[highest_res_level]

        print(f"Processing zoom level {highest_res_level} with dimensions: {wsi_width}x{wsi_height}")

        # Only full tiles are read; a partial strip at the right/bottom edge is dropped.
        cols, rows = wsi_width // tile_size, wsi_height // tile_size
        num_patches_saved = 0

        partial_path = save_path + ".part"
        with h5py.File(partial_path, 'w') as h5_file:
            coords_dataset = h5_file.create_dataset('tile_coordinates', (0, 2), maxshape=(None, 2), dtype='int32')
            tiles_dataset = h5_file.create_dataset('tile_data', (0, tile_size, tile_size, 3), maxshape=(None, tile_size, tile_size, 3), dtype='uint8')

            for col in range(cols):
                for row in range(rows):
                    location = (
                        int(col * tile_size * downsample),
                        int(row * tile_size * downsample),
                    )

                    # Read the region at the highest resolution level
                    tile_image = slide.read_region(location, highest_res_level, (tile_size, tile_size)).convert("RGB")
                    tile = np.array(tile_image)

                    if keep_tile(tile, tissue_threshold):
                        save_coordinates_and_tile(coords_dataset, tiles_dataset, col, row, tile)
                        save_tile(tile, filename, col, row, patches_save_folder)
                        num_patches_saved += 1

        # Publish the finished file under its final name in one step.
        os.replace(partial_path, save_path)
        partial_path = None

        # Append slide info to the CSV
        slide_info = {
            'Slide Name': slide_name,
            'WSI Shape': f"{wsi_width}x{wsi_height}",
            'Number of Patches': num_patches_saved,
            'Zoom Level Used': highest_res_level
        }
        df_slide_info = pd.DataFrame([slide_info])
        df_slide_info.to_csv(slide_info_csv, mode='a', header=False, index=False)
        print(f"Slide {slide_name}: WSI dimensions: {wsi_width}x{wsi_height}, Total tiles saved: {num_patches_saved}, Zoom level: {highest_res_level}")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if slide is not None:
            slide.close()
        # Drop an unfinished HDF5 file so the slide is retried on the next run.
        if partial_path is not None and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError as cleanup_error:
                print(f"Could not remove partial file {partial_path}: {cleanup_error}")

if __name__ == "__main__":
    # Input and output directory paths
    input_directory = "/media/microcrispr8/DATA 2/researchxwsi/rename-data/OSCC-TCIA/"
    output_base_directory = "/media/microcrispr8/DATA 2/ai-oscar/wsi-preprocessing-output/patches256-original/oscc-tcia-160"
    # The slide-info CSV lives at its own absolute location, not under
    # output_base_directory (os.path.join() would discard any prefix placed
    # in front of an absolute path anyway).
    slide_info_csv = "/media/microcrispr8/DATA 1/DSMIL/Input_data/patches/slide_info_TCIA-oscc-160.csv"

    # Make sure the output root and the CSV's folder exist before writing.
    os.makedirs(output_base_directory, exist_ok=True)
    os.makedirs(os.path.dirname(slide_info_csv), exist_ok=True)

    # Create the CSV file with its header on the first run only;
    # process_slide() appends rows without a header.
    if not os.path.exists(slide_info_csv):
        with open(slide_info_csv, 'w') as f:
            f.write("Slide Name,WSI Shape,Number of Patches,Zoom Level Used\n")

    # Process all .svs and .ndpi files in the directory structure
    for root, dirs, files in os.walk(input_directory):
        for file in files:
            if file.endswith(('.svs', '.ndpi')):
                # Mirror the slide's sub-folder below the output root.
                relative_path = os.path.relpath(root, input_directory)
                process_slide(os.path.join(root, file), tile_size=256, tissue_threshold=0.75, output_base_directory=output_base_directory, relative_path=relative_path, slide_info_csv=slide_info_csv)

In [None]:
import openslide
import os
import numpy as np
import pandas as pd
from skimage.feature import canny
from skimage.morphology import binary_closing, binary_dilation, disk
from scipy.ndimage import binary_fill_holes
import h5py
import cv2
from multiprocessing import Pool, cpu_count

# Function to open a slide
def open_slide(filename):
    """Open a slide with OpenSlide; print the error and return None on failure."""
    slide = None
    try:
        slide = openslide.OpenSlide(filename)
    except (openslide.OpenSlideError, openslide.OpenSlideUnsupportedFormatError, FileNotFoundError) as e:
        print(f"Error opening slide: {e}")
    return slide

# Optimized function to determine if a tile should be kept
def keep_tile(tile, tissue_threshold):
    """Return whether the tile's edge-derived coverage reaches the threshold.

    Canny edge maps of all channels are merged with an element-wise maximum,
    closed and dilated with a radius-10 disk, and hole-filled. The mean of
    the resulting mask is compared against ``tissue_threshold``.
    """
    channel_edges = [canny(tile[:, :, channel]) for channel in range(tile.shape[2])]
    # Element-wise maximum over boolean maps acts as a logical OR.
    edges = np.maximum.reduce(channel_edges)

    footprint = disk(10)
    closed = binary_closing(edges, footprint)
    dilated = binary_dilation(closed, footprint)
    filled = binary_fill_holes(dilated)

    coverage = filled.mean()
    return coverage >= tissue_threshold

# Function to save the tile image as a PNG
def save_tile(tile, save_folder, col, row):
    """Write the tile to ``save_folder`` as ``tile_<row>_<col>.png``.

    The tile is converted from RGB to BGR channel order for OpenCV and
    written with PNG compression level 3.
    """
    bgr_tile = cv2.cvtColor(tile, cv2.COLOR_RGB2BGR)
    png_params = [cv2.IMWRITE_PNG_COMPRESSION, 3]
    filename = os.path.join(save_folder, f"tile_{row}_{col}.png")
    cv2.imwrite(filename, bgr_tile, png_params)

# Function to process a single tile (for multiprocessing)
def process_tile(args):
    """Read one tile, test it for tissue and save it if it qualifies.

    Intended as the worker function for ``Pool.map``; all inputs arrive in
    a single tuple.

    Args:
        args: ``(x, y, col, row, slide_path, tile_size, tissue_threshold,
            patches_save_folder)`` where ``(x, y)`` is the level-0 pixel
            origin of the tile and ``(col, row)`` its grid position.

    Returns:
        ``(col, row, tile)`` with ``tile`` as an RGB array when the tile was
        kept and written to disk, otherwise ``None`` (slide could not be
        opened, or the tile did not reach the threshold).
    """
    x, y, col, row, slide_path, tile_size, tissue_threshold, patches_save_folder = args

    # The slide is opened inside the worker because the path, not the handle,
    # is what gets sent to the worker process.
    slide = open_slide(slide_path)
    if slide is None:
        return None  # Skip if slide failed to open

    try:
        tile_image = slide.read_region((x, y), 0, (tile_size, tile_size)).convert("RGB")
    finally:
        # A handle is opened per tile; close it so workers do not accumulate
        # one open slide for every tile they process.
        slide.close()

    tile = np.array(tile_image)

    if not keep_tile(tile, tissue_threshold):
        return None  # Skip tiles that don't meet the threshold

    save_tile(tile, patches_save_folder, col, row)
    return (col, row, tile)

# Function to process the slide using multiprocessing
def process_slide(filename, tile_size, tissue_threshold, output_base_directory, relative_path, slide_info_csv):
    """Tile one slide at level 0 with a process pool and store the kept tiles.

    Every full ``tile_size`` x ``tile_size`` tile of level 0 is handed to
    ``process_tile`` in a worker pool. Kept tiles are written as PNGs by the
    workers under
    ``<output_base_directory>/patches256/<relative_path>/<slide name>/``;
    their grid coordinates and pixels are then stored in
    ``<output_base_directory>/h5/<relative_path>/<slide name>.h5`` and one
    summary row is appended to ``slide_info_csv``.

    A slide whose HDF5 file already exists is skipped. The HDF5 file is
    written under a temporary ``.part`` name and renamed when complete, so a
    failed write is not mistaken for a finished slide on the next run.
    Errors are printed, not raised.

    NOTE: all kept tiles are returned to the parent process and held in
    memory until the HDF5 file has been written.

    Args:
        filename: Path of the slide file to process.
        tile_size: Edge length, in pixels, of the square tiles.
        tissue_threshold: Minimum coverage passed on to ``keep_tile``.
        output_base_directory: Root folder for the ``h5`` and ``patches256`` trees.
        relative_path: Sub-path (relative to the input root) mirrored in the output.
        slide_info_csv: CSV file that receives one summary row per slide.
    """
    slide = None
    partial_path = None  # set while an unfinished HDF5 file may exist on disk
    try:
        slide = open_slide(filename)
        if slide is None:
            print(f"Failed to open slide: {filename}")
            return

        slide_name = os.path.splitext(os.path.basename(filename))[0]
        h5_save_folder = os.path.join(output_base_directory, 'h5', relative_path)
        patches_save_folder = os.path.join(output_base_directory, 'patches256', relative_path, slide_name)

        os.makedirs(h5_save_folder, exist_ok=True)
        os.makedirs(patches_save_folder, exist_ok=True)

        save_path = os.path.join(h5_save_folder, f"{slide_name}.h5")
        if os.path.exists(save_path):  # Fast check if HDF5 file exists
            print(f"HDF5 file already exists: {save_path}, skipping.")
            return

        wsi_width, wsi_height = slide.level_dimensions[0]  # Use highest resolution (Level 0)

        # The parent only needed the dimensions; workers open their own
        # handles, so release this one before the pool is started.
        slide.close()
        slide = None

        # Only full tiles are scheduled; a partial strip at the right/bottom edge is dropped.
        cols, rows = wsi_width // tile_size, wsi_height // tile_size
        num_workers = max(1, cpu_count() - 2)  # Leave two cores free, but use at least one

        # Create list of tasks for multiprocessing
        tile_tasks = [(col * tile_size, row * tile_size, col, row, filename, tile_size, tissue_threshold, patches_save_folder)
                      for col in range(cols) for row in range(rows)]

        # Use multiprocessing to speed up tile processing
        with Pool(num_workers) as pool:
            results = pool.map(process_tile, tile_tasks)

        # Workers return None for rejected tiles and (col, row, tile) for kept ones.
        tile_data = [res for res in results if res is not None]
        num_patches_saved = len(tile_data)

        # Save tile coordinates and patches in HDF5
        partial_path = save_path + ".part"
        with h5py.File(partial_path, 'w') as h5_file:
            coords_dataset = h5_file.create_dataset('tile_coordinates', (len(tile_data), 2), dtype='int32')
            tiles_dataset = h5_file.create_dataset('tile_data', (len(tile_data), tile_size, tile_size, 3), dtype='uint8')

            for idx, (col, row, tile) in enumerate(tile_data):
                coords_dataset[idx] = [col, row]
                tiles_dataset[idx] = tile

        # Publish the finished file under its final name in one step.
        os.replace(partial_path, save_path)
        partial_path = None

        # Save slide info to CSV
        df_slide_info = pd.DataFrame([{
            'Slide Name': slide_name,
            'WSI Shape': f"{wsi_width}x{wsi_height}",
            'Number of Patches': num_patches_saved,
            'Zoom Level Used': 0  # Highest resolution
        }])

        df_slide_info.to_csv(slide_info_csv, mode='a', header=not os.path.exists(slide_info_csv), index=False)
        print(f"Slide {slide_name}: {num_patches_saved} tiles saved.")

    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        if slide is not None:
            slide.close()
        # Drop an unfinished HDF5 file so the slide is retried on the next run.
        if partial_path is not None and os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError as cleanup_error:
                print(f"Could not remove partial file {partial_path}: {cleanup_error}")

# Main execution
if __name__ == "__main__":
    # Locations of earlier results, the slides to tile, and the outputs
    already_processed_dir = "/media/microcrispr8/DATA 1/oscc-tcia"
    new_wsi_dir = "/media/microcrispr8/DATA 2/researchxwsi/rename-data/OSCC-TCIA/"
    output_base_directory = "/media/microcrispr8/DATA 2/ai-oscar/wsi-preprocessing-output/patches256-original/oscc-tcia-160"
    slide_info_csv = "/media/microcrispr8/DATA 1/DSMIL/Input_data/slide_info_TCIA-oscc-160.csv"

    # The output root must exist before any slide is processed
    os.makedirs(output_base_directory, exist_ok=True)

    # Start the slide-info CSV with its header row on the first run
    if not os.path.exists(slide_info_csv):
        with open(slide_info_csv, 'w') as f:
            f.write("Slide Name,WSI Shape,Number of Patches,Zoom Level Used\n")

    # Names (extension stripped) of slides that already have a result or
    # slide file directly inside already_processed_dir
    processed_extensions = ('.svs', '.ndpi', '.h5')
    already_processed_wsi = set()
    for entry in os.listdir(already_processed_dir):
        if entry.endswith(processed_extensions):
            already_processed_wsi.add(os.path.splitext(entry)[0])

    # Walk the new slides and process those not seen before
    for root, dirs, files in os.walk(new_wsi_dir):
        for file in files:
            if not file.endswith(('.svs', '.ndpi')):
                continue

            slide_name = os.path.splitext(file)[0]
            if slide_name in already_processed_wsi:
                print(f"Skipping {file} (already processed in {already_processed_dir})")
                continue

            # Mirror the slide's sub-folder below the output root
            relative_path = os.path.relpath(root, new_wsi_dir)
            process_slide(
                os.path.join(root, file),
                tile_size=256,
                tissue_threshold=0.75,
                output_base_directory=output_base_directory,
                relative_path=relative_path,
                slide_info_csv=slide_info_csv,
            )


Slide o-234-06-210: 1435 tiles saved.
Skipping o-234-06-011.svs (already processed in /media/microcrispr8/DATA 1/oscc-tcia)
Slide o-234-06-161: 959 tiles saved.
Slide o-234-06-171: 3931 tiles saved.
Slide o-234-06-117: 2434 tiles saved.
Skipping o-234-06-087.svs (already processed in /media/microcrispr8/DATA 1/oscc-tcia)
Skipping o-234-06-058.svs (already processed in /media/microcrispr8/DATA 1/oscc-tcia)
Slide o-234-06-232: 7053 tiles saved.
Slide o-234-06-186: 10014 tiles saved.
Slide o-234-06-219: 679 tiles saved.
Skipping o-234-06-014.svs (already processed in /media/microcrispr8/DATA 1/oscc-tcia)
Slide o-234-06-153: 2766 tiles saved.
Skipping o-234-06-049.svs (already processed in /media/microcrispr8/DATA 1/oscc-tcia)
Slide o-234-06-170: 1313 tiles saved.
Slide o-234-06-248: 865 tiles saved.
Skipping o-234-06-013.svs (already processed in /media/microcrispr8/DATA 1/oscc-tcia)
Slide o-234-06-130: 937 tiles saved.
Slide o-234-06-238: 7832 tiles saved.
Slide o-234-06-253: 1724 tiles 