## Configuration

In [None]:
# 1. IMPORT LIBRARIES
import os
import cv2
import tifffile
import numpy as np
import random
import shutil
from tqdm import tqdm
from skimage.util import random_noise

print("‚úÖ All libraries imported successfully.")

# 2. CONFIGURE LOCAL GOOGLE DRIVE PATH
# Root of the locally synced Google Drive; both dataset paths are built from it.
drive_path = "G:/My Drive/"

# --- Dataset locations, each joined onto drive_path ---
source_dir, destination_dir = (
    os.path.join(drive_path, subfolder)
    for subfolder in (
        "Dataset/BioSR/Training Dataset",
        "Dataset/BioSR/Split Dataset",
    )
)

# Report whether the source dataset folder can be reached
if os.path.exists(source_dir):
    print(f"‚úÖ Successfully located dataset at: {source_dir}")
else:
    print(f"‚ö†Ô∏è ERROR: The specified source directory does not exist: {source_dir}")
    print("Please update the 'drive_path' variable with the correct path to your synced Google Drive folder.")

## Generate LR dataset

In [None]:
# 1. CONFIGURE YOUR LOCAL PATH
# Base folder that the dataset path below is joined onto.
drive_path = "C:/Users/milso/Documents/THESIS/"
# Folder tree that generate_lr_datasets_overwrite() scans for 'ground_truth'
# sub-folders; the LR folders are written next to each one it finds.
source_dir = os.path.join(drive_path, "Dataset/Split Dataset")

# 2. GENERATE LOW-RESOLUTION IMAGE DATASETS

def generate_lr_datasets_overwrite(num_frames=5, scale_factor=4):
    """
    Recreate the 'lr_bicubic' and 'lr_realistic' folders beside every
    'ground_truth' folder.

    Walks the module-level ``source_dir`` tree, collects every TIFF stored in
    a folder named 'ground_truth', and for each one writes ``num_frames``
    low-resolution PNGs into two sibling folders:

    * 'lr_bicubic'   - random sub-pixel shift, then bicubic downsampling.
    * 'lr_realistic' - the same shifted image, then Gaussian blur, Gaussian
      noise, Poisson noise and bicubic downsampling.

    Output files are named ``<ground-truth stem>_<NN>.png``. A file that
    already exists under that name is overwritten; any other file already
    present in the two folders is left untouched. Inputs that are not uint8
    are min-max rescaled to the 0-255 range before processing.

    A failure on one image is reported and counted, and the run continues
    with the next image.

    Args:
        num_frames: Number of LR variations written per ground-truth image.
        scale_factor: Integer downsampling factor (4 for x4, 2 for x2).

    Raises:
        ValueError: If ``num_frames`` or ``scale_factor`` is smaller than 1.
    """
    if num_frames < 1 or scale_factor < 1:
        raise ValueError("num_frames and scale_factor must both be >= 1")

    max_subpixel_shift = 0.5

    blur_kernel_sizes = [7, 9, 11, 13, 15, 17, 19, 21]
    blur_sigma_range = (0.2, 3.0)
    noise_gauss_var_range = (0.0001, 0.005)

    if not os.path.exists(source_dir):
        print(f"‚ùå ERROR: The source directory does not exist: {source_dir}")
        return

    # --- Find all ground truth images ---
    print(f"üîç Scanning for ground truth images in: {source_dir}")
    hr_files_to_process = []
    for root, dirs, _ in os.walk(source_dir):
        if 'ground_truth' in dirs:
            gt_path = os.path.join(root, 'ground_truth')
            hr_files_to_process.extend(
                os.path.join(gt_path, f)
                for f in os.listdir(gt_path)
                # lower() so upper-case extensions such as '.TIF' are not skipped
                if f.lower().endswith((".tif", ".tiff"))
            )

    total_gt_images = len(hr_files_to_process)
    if total_gt_images == 0:
        print("‚ö†Ô∏è  No 'ground_truth' images found. Please check your 'source_dir' path.")
        return

    print(f"‚úÖ Found {total_gt_images} ground truth images to process.")

    generated_count = 0
    failed_count = 0

    for hr_path in tqdm(hr_files_to_process, desc="Generating LR Frames"):
        try:
            hr_dir = os.path.dirname(hr_path)
            hr_filename = os.path.basename(hr_path)
            image_set_dir = os.path.dirname(hr_dir)
            base_filename = os.path.splitext(hr_filename)[0]

            # --- Output directories (siblings of 'ground_truth') ---
            bicubic_dst_dir = os.path.join(image_set_dir, 'lr_bicubic')
            realistic_dst_dir = os.path.join(image_set_dir, 'lr_realistic')
            os.makedirs(bicubic_dst_dir, exist_ok=True)
            os.makedirs(realistic_dst_dir, exist_ok=True)

            # --- Read and normalize HR image ---
            hr_image = tifffile.imread(hr_path)
            if hr_image.ndim != 2:
                # The pipeline below unpacks (rows, cols); fail with a clear
                # message instead of an opaque unpacking error.
                raise ValueError(
                    f"expected a single 2-D image, got shape {hr_image.shape}"
                )
            if hr_image.dtype != np.uint8:
                hr_image = cv2.normalize(hr_image, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)

            rows, cols = hr_image.shape

            # The LR size depends only on the HR size, so compute it once per image.
            lr_height = rows // scale_factor
            lr_width = cols // scale_factor

            # --- Generate multiple LR variations ---
            for i in range(num_frames):
                # Subpixel shift, shared by the bicubic and realistic variants
                shift_x = np.random.uniform(-max_subpixel_shift, max_subpixel_shift)
                shift_y = np.random.uniform(-max_subpixel_shift, max_subpixel_shift)
                M = np.float32([[1, 0, shift_x], [0, 1, shift_y]])
                shifted_hr = cv2.warpAffine(hr_image, M, (cols, rows), borderMode=cv2.BORDER_REFLECT_101)

                # --- Bicubic downsample ---
                bicubic_downsampled = cv2.resize(shifted_hr, (lr_width, lr_height), interpolation=cv2.INTER_CUBIC)

                # --- Realistic downsample (with blur + noise) ---
                kernel_size = random.choice(blur_kernel_sizes)
                sigma = random.uniform(*blur_sigma_range)
                gauss_var = random.uniform(*noise_gauss_var_range)

                blurred_hr = cv2.GaussianBlur(shifted_hr, (kernel_size, kernel_size), sigma)
                blurred_hr_float = blurred_hr.astype(np.float32) / 255.0
                noisy_gauss = random_noise(blurred_hr_float, mode='gaussian', var=gauss_var, clip=True)
                noisy_poisson = random_noise(noisy_gauss, mode='poisson', clip=True)
                # NOTE(review): astype() truncates the fractional part rather than rounding.
                degraded_hr = (noisy_poisson * 255).astype(np.uint8)
                realistic_downsampled = cv2.resize(degraded_hr, (lr_width, lr_height), interpolation=cv2.INTER_CUBIC)

                # --- Save both ---
                lr_filename = f"{base_filename}_{i+1:02d}.png"
                for dst_dir, lr_image in (
                    (bicubic_dst_dir, bicubic_downsampled),
                    (realistic_dst_dir, realistic_downsampled),
                ):
                    out_path = os.path.join(dst_dir, lr_filename)
                    # cv2.imwrite reports a failed write by returning False, so
                    # turn that into an error that is counted as a failure below.
                    if not cv2.imwrite(out_path, lr_image):
                        raise OSError(f"cv2.imwrite failed to write {out_path}")

            generated_count += 1

        except Exception as e:
            # Batch boundary: report the failing image and keep going.
            failed_count += 1
            tqdm.write(f"\n‚ö†Ô∏è Could not process {hr_path}. Error: {e}")

    print("\n=========================================")
    print("‚úÖ LR Dataset Generation Complete")
    print("-----------------------------------------")
    print(f"Total Ground Truth Images Found: {total_gt_images}")
    print(f"   Successfully Generated: {generated_count}")
    print(f"   Failed (Errors):        {failed_count}")
    print("=========================================")

# 3. RUN THE SCRIPT
# Guarded so that generation starts only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    generate_lr_datasets_overwrite()


🔍 Scanning for ground truth images in: C:/Users/milso/Documents/THESIS/Dataset/Split Dataset
✅ Found 1092 ground truth images to process.


Generating LR Frames: 100%|██████████| 1092/1092 [39:41<00:00,  2.18s/it]


✅ LR Dataset Generation Complete
-----------------------------------------
Total Ground Truth Images Found: 1092
   Successfully Generated: 1092
   Failed (Errors):        0





## Split Dataset

In [3]:
import os
import random
import shutil
from tqdm import tqdm
from collections import defaultdict
from pathlib import Path
import math

# ==============================================================================
# 1. CONFIGURE YOUR LOCAL DATASET PATHS
# ==============================================================================
# Base folder that the two dataset paths below are joined onto.
# Update it to match where the dataset lives on your machine.
drive_path = "C:/Users/milso/" 

# Path to the dataset you want to split (read by split_dataset())
source_dir = os.path.join(drive_path, "Documents/THESIS/Dataset/Training Dataset")

# Path where the new 'train', 'val', and 'test' folders will be created
destination_dir = os.path.join(drive_path, "Documents/THESIS/Dataset/Split Dataset")

# ==============================================================================
# 2. SCRIPT TO SPLIT THE DATASET (ROBUST STRATIFIED SPLIT)
# ==============================================================================
def _split_counts(n_total, val_split, test_split):
    """Return (n_train, n_val, n_test) for one specimen group of n_total image sets."""
    # Groups this small are kept entirely for training.
    if n_total < 4:
        return n_total, 0, 0

    # round() strips binary floating-point residue before ceil(); without it
    # 30 * 0.10 evaluates to 3.0000000000000004 and ceil() would return 4.
    n_val = math.ceil(round(n_total * val_split, 9))
    n_test = math.ceil(round(n_total * test_split, 9))

    # Safety net: never let validation + test swallow the whole group.
    if n_val + n_test >= n_total:
        n_val = 1
        n_test = 1

    return n_total - n_val - n_test, n_val, n_test


def split_dataset(val_split=0.10, test_split=0.05, random_seed=42):
    """
    Split the image sets under ``source_dir`` into train/val/test copies.

    An image set is any folder that directly contains a 'ground_truth'
    sub-folder (so e.g. 'SIM_gt_a' and 'SIM_gt_b' are independent units).
    Image sets are grouped by the first path component below ``source_dir``
    (the specimen) and each group is shuffled and split separately, which
    keeps every specimen represented in proportion (stratified split).

    Within a group the validation and test counts are rounded up, training
    receives the remainder, and groups with fewer than 4 image sets go
    entirely to training. Each image set is then copied, with its relative
    path preserved, into ``destination_dir``/train, /val or /test.
    Destinations that already exist are not copied again.

    Folder listings are sorted before shuffling so that a given
    ``random_seed`` always produces the same split, whatever order the
    filesystem happens to list directories in.

    Args:
        val_split: Fraction of each specimen group used for validation.
        test_split: Fraction of each specimen group used for testing.
        random_seed: Seed passed to ``random.seed`` before shuffling.

    Raises:
        ValueError: If a fraction is negative or the two fractions leave
            nothing for training.
    """
    if val_split < 0 or test_split < 0 or val_split + test_split >= 1:
        raise ValueError(
            "val_split and test_split must be non-negative and sum to less than 1"
        )

    random.seed(random_seed)

    print(f"Scanning for all image sets in: {source_dir}")
    if not os.path.exists(source_dir):
        print(f"‚ö†Ô∏è ERROR: The source directory was not found: {source_dir}")
        return

    # --- Group by specimen, finding ALL folders that contain a 'ground_truth' sub-folder ---
    image_sets_by_specimen = defaultdict(list)
    for root, dirs, _ in os.walk(source_dir):
        # Sorting in place makes os.walk descend in a deterministic order.
        dirs.sort()
        if 'ground_truth' in dirs:
            relative_path = os.path.relpath(root, source_dir)
            specimen_name = Path(relative_path).parts[0]
            image_sets_by_specimen[specimen_name].append(root)

    if not image_sets_by_specimen:
        print("‚ö†Ô∏è No image sets with a 'ground_truth' folder were found.")
        return

    # --- Perform a robust stratified split for each group ---
    train_folders, val_folders, test_folders = [], [], []
    print(
        f"Performing robust {100 * (1 - val_split - test_split):g}:"
        f"{100 * val_split:g}:{100 * test_split:g} stratified split..."
    )

    # Specimens and their image sets are visited in sorted order so the seeded
    # shuffles consume the random stream identically on every run.
    for specimen in sorted(image_sets_by_specimen):
        image_set_paths = sorted(image_sets_by_specimen[specimen])
        random.shuffle(image_set_paths)

        n_train, n_val, _n_test = _split_counts(
            len(image_set_paths), val_split, test_split
        )

        train_folders.extend(image_set_paths[:n_train])
        val_folders.extend(image_set_paths[n_train:n_train + n_val])
        test_folders.extend(image_set_paths[n_train + n_val:])

    total_found = len(train_folders) + len(val_folders) + len(test_folders)
    print(f"Total image sets found and split: {total_found}")
    print(f"Splitting into: {len(train_folders)} Train, {len(val_folders)} Validation, {len(test_folders)} Test folders.")

    # --- Define and execute the copy process ---
    train_dest = os.path.join(destination_dir, 'train')
    val_dest = os.path.join(destination_dir, 'val')
    test_dest = os.path.join(destination_dir, 'test')
    all_dests = (train_dest, val_dest, test_dest)

    def copy_folders(folder_list, destination_path):
        """Copy each image set into destination_path, keeping its relative path."""
        os.makedirs(destination_path, exist_ok=True)
        other_dests = [d for d in all_dests if d != destination_path]
        for src_path in tqdm(folder_list, desc=f"Copying to {os.path.basename(destination_path)}"):
            relative_path = os.path.relpath(src_path, source_dir)

            # A copy left behind by an earlier run with a different split would
            # put the same image set in two splits; skip it and say so.
            if any(os.path.exists(os.path.join(d, relative_path)) for d in other_dests):
                tqdm.write(
                    f"WARNING: '{relative_path}' already exists in another split; "
                    "skipped to avoid train/val/test leakage."
                )
                continue

            final_dest_path = os.path.join(destination_path, relative_path)
            os.makedirs(os.path.dirname(final_dest_path), exist_ok=True)
            if os.path.exists(src_path) and not os.path.exists(final_dest_path):
                shutil.copytree(src_path, final_dest_path)

    copy_folders(train_folders, train_dest)
    copy_folders(val_folders, val_dest)
    copy_folders(test_folders, test_dest)

    print("\n‚úÖ Dataset copying and splitting complete!")

# ==============================================================================
# 3. RUN THE SCRIPT
# ==============================================================================
# Guarded so that the split runs only when this code is executed directly
# (__name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    split_dataset()

Scanning for all image sets in: C:/Users/milso/Documents/THESIS/Dataset/Training Dataset
Performing robust 85:10:5 stratified split...
Total image sets found and split: 1092
Splitting into: 918 Train, 114 Validation, 60 Test folders.


Copying to train: 100%|██████████| 918/918 [00:34<00:00, 26.94it/s]
Copying to val: 100%|██████████| 114/114 [00:04<00:00, 26.70it/s]
Copying to test: 100%|██████████| 60/60 [00:02<00:00, 27.95it/s]


✅ Dataset copying and splitting complete!





## Utilities: Cleanup and Dataset Counts

In [None]:
\

In [None]:
import os

# --- Folder tree to clean ---
base_path = r"C:/Users/milso/Documents/THESIS\Dataset/Training Dataset"

count = 0  # How many desktop.ini files have been deleted

# --- Lazily list every desktop.ini (any letter case) found under base_path ---
ini_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(base_path)
    for name in names
    if name.lower() == "desktop.ini"
)

# --- Delete them one at a time; a failure is reported and the loop moves on ---
for file_path in ini_paths:
    try:
        os.remove(file_path)
        count += 1
        print(f"üóëÔ∏è Removed: {file_path}")
    except Exception as err:
        print(f"‚ö†Ô∏è Could not remove {file_path}: {err}")

print(f"\n‚úÖ Done! Removed {count} desktop.ini file(s) in total.")


In [None]:
import os

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Root folder of the split dataset. count_and_display_datasets() joins a split
# name (e.g. 'train') onto this path, so update it if the dataset moves.
dataset_root = "C:/Users/milso/Documents/THESIS/Dataset/Split Dataset"

# ==============================================================================
# COUNTING FUNCTION
# ==============================================================================
def count_and_display_datasets(root_path, split_name):
    """
    Print how many folders inside one split hold each kind of LR data.

    Walks ``<root_path>/<split_name>`` and tallies the directories that have
    a child folder named 'lr_bicubic' and those that have a child folder
    named 'lr_realistic' (reported as "Noisy"). The tallies are folder
    counts, even though the printed lines use the word "files".

    Prints an error line and returns early when the split folder is missing.
    Returns None in every case.
    """
    target_dir = os.path.join(root_path, split_name)

    if not os.path.exists(target_dir):
        print(f"‚ö†Ô∏è Error: Directory not found: {target_dir}")
        return

    # One list of child-folder names per directory visited in the tree
    child_folder_lists = [subfolders for _, subfolders, _ in os.walk(target_dir)]

    # Each membership test is True/False, so sum() yields the number of matches
    bicubic_count = sum('lr_bicubic' in subfolders for subfolders in child_folder_lists)
    noisy_count = sum('lr_realistic' in subfolders for subfolders in child_folder_lists)

    # --- DISPLAY OUTPUT ---
    report_lines = (
        f"--- {split_name.upper()} FOLDER ---",
        f"{bicubic_count} files in dataset Bicubic",
        f"{noisy_count} files in dataset Noisy",
        "-" * 30,
    )
    for line in report_lines:
        print(line)

# ==============================================================================
# EXECUTION
# ==============================================================================
if __name__ == "__main__":
    print(f"Scanning directory: {dataset_root}\n")

    # Report the train split first, then the test split.
    # NOTE(review): no 'val' split is counted here - confirm that is intended.
    for split_name in ('train', 'test'):
        count_and_display_datasets(dataset_root, split_name)