This code pre-processes the images in a given folder: it resizes them to 256×144 pixels and normalizes their greyscale distribution.
The normalize_two_folders() function is the main entry point. It takes two folder paths: a main folder and a reference folder. The average CDF of the reference folder's images is calculated; each image in the main folder is then greyscale-normalized to match that CDF and resized, and the new images are stored in a subfolder of the processed folder. To process the reference folder and normalize it against itself, pass the same path for both arguments.

In [1]:
from PIL import Image
import os
import numpy as np

def compute_cdf(image):
    """Return the normalized cumulative histogram (CDF) of an 8-bit image.

    Args:
        image: Anything ``np.array`` can turn into an array of pixel values,
            e.g. a PIL image in mode ``'L'``.

    Returns:
        A float32 array of length 256. For integer pixel values, entry ``k``
        is the fraction of counted pixels whose value is ``<= k``; the last
        entry is 1.0.

    Raises:
        ValueError: If no pixel value lies inside ``[0, 255]`` (for example
            a zero-sized image), because the CDF is undefined in that case.
    """
    arr = np.array(image).ravel()
    # np.histogram ignores values outside the requested range.
    hist, _ = np.histogram(arr, bins=256, range=(0, 255))
    if hist.sum() == 0:
        # Normalizing would divide by zero and return an all-NaN CDF, which
        # would silently corrupt any average it is later folded into.
        raise ValueError(
            "Cannot compute a CDF: image has no pixels in the range 0-255."
        )
    cdf = hist.cumsum().astype(np.float32)
    cdf /= cdf[-1]
    return cdf

def match_histogram(source_img, ref_cdf):
    """Remap the grey levels of an image so its CDF follows ``ref_cdf``.

    Args:
        source_img: PIL image whose pixel values are used as indices into a
            256-entry lookup table (assumes an 8-bit, single-channel image
            such as mode ``'L'`` - confirm against callers).
        ref_cdf: Length-256 reference CDF to match against.

    Returns:
        A new PIL image in mode ``'L'`` with the same dimensions as
        ``source_img``.
    """
    pixels = np.array(source_img).ravel()

    # Normalized cumulative histogram of the source image.
    counts = np.histogram(pixels, bins=256, range=(0, 255))[0]
    source_cdf = np.cumsum(counts).astype(np.float32)
    source_cdf = source_cdf / source_cdf[-1]

    # For every source grey level, look up the reference grey level that sits
    # at the same cumulative probability; the cast truncates to an integer.
    grey_levels = np.arange(256)
    mapping = np.interp(source_cdf, ref_cdf, grey_levels).astype(np.uint8)

    # PIL reports size as (width, height); NumPy wants (rows, columns).
    width, height = source_img.size
    remapped = mapping[pixels].reshape((height, width))
    return Image.fromarray(remapped, mode='L')

def crop_grayscale(image, keep_fraction=0.703):
    """Keep the top part of an image and convert it to greyscale.

    Args:
        image: PIL image to crop.
        keep_fraction: Fraction of the image height, measured from the top
            edge, that is kept. The default of 0.703 reproduces the
            previously hard-coded crop.
            NOTE(review): presumably this removes an annotation banner at
            the bottom of the micrographs - confirm with the data owner.

    Returns:
        A new PIL image in mode ``'L'`` with the full original width and
        ``int(height * keep_fraction)`` rows.

    Raises:
        ValueError: If ``keep_fraction`` is not in the interval ``(0, 1]``.
    """
    if not 0 < keep_fraction <= 1:
        raise ValueError(
            f"keep_fraction must be in the interval (0, 1], got {keep_fraction!r}"
        )
    w, h = image.size
    # Crop box is (left, upper, right, lower); only the lower edge moves.
    cropped = image.crop((0, 0, w, int(h * keep_fraction)))
    return cropped.convert('L')

def ensure_horizontal(image):
    """Return ``image`` in landscape orientation.

    An image that is taller than it is wide is rotated by -90 degrees with
    ``expand=True`` so the canvas grows to fit the rotated content; any other
    image is returned unchanged.
    """
    width, height = image.size
    is_portrait = height > width
    return image.rotate(-90, expand=True) if is_portrait else image

def collect_image_paths(folder_path):
    """Recursively list the image files found below ``folder_path``.

    A file counts as an image when its name ends, case-insensitively, with
    one of ``.tif``, ``.jpg``, ``.jpeg``, ``.png`` or ``.heic``.

    Returns:
        A list of ``(directory, file_name, full_path)`` tuples in the order
        produced by ``os.walk``.
    """
    image_suffixes = (".tif", ".jpg", ".jpeg", ".png", ".heic")
    found = []
    for directory, _subdirs, names in os.walk(folder_path):
        found.extend(
            (directory, name, os.path.join(directory, name))
            for name in names
            if name.lower().endswith(image_suffixes)
        )
    return found

def compute_average_cdf_from_folder(folder_path):
    """Average the greyscale CDFs of all images found below a folder.

    Every image is rotated to landscape, cropped and converted to greyscale
    before its CDF is computed. Images that fail at any step are reported on
    stdout and left out of the average.

    Args:
        folder_path: Folder that is searched recursively for images.

    Returns:
        The element-wise mean of the individual CDFs.

    Raises:
        RuntimeError: If not a single image could be processed.
    """
    cdfs = []

    for _directory, _name, path in collect_image_paths(folder_path):
        try:
            with Image.open(path) as raw:
                prepared = crop_grayscale(ensure_horizontal(raw))
                cdfs.append(compute_cdf(prepared))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"Skipping {path} due to error: {e}")

    if cdfs:
        return np.mean(cdfs, axis=0)

    raise RuntimeError(f"No valid images found in {folder_path} to compute average CDF.")

def _is_within(path, root):
    """Return True if ``path`` is ``root`` itself or lies somewhere below it."""
    path = os.path.normcase(os.path.abspath(path))
    root = os.path.normcase(os.path.abspath(root))
    # os.path.join(root, "") appends exactly one trailing separator, so
    # "/data/out2" is not mistaken for a child of "/data/out".
    return path == root or path.startswith(os.path.join(root, ""))


def process_folder_with_cdf(input_folder, ref_cdf, output_suffix="processed_from_reference", target_size=(256, 144), flatten_output=False):
    """Histogram-match, resize and save every image found below a folder.

    Each image is rotated to landscape, cropped and converted to greyscale,
    matched to ``ref_cdf``, resized to ``target_size`` and written as a JPEG
    below ``<input_folder>/<output_suffix>``. Failures are reported on stdout
    and do not stop the remaining images from being processed.

    Files that already live inside the output directory are skipped, so
    running the function again does not crop and re-save its own earlier
    results into ever deeper nested folders.

    Args:
        input_folder: Folder that is searched recursively for images.
        ref_cdf: Length-256 reference CDF handed to ``match_histogram``.
        output_suffix: Name of the output directory created inside
            ``input_folder``.
        target_size: ``(width, height)`` of the saved images.
        flatten_output: If True, write every result directly into the output
            directory; otherwise mirror the sub-folder layout of
            ``input_folder``.

    Note:
        Output names are built from the file stem only, so two inputs that
        map to the same output path (same stem with different extensions, or
        same name in different folders when flattening) overwrite each other.
    """
    output_dir = os.path.join(input_folder, output_suffix)

    # Only filter when the output directory is a genuine sub-folder of the
    # input; if it is the input folder itself or lies elsewhere, filtering
    # would either drop every image or have nothing to match.
    output_is_nested = (
        _is_within(output_dir, input_folder)
        and not _is_within(input_folder, output_dir)
    )

    image_paths = collect_image_paths(input_folder)
    if output_is_nested:
        image_paths = [
            entry for entry in image_paths
            if not _is_within(entry[0], output_dir)
        ]

    os.makedirs(output_dir, exist_ok=True)

    for dirpath, fname, full_path in image_paths:
        try:
            with Image.open(full_path) as im:
                im = ensure_horizontal(im)
                gray = crop_grayscale(im)
                matched = match_histogram(gray, ref_cdf)
                resized = matched.resize(target_size)

                # Output path: flatten if requested
                if flatten_output:
                    subdir = output_dir
                else:
                    rel_path = os.path.relpath(dirpath, input_folder)
                    # relpath yields "." for the top level; joining it would
                    # leave a stray "./" component in the saved path.
                    if rel_path == os.curdir:
                        subdir = output_dir
                    else:
                        subdir = os.path.join(output_dir, rel_path)
                    os.makedirs(subdir, exist_ok=True)
                out_path = os.path.join(subdir, os.path.splitext(fname)[0] + ".jpg")

                resized.save(out_path, format='JPEG', quality=100)
                print(f"Saved: {out_path}")
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing {full_path}: {e}")



In [3]:
def normalize_two_folders(main_folder, reference_folder):
    """Normalize the images of ``main_folder`` against ``reference_folder``.

    The average greyscale CDF of the reference folder is computed first; the
    images of the main folder are then matched to it and written to the
    ``processed_from_reference`` sub-folder of ``main_folder``, keeping the
    original sub-folder layout.

    The reference folder itself is not processed. To normalize it as well,
    call ``process_folder_with_cdf`` on it with the same CDF, or pass the
    same path for both arguments.
    """
    print(f"Computing average CDF from reference folder: {reference_folder}")
    reference_cdf = compute_average_cdf_from_folder(reference_folder)

    print("\nProcessing main folder with reference CDF...")
    process_folder_with_cdf(
        input_folder=main_folder,
        ref_cdf=reference_cdf,
        output_suffix="processed_from_reference",
        flatten_output=False,
    )

# Example usage: normalize the images below main_folder against the average
# greyscale CDF of the images below reference_folder.
# Both paths are relative to the current working directory and are written
# with Windows-style backslash separators.
main_folder = r"Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey"
reference_folder = r"Preprocessed Images\Labeled SEM\ourimg_normgrey"
# Results are written below <main_folder>\processed_from_reference; one
# "Saved: ..." line is printed for every image that was written.
normalize_two_folders(main_folder, reference_folder)


Computing average CDF from reference folder: Preprocessed Images\Labeled SEM\ourimg_normgrey

Processing main folder with reference CDF...
Saved: Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey\processed_from_reference\.\1-S1-no_area-50k-ordered.jpg
Saved: Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey\processed_from_reference\.\1-S4-A67-50k-disordered.jpg
Saved: Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey\processed_from_reference\.\10-S1-no_area-100k-ordered.jpg
Saved: Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey\processed_from_reference\.\10-S4-A51-50k-disordered.jpg
Saved: Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey\processed_from_reference\.\100-S1-no_area-100k-ordered.jpg
Saved: Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey\processed_from_reference\.\100-S3-A43-50k-disordered.jpg
Saved: Preprocessed Images\Unlabeled SEM\processed_smaller_20K_normgrey\processed_from_r