In [1]:
import os
import cv2
import numpy as np
from collections import defaultdict

def is_noisy(image_path, blur_thresh=100.0, var_thresh=10.0, entropy_thresh=3.0):
    """Heuristically decide whether an image file should be discarded as noisy.

    The image is loaded as grayscale and run through three checks in order;
    the first check that fails short-circuits with ``True``:

    1. Blur: variance of the Laplacian below ``blur_thresh``.
    2. Flatness: pixel-intensity variance below ``var_thresh``.
    3. Information content: entropy (log base 2) of the 256-bin intensity
       histogram below ``entropy_thresh``.

    Args:
        image_path: Path of the image file to inspect.
        blur_thresh: Minimum acceptable Laplacian variance.
        var_thresh: Minimum acceptable pixel variance.
        entropy_thresh: Minimum acceptable histogram entropy.

    Returns:
        ``True`` if the file cannot be read, if processing raises an
        ``Exception``, or if any check fails; ``False`` otherwise.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return True  # unreadable or empty image

        # 1. Blur detection using Laplacian variance
        lap_var = cv2.Laplacian(img, cv2.CV_64F).var()
        if lap_var < blur_thresh:
            return True

        # 2. Pixel variance (flat images)
        pixel_var = np.var(img)
        if pixel_var < var_thresh:
            return True

        # 3. Entropy (information content)
        histogram = cv2.calcHist([img], [0], None, [256], [0, 256]).ravel()
        histogram /= histogram.sum()  # normalize
        histogram = histogram[histogram > 0]  # remove zeros to avoid log(0)
        entropy = -np.sum(histogram * np.log2(histogram))
        if entropy < entropy_thresh:
            return True

        return False
    except Exception:
        # Processing failures count as noisy. `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate, so a long scan can
        # still be interrupted instead of mislabelling the current file.
        return True

def count_noisy_images(folder_path):
    """Tally flagged and total files for each class sub-folder.

    Every sub-directory of ``folder_path`` is treated as one class. Each
    entry inside it is counted and its path is passed to ``is_noisy``.

    Args:
        folder_path: Dataset root containing one sub-folder per class.

    Returns:
        ``(noisy_counts, total_counts)``: two ``defaultdict(int)`` keyed by
        class folder name. A class appears in ``total_counts`` only once it
        has at least one entry, and in ``noisy_counts`` only once an entry
        has been flagged.
    """
    flagged_per_class = defaultdict(int)
    seen_per_class = defaultdict(int)

    for label in os.listdir(folder_path):
        label_dir = os.path.join(folder_path, label)
        # Plain files sitting next to the class folders are ignored.
        if os.path.isdir(label_dir):
            for entry in os.listdir(label_dir):
                seen_per_class[label] += 1
                if is_noisy(os.path.join(label_dir, entry)):
                    flagged_per_class[label] += 1

    return flagged_per_class, seen_per_class

# Dataset root: each sub-folder is treated as one class
dataset_path = "augmented_balanced_data"
noisy_counts, total_counts = count_noisy_images(dataset_path)

# Per-class report: flagged / total and the flagged percentage
print("\n📊 Noisy Image Count Per Class:")
for cls, total in total_counts.items():
    noisy = noisy_counts[cls]
    print(f"  {cls}: {noisy}/{total} images flagged as noisy ({(noisy/total)*100:.2f}%)")




📊 Noisy Image Count Per Class:
  CNV: 1188/51390 images flagged as noisy (2.31%)
  DME: 9363/51390 images flagged as noisy (18.22%)
  DRUSEN: 9864/51390 images flagged as noisy (19.19%)
  NORMAL: 129/51390 images flagged as noisy (0.25%)


In [2]:
import os
import cv2
import numpy as np
import shutil
from tqdm import tqdm
from collections import defaultdict
import albumentations as A

# --- Noisy Image Detection Function ---
def is_noisy(image_path, blur_thresh=100.0, var_thresh=10.0, entropy_thresh=3.0):
    """Heuristically decide whether an image file should be discarded as noisy.

    The image is loaded as grayscale and run through three checks in order;
    the first check that fails short-circuits with ``True``:

    1. Blur: variance of the Laplacian below ``blur_thresh``.
    2. Flatness: pixel-intensity variance below ``var_thresh``.
    3. Information content: entropy (log base 2) of the 256-bin intensity
       histogram below ``entropy_thresh``.

    Args:
        image_path: Path of the image file to inspect.
        blur_thresh: Minimum acceptable Laplacian variance.
        var_thresh: Minimum acceptable pixel variance.
        entropy_thresh: Minimum acceptable histogram entropy.

    Returns:
        ``True`` if the file cannot be read, if processing raises an
        ``Exception``, or if any check fails; ``False`` otherwise.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return True  # unreadable or empty image

        # 1. Blur detection using Laplacian variance
        lap_var = cv2.Laplacian(img, cv2.CV_64F).var()
        if lap_var < blur_thresh:
            return True

        # 2. Pixel variance (flat images)
        pixel_var = np.var(img)
        if pixel_var < var_thresh:
            return True

        # 3. Entropy (information content)
        histogram = cv2.calcHist([img], [0], None, [256], [0, 256]).ravel()
        histogram /= histogram.sum()  # normalize
        histogram = histogram[histogram > 0]  # remove zeros to avoid log(0)
        entropy = -np.sum(histogram * np.log2(histogram))
        if entropy < entropy_thresh:
            return True

        return False
    except Exception:
        # Processing failures count as noisy. `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate, so a long scan can
        # still be interrupted instead of mislabelling the current file.
        return True

# --- Augmentation Setup ---
# Pipeline applied to every image that is synthesised to balance the classes.
# Each step carries its own probability `p`.
# NOTE(review): this header originally said "NO ZOOM", but ShiftScaleRotate
# below is given scale_limit=0.05, which presumably still allows a small
# random rescale. Set scale_limit=0 if no zoom at all is intended — confirm.
transform = A.Compose([
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.5),
    A.Rotate(limit=15, p=0.5),
    # NOTE(review): blurring presumably lowers the Laplacian variance that
    # is_noisy thresholds on, so some outputs may be flagged as noisy later.
    A.GaussianBlur(p=0.3),
    # rotate_limit=0 here; rotation is configured by the separate A.Rotate step.
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=0, p=0.5)
])

# --- Clean and Augment Dataset ---
def clean_and_augment_data(dataset_path, output_path):
    """Copy non-noisy images to ``output_path`` and augment classes to equal size.

    1. For each class sub-folder of ``dataset_path``, every file that
       ``is_noisy`` accepts is copied into the matching sub-folder of
       ``output_path``.
    2. The largest clean class size becomes the target count.
    3. Each smaller class cycles through its clean images, applies the
       module-level ``transform`` and writes ``aug_<n>_<original name>``
       files until it reaches the target.

    Augmented images are not re-screened with ``is_noisy``. Classes with no
    clean image are left empty because there is nothing to augment from.

    Args:
        dataset_path: Folder containing one sub-folder per class.
        output_path: Destination folder; created if it does not exist.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_path, exist_ok=True)
    class_counts = defaultdict(int)
    image_paths = defaultdict(list)

    # 1. Remove noisy images and copy clean ones
    for class_name in os.listdir(dataset_path):
        class_path = os.path.join(dataset_path, class_name)
        if not os.path.isdir(class_path):
            continue

        clean_class_path = os.path.join(output_path, class_name)
        os.makedirs(clean_class_path, exist_ok=True)

        for fname in os.listdir(class_path):
            img_path = os.path.join(class_path, fname)
            if not is_noisy(img_path):
                dest_path = os.path.join(clean_class_path, fname)
                shutil.copy(img_path, dest_path)
                class_counts[class_name] += 1
                image_paths[class_name].append(dest_path)

    # 2. Determine max count
    if not class_counts:
        # max() of an empty sequence raises ValueError; nothing to balance.
        print("\n⚠️ No clean images found; nothing to balance.")
        return
    max_count = max(class_counts.values())

    # 3. Perform augmentation to balance all classes
    print("\n🔁 Performing Augmentation to Balance Classes...")
    for class_name, paths in tqdm(image_paths.items()):
        # `paths` is never empty: a key only exists once a path was appended.
        current_count = class_counts[class_name]
        class_dir = os.path.join(output_path, class_name)
        img_idx = 0
        failures_in_a_row = 0
        while current_count < max_count:
            img_path = paths[img_idx % len(paths)]
            img_idx += 1

            written = False
            image = cv2.imread(img_path)
            if image is not None:
                aug_img = transform(image=image)['image']
                new_name = f"aug_{current_count}_{os.path.basename(img_path)}"
                # Only count the image if it actually reached the disk.
                written = cv2.imwrite(os.path.join(class_dir, new_name), aug_img)

            if written:
                current_count += 1
                failures_in_a_row = 0
                continue

            # A full pass over the class without one success means retrying
            # cannot make progress; stop instead of looping forever.
            failures_in_a_row += 1
            if failures_in_a_row >= len(paths):
                print(f"⚠️ {class_name}: could not read/write any image; "
                      f"stopped at {current_count}/{max_count}.")
                break

    print("\n✅ Dataset cleaned and balanced with augmentation.")

# --- Run ---
# Class sub-folders are read from `dataset_path`; the filtered copies and the
# generated augmentations are written under `output_path`.
dataset_path = "augmented_balanced_data"
output_path = "clean_balanced_data"
clean_and_augment_data(dataset_path, output_path)


  check_for_updates()
  original_init(self, **validated_kwargs)



🔁 Performing Augmentation to Balance Classes...


100%|██████████| 4/4 [03:49<00:00, 57.38s/it]


✅ Dataset cleaned and balanced with augmentation.





In [3]:
import os
import cv2
import numpy as np
from collections import defaultdict

def is_noisy(image_path, blur_thresh=100.0, var_thresh=10.0, entropy_thresh=3.0):
    """Heuristically decide whether an image file should be discarded as noisy.

    The image is loaded as grayscale and run through three checks in order;
    the first check that fails short-circuits with ``True``:

    1. Blur: variance of the Laplacian below ``blur_thresh``.
    2. Flatness: pixel-intensity variance below ``var_thresh``.
    3. Information content: entropy (log base 2) of the 256-bin intensity
       histogram below ``entropy_thresh``.

    Args:
        image_path: Path of the image file to inspect.
        blur_thresh: Minimum acceptable Laplacian variance.
        var_thresh: Minimum acceptable pixel variance.
        entropy_thresh: Minimum acceptable histogram entropy.

    Returns:
        ``True`` if the file cannot be read, if processing raises an
        ``Exception``, or if any check fails; ``False`` otherwise.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return True  # unreadable or empty image

        # 1. Blur detection using Laplacian variance
        lap_var = cv2.Laplacian(img, cv2.CV_64F).var()
        if lap_var < blur_thresh:
            return True

        # 2. Pixel variance (flat images)
        pixel_var = np.var(img)
        if pixel_var < var_thresh:
            return True

        # 3. Entropy (information content)
        histogram = cv2.calcHist([img], [0], None, [256], [0, 256]).ravel()
        histogram /= histogram.sum()  # normalize
        histogram = histogram[histogram > 0]  # remove zeros to avoid log(0)
        entropy = -np.sum(histogram * np.log2(histogram))
        if entropy < entropy_thresh:
            return True

        return False
    except Exception:
        # Processing failures count as noisy. `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate, so a long scan can
        # still be interrupted instead of mislabelling the current file.
        return True

def count_noisy_images(folder_path):
    """Tally flagged and total files for each class sub-folder.

    Every sub-directory of ``folder_path`` is treated as one class. Each
    entry inside it is counted and its path is passed to ``is_noisy``.

    Args:
        folder_path: Dataset root containing one sub-folder per class.

    Returns:
        ``(noisy_counts, total_counts)``: two ``defaultdict(int)`` keyed by
        class folder name. A class appears in ``total_counts`` only once it
        has at least one entry, and in ``noisy_counts`` only once an entry
        has been flagged.
    """
    flagged_per_class = defaultdict(int)
    seen_per_class = defaultdict(int)

    for label in os.listdir(folder_path):
        label_dir = os.path.join(folder_path, label)
        # Plain files sitting next to the class folders are ignored.
        if os.path.isdir(label_dir):
            for entry in os.listdir(label_dir):
                seen_per_class[label] += 1
                if is_noisy(os.path.join(label_dir, entry)):
                    flagged_per_class[label] += 1

    return flagged_per_class, seen_per_class

# Dataset root: each sub-folder is treated as one class
dataset_path = "clean_balanced_data"
noisy_counts, total_counts = count_noisy_images(dataset_path)

# Per-class report: flagged / total and the flagged percentage
print("\n📊 Noisy Image Count Per Class:")
for cls, total in total_counts.items():
    noisy = noisy_counts[cls]
    print(f"  {cls}: {noisy}/{total} images flagged as noisy ({(noisy/total)*100:.2f}%)")




📊 Noisy Image Count Per Class:
  CNV: 401/51261 images flagged as noisy (0.78%)
  DME: 3600/51261 images flagged as noisy (7.02%)
  DRUSEN: 3733/51261 images flagged as noisy (7.28%)
  NORMAL: 0/51261 images flagged as noisy (0.00%)


In [9]:
import os
import cv2
import numpy as np
import shutil
from collections import defaultdict
import random

def is_noisy(image_path, blur_thresh=100.0, var_thresh=10.0, entropy_thresh=3.0):
    """Heuristically decide whether an image file should be discarded as noisy.

    The image is loaded as grayscale and run through three checks in order;
    the first check that fails short-circuits with ``True``:

    1. Blur: variance of the Laplacian below ``blur_thresh``.
    2. Flatness: pixel-intensity variance below ``var_thresh``.
    3. Information content: entropy (log base 2) of the 256-bin intensity
       histogram below ``entropy_thresh``.

    Args:
        image_path: Path of the image file to inspect.
        blur_thresh: Minimum acceptable Laplacian variance.
        var_thresh: Minimum acceptable pixel variance.
        entropy_thresh: Minimum acceptable histogram entropy.

    Returns:
        ``True`` if the file cannot be read, if processing raises an
        ``Exception``, or if any check fails; ``False`` otherwise.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return True  # unreadable or empty image

        # 1. Blur detection using Laplacian variance
        lap_var = cv2.Laplacian(img, cv2.CV_64F).var()
        if lap_var < blur_thresh:
            return True

        # 2. Pixel variance (flat images)
        pixel_var = np.var(img)
        if pixel_var < var_thresh:
            return True

        # 3. Entropy (information content)
        histogram = cv2.calcHist([img], [0], None, [256], [0, 256]).ravel()
        histogram /= histogram.sum()  # normalize
        histogram = histogram[histogram > 0]  # remove zeros to avoid log(0)
        entropy = -np.sum(histogram * np.log2(histogram))
        if entropy < entropy_thresh:
            return True

        return False
    except Exception:
        # Processing failures count as noisy. `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate, so a long scan can
        # still be interrupted instead of mislabelling the current file.
        return True

def remove_noisy_and_balance(input_dir, output_dir, seed=None):
    """Filter out noisy images and down-sample every class to the same size.

    Each class sub-folder of ``input_dir`` is scanned with ``is_noisy``. The
    smallest number of clean images across classes becomes the target, and
    that many clean images are randomly sampled from each class and copied
    into ``output_dir/<class>``.

    Args:
        input_dir: Folder containing one sub-folder per class.
        output_dir: Destination folder; created if it does not exist.
        seed: Optional seed for the random sampling. With a seed the same
            files are selected on every run; ``None`` (default) uses the
            global ``random`` state as before.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    class_images = defaultdict(list)

    print("🔍 Scanning and filtering noisy images...")
    # Listings are sorted so a seeded selection does not depend on the
    # order the file system happens to return entries in.
    for class_name in sorted(os.listdir(input_dir)):
        class_path = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        clean_images = []
        for img_name in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_name)
            if not is_noisy(img_path):
                clean_images.append(img_path)

        class_images[class_name] = clean_images
        print(f"  {class_name}: {len(clean_images)} clean images")

    if not class_images:
        # min() of an empty sequence raises ValueError; nothing to balance.
        print("\n⚠️ No class folders found; nothing to balance.")
        return

    # Find the minimum count
    min_count = min(len(imgs) for imgs in class_images.values())
    print(f"\n🎯 Target image count per class (based on smallest class): {min_count}\n")

    # A dedicated generator keeps seeded runs from touching global random state.
    rng = random.Random(seed) if seed is not None else random

    print("📦 Copying balanced dataset...")
    for class_name, images in class_images.items():
        selected_images = rng.sample(images, min_count)
        dest_class_dir = os.path.join(output_dir, class_name)
        os.makedirs(dest_class_dir, exist_ok=True)

        for src_path in selected_images:
            dst_path = os.path.join(dest_class_dir, os.path.basename(src_path))
            shutil.copy(src_path, dst_path)

        print(f"  {class_name}: {len(selected_images)} images copied")

    print("\n✅ Done! Noisy images removed and classes balanced.")

# === Usage ===
# NOTE: the input here is not a raw dataset; "clean_balanced_data" is the
# folder name used as output_path for clean_and_augment_data in this notebook.
input_dataset = "clean_balanced_data"              # Source folder, one sub-folder per class
output_dataset = "clean_balanced_data_balanced"    # Output after filtering and balancing
remove_noisy_and_balance(input_dataset, output_dataset)


🔍 Scanning and filtering noisy images...
  CNV: 50860 clean images
  DME: 47661 clean images
  DRUSEN: 47528 clean images
  NORMAL: 51261 clean images

🎯 Target image count per class (based on smallest class): 47528

📦 Copying balanced dataset...
  CNV: 47528 images copied
  DME: 47528 images copied
  DRUSEN: 47528 images copied
  NORMAL: 47528 images copied

✅ Done! Noisy images removed and classes balanced.


In [10]:
import os
import cv2
import numpy as np
from collections import defaultdict

def is_noisy(image_path, blur_thresh=100.0, var_thresh=10.0, entropy_thresh=3.0):
    """Heuristically decide whether an image file should be discarded as noisy.

    The image is loaded as grayscale and run through three checks in order;
    the first check that fails short-circuits with ``True``:

    1. Blur: variance of the Laplacian below ``blur_thresh``.
    2. Flatness: pixel-intensity variance below ``var_thresh``.
    3. Information content: entropy (log base 2) of the 256-bin intensity
       histogram below ``entropy_thresh``.

    Args:
        image_path: Path of the image file to inspect.
        blur_thresh: Minimum acceptable Laplacian variance.
        var_thresh: Minimum acceptable pixel variance.
        entropy_thresh: Minimum acceptable histogram entropy.

    Returns:
        ``True`` if the file cannot be read, if processing raises an
        ``Exception``, or if any check fails; ``False`` otherwise.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return True  # unreadable or empty image

        # 1. Blur detection using Laplacian variance
        lap_var = cv2.Laplacian(img, cv2.CV_64F).var()
        if lap_var < blur_thresh:
            return True

        # 2. Pixel variance (flat images)
        pixel_var = np.var(img)
        if pixel_var < var_thresh:
            return True

        # 3. Entropy (information content)
        histogram = cv2.calcHist([img], [0], None, [256], [0, 256]).ravel()
        histogram /= histogram.sum()  # normalize
        histogram = histogram[histogram > 0]  # remove zeros to avoid log(0)
        entropy = -np.sum(histogram * np.log2(histogram))
        if entropy < entropy_thresh:
            return True

        return False
    except Exception:
        # Processing failures count as noisy. `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate, so a long scan can
        # still be interrupted instead of mislabelling the current file.
        return True

def count_noisy_images(folder_path):
    """Tally flagged and total files for each class sub-folder.

    Every sub-directory of ``folder_path`` is treated as one class. Each
    entry inside it is counted and its path is passed to ``is_noisy``.

    Args:
        folder_path: Dataset root containing one sub-folder per class.

    Returns:
        ``(noisy_counts, total_counts)``: two ``defaultdict(int)`` keyed by
        class folder name. A class appears in ``total_counts`` only once it
        has at least one entry, and in ``noisy_counts`` only once an entry
        has been flagged.
    """
    flagged_per_class = defaultdict(int)
    seen_per_class = defaultdict(int)

    for label in os.listdir(folder_path):
        label_dir = os.path.join(folder_path, label)
        # Plain files sitting next to the class folders are ignored.
        if os.path.isdir(label_dir):
            for entry in os.listdir(label_dir):
                seen_per_class[label] += 1
                if is_noisy(os.path.join(label_dir, entry)):
                    flagged_per_class[label] += 1

    return flagged_per_class, seen_per_class

# Dataset root: each sub-folder is treated as one class
dataset_path = "clean_balanced_data_balanced"
noisy_counts, total_counts = count_noisy_images(dataset_path)

# Per-class report: flagged / total and the flagged percentage
print("\n📊 Noisy Image Count Per Class:")
for cls, total in total_counts.items():
    noisy = noisy_counts[cls]
    print(f"  {cls}: {noisy}/{total} images flagged as noisy ({(noisy/total)*100:.2f}%)")




📊 Noisy Image Count Per Class:
  CNV: 0/47528 images flagged as noisy (0.00%)
  DME: 0/47528 images flagged as noisy (0.00%)
  DRUSEN: 0/47528 images flagged as noisy (0.00%)
  NORMAL: 0/47528 images flagged as noisy (0.00%)


In [1]:
import os
import cv2
import numpy as np
import shutil
from collections import defaultdict
import random

def is_noisy(image_path, blur_thresh=100.0, var_thresh=10.0, entropy_thresh=3.0):
    """Heuristically decide whether an image file should be discarded as noisy.

    The image is loaded as grayscale and run through three checks in order;
    the first check that fails short-circuits with ``True``:

    1. Blur: variance of the Laplacian below ``blur_thresh``.
    2. Flatness: pixel-intensity variance below ``var_thresh``.
    3. Information content: entropy (log base 2) of the 256-bin intensity
       histogram below ``entropy_thresh``.

    Args:
        image_path: Path of the image file to inspect.
        blur_thresh: Minimum acceptable Laplacian variance.
        var_thresh: Minimum acceptable pixel variance.
        entropy_thresh: Minimum acceptable histogram entropy.

    Returns:
        ``True`` if the file cannot be read, if processing raises an
        ``Exception``, or if any check fails; ``False`` otherwise.
    """
    try:
        img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            return True  # unreadable or empty image

        # 1. Blur detection using Laplacian variance
        lap_var = cv2.Laplacian(img, cv2.CV_64F).var()
        if lap_var < blur_thresh:
            return True

        # 2. Pixel variance (flat images)
        pixel_var = np.var(img)
        if pixel_var < var_thresh:
            return True

        # 3. Entropy (information content)
        histogram = cv2.calcHist([img], [0], None, [256], [0, 256]).ravel()
        histogram /= histogram.sum()  # normalize
        histogram = histogram[histogram > 0]  # remove zeros to avoid log(0)
        entropy = -np.sum(histogram * np.log2(histogram))
        if entropy < entropy_thresh:
            return True

        return False
    except Exception:
        # Processing failures count as noisy. `Exception` (not a bare except)
        # lets KeyboardInterrupt / SystemExit propagate, so a long scan can
        # still be interrupted instead of mislabelling the current file.
        return True

def remove_noisy_and_balance(input_dir, output_dir, seed=None):
    """Filter out noisy images and down-sample every class to the same size.

    Each class sub-folder of ``input_dir`` is scanned with ``is_noisy``. The
    smallest number of clean images across classes becomes the target, and
    that many clean images are randomly sampled from each class and copied
    into ``output_dir/<class>``.

    Args:
        input_dir: Folder containing one sub-folder per class.
        output_dir: Destination folder; created if it does not exist.
        seed: Optional seed for the random sampling. With a seed the same
            files are selected on every run; ``None`` (default) uses the
            global ``random`` state as before.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    os.makedirs(output_dir, exist_ok=True)
    class_images = defaultdict(list)

    print("🔍 Scanning and filtering noisy images...")
    # Listings are sorted so a seeded selection does not depend on the
    # order the file system happens to return entries in.
    for class_name in sorted(os.listdir(input_dir)):
        class_path = os.path.join(input_dir, class_name)
        if not os.path.isdir(class_path):
            continue

        clean_images = []
        for img_name in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, img_name)
            if not is_noisy(img_path):
                clean_images.append(img_path)

        class_images[class_name] = clean_images
        print(f"  {class_name}: {len(clean_images)} clean images")

    if not class_images:
        # min() of an empty sequence raises ValueError; nothing to balance.
        print("\n⚠️ No class folders found; nothing to balance.")
        return

    # Find the minimum count
    min_count = min(len(imgs) for imgs in class_images.values())
    print(f"\n🎯 Target image count per class (based on smallest class): {min_count}\n")

    # A dedicated generator keeps seeded runs from touching global random state.
    rng = random.Random(seed) if seed is not None else random

    print("📦 Copying balanced dataset...")
    for class_name, images in class_images.items():
        selected_images = rng.sample(images, min_count)
        dest_class_dir = os.path.join(output_dir, class_name)
        os.makedirs(dest_class_dir, exist_ok=True)

        for src_path in selected_images:
            dst_path = os.path.join(dest_class_dir, os.path.basename(src_path))
            shutil.copy(src_path, dst_path)

        print(f"  {class_name}: {len(selected_images)} images copied")

    print("\n✅ Done! Noisy images removed and classes balanced.")

# === Usage ===
# Noisy files are filtered out of `input_dataset`, then every class is
# down-sampled to the smallest clean class and copied to `output_dataset`.
input_dataset = "augmented_balanced_data"              # Source folder, one sub-folder per class
output_dataset = "augmented_data_balanced_clean"    # Output after filtering and balancing
remove_noisy_and_balance(input_dataset, output_dataset)


🔍 Scanning and filtering noisy images...
  CNV: 50202 clean images
  DME: 42027 clean images
  DRUSEN: 41526 clean images
  NORMAL: 51261 clean images

🎯 Target image count per class (based on smallest class): 41526

📦 Copying balanced dataset...
  CNV: 41526 images copied
  DME: 41526 images copied
  DRUSEN: 41526 images copied
  NORMAL: 41526 images copied

✅ Done! Noisy images removed and classes balanced.


In [6]:
pip show requests


Name: requests
Version: 2.32.3
Summary: Python HTTP for Humans.
Home-page: https://requests.readthedocs.io
Author: Kenneth Reitz
Author-email: me@kennethreitz.org
License: Apache-2.0
Location: c:\users\dell\anaconda3\envs\eyediseaseclassification_env\lib\site-packages
Requires: certifi, charset-normalizer, idna, urllib3
Required-by: requests-oauthlib, streamlit, tensorboard
Note: you may need to restart the kernel to use updated packages.
