In [1]:
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os
from PIL import Image
import hashlib
from shutil import copy2
from tqdm import tqdm



In [2]:
def file_hash(filepath, chunk_size=1 << 20):
    """Return the hexadecimal MD5 digest of a file's contents.

    The file is read in fixed-size chunks so that a large file is never held
    in memory in full. The digest is used here purely as a content
    fingerprint for duplicate detection.

    Args:
        filepath: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (default 1 MiB).
            Must be a positive integer.

    Returns:
        The MD5 digest as a hexadecimal string.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        # read(0) would return b'' immediately and yield the digest of nothing.
        raise ValueError("chunk_size must be a positive integer")

    digest = hashlib.md5()
    with open(filepath, 'rb') as f:
        # iter() with a sentinel stops on the empty bytes object returned at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

def check_duplicates(set1, set2, extensions=('.jpg',)):
    """Find byte-identical image files within and across two directory trees.

    Both trees are walked recursively and every matching file is fingerprinted
    with ``file_hash``. All files share a single hash table, so a pair is
    reported whether the two copies live in different trees or in the same one.

    Args:
        set1: Root directory of the first dataset (walked first).
        set2: Root directory of the second dataset.
        extensions: File-name suffix, or tuple of suffixes, to consider.
            Matching is case-insensitive, so ``'.jpg'`` also accepts ``.JPG``.

    Returns:
        A list of ``(first_seen_path, duplicate_path)`` tuples, one per file
        whose content hash had already been seen earlier in the traversal.
    """
    if isinstance(extensions, str):
        extensions = (extensions,)
    suffixes = tuple(ext.lower() for ext in extensions)

    hashes = {}
    duplicates = []

    # Process all files in both sets and store their hashes
    for dataset_path in (set1, set2):
        for root, dirs, files in os.walk(dataset_path):
            # Sorting in place makes os.walk descend in a stable order, so the
            # path recorded as "first seen" does not vary between runs.
            dirs.sort()
            for filename in sorted(files):
                if not filename.lower().endswith(suffixes):
                    continue
                file_path = os.path.join(root, filename)
                filehash = file_hash(file_path)
                if filehash in hashes:
                    duplicates.append((hashes[filehash], file_path))
                else:
                    hashes[filehash] = file_path
    return duplicates

# Count byte-identical images across the Training and Testing splits.
# Each entry of `duplicates` is a (first_seen_path, duplicate_path) pair;
# iterate over it to inspect individual pairs.
duplicates = check_duplicates('raw_data/Training', 'raw_data/Testing')
if not duplicates:
    print("No duplicates found.")
else:
    print("Duplicates found:", len(duplicates))

Duplicates found: 297


In [3]:
def preprocess_image(image_path, output_path, size=(200, 200),
                     filter_diameter=2, sigma_color=50, sigma_space=50):
    """Load an image, preprocess it, and save the result.

    Steps, in order:
      1. Open the image and convert it to single-channel grayscale (mode 'L').
      2. Apply an OpenCV bilateral filter.
      3. Resize to ``size``.
      4. Save to ``output_path``.

    Args:
        image_path: Path of the source image.
        output_path: Path the processed image is written to. The containing
            directory is not created here; the caller must ensure it exists.
        size: Target size passed to ``cv2.resize`` as ``dsize``
            (default ``(200, 200)``).
        filter_diameter: ``d`` argument of ``cv2.bilateralFilter`` (default 2).
        sigma_color: ``sigmaColor`` argument of the bilateral filter (default 50).
        sigma_space: ``sigmaSpace`` argument of the bilateral filter (default 50).

    Raises:
        OSError: Presumably raised by PIL when the file cannot be opened or
            decoded; the caller in this notebook catches ``IOError`` for that.
    """
    # Materialise the pixels as an array inside the context manager so the
    # source file can be released before the heavier processing starts.
    with Image.open(image_path) as img:
        gray = np.array(img.convert('L'))  # Change 'L' to 'RGB', etc., as needed

    filtered = cv2.bilateralFilter(gray, filter_diameter, sigma_color, sigma_space)
    resized = cv2.resize(filtered, size)

    # Save the processed image
    Image.fromarray(resized).save(output_path)

def find_unique_images(src_dir, dest_dir, conditions=None,
                       folders=('Testing', 'Training')):
    """Preprocess every distinct image under ``src_dir`` into ``dest_dir``.

    The source layout is ``src_dir/<folder>/<condition>/<file>``. Each file is
    fingerprinted with ``file_hash``; the first file seen with a given hash is
    passed to ``preprocess_image`` and written to
    ``dest_dir/<condition>/<file name>``. Any later file with the same hash is
    skipped. The hash table is shared across all conditions and folders, so an
    image that appears under two conditions is kept only under the first one.

    Args:
        src_dir: Root directory containing the split folders.
        dest_dir: Root directory for the preprocessed output; one
            sub-directory per condition is created if missing.
        conditions: Iterable of class sub-directory names. Defaults to
            ``['glioma', 'meningioma', 'notumor', 'pituitary']``.
        folders: Split folder names, processed in the given order.

    Returns:
        None. Files that raise ``IOError`` while being hashed or preprocessed
        are reported on stdout and skipped.
    """
    if conditions is None:
        conditions = ['glioma', 'meningioma', 'notumor', 'pituitary']
    hashes = {}

    for condition in conditions:
        condition_path = os.path.join(dest_dir, condition)
        os.makedirs(condition_path, exist_ok=True)

        # Output names written for this condition during this run. Used to
        # keep two different images that share a basename (one per split)
        # from silently overwriting each other.
        written_names = set()

        # Process both Testing and Training folders
        for folder in folders:
            current_path = os.path.join(src_dir, folder, condition)
            if not os.path.isdir(current_path):
                continue

            # Sorted so that which copy of a duplicate survives is reproducible.
            entries = sorted(os.listdir(current_path))
            for file in tqdm(entries, desc=f'Processing {condition} from {folder}'):
                filepath = os.path.join(current_path, file)
                try:
                    img_hash = file_hash(filepath)
                    if img_hash in hashes:
                        continue
                    hashes[img_hash] = filepath

                    output_name = os.path.basename(file)
                    if output_name in written_names:
                        # Same name, different content: disambiguate with a
                        # short prefix of the content hash.
                        stem, ext = os.path.splitext(output_name)
                        output_name = f"{stem}_{img_hash[:8]}{ext}"

                    preprocess_image(filepath, os.path.join(condition_path, output_name))
                    written_names.add(output_name)
                except IOError:
                    print(f"Skipped non-image file: {filepath}")

# Raw dataset root (holds the Testing/Training splits) and the output root for
# the de-duplicated, preprocessed copies.
src_directory, dest_directory = './raw_data', './preprocessed_data'

find_unique_images(src_dir=src_directory, dest_dir=dest_directory)

Processing glioma from Testing: 100%|██████████| 300/300 [00:00<00:00, 525.12it/s]
Processing glioma from Training: 100%|██████████| 1321/1321 [00:02<00:00, 623.45it/s]
Processing meningioma from Testing: 100%|██████████| 306/306 [00:00<00:00, 573.21it/s]
Processing meningioma from Training: 100%|██████████| 1339/1339 [00:02<00:00, 491.26it/s]
Processing notumor from Testing: 100%|██████████| 405/405 [00:00<00:00, 726.97it/s]
Processing notumor from Training: 100%|██████████| 1595/1595 [00:02<00:00, 789.06it/s]
Processing pituitary from Testing: 100%|██████████| 300/300 [00:00<00:00, 573.27it/s]
Processing pituitary from Training: 100%|██████████| 1457/1457 [00:02<00:00, 518.57it/s]
