In [11]:
import os
import numpy as np
from PIL import Image
import re
import shutil

In [12]:
def check_image_corruption(image_path):
    """Report whether the file at ``image_path`` fails Pillow's integrity check.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        ``True`` if opening or verifying the file raised an error (the file is
        treated as corrupted or unreadable), ``False`` otherwise.
    """
    try:
        with Image.open(image_path) as img:
            # NOTE(review): per the Pillow docs, verify() inspects the file
            # structure without decoding pixel data -- confirm whether a full
            # decode is also wanted here.
            img.verify()
    except Exception:
        # Deliberately broad: any failure to open/verify counts as corruption.
        # `Exception` (not a bare `except`) keeps KeyboardInterrupt and
        # SystemExit propagating so a long scan can still be interrupted.
        return True
    return False

def check_naming_convention(filename, folder_type, tumor_type):
    """Check that ``filename`` looks like ``<Tr-|Te-><xx>_<4 digits>.jpg``.

    Args:
        filename: Bare file name (no directory component).
        folder_type: ``"Training"`` selects the ``Tr-`` prefix; any other
            value selects ``Te-``.
        tumor_type: Category name; its first two characters form the tag
            that follows the prefix.

    Returns:
        ``True`` when the whole file name matches the convention, else ``False``.
    """
    prefix = "Tr-" if folder_type == "Training" else "Te-"
    # Escape the interpolated pieces so they are matched literally, and use
    # fullmatch so nothing (not even a trailing newline, which `$` tolerates)
    # may follow the ".jpg" extension.
    pattern = rf"{re.escape(prefix)}{re.escape(tumor_type[0:2])}_\d{{4}}\.jpg"
    return re.fullmatch(pattern, filename) is not None

def get_correct_name(filename, folder_type, tumor_type):
    """Build the conventional file name for an image.

    The result has the form ``<Tr-|Te-><xx>_<NNNN>.jpg`` where ``xx`` is the
    first two characters of ``tumor_type`` and ``NNNN`` is the four-digit
    number found immediately before the file's ``.jpg`` extension.

    Args:
        filename: Current bare file name.
        folder_type: ``"Training"`` selects the ``Tr-`` prefix; any other
            value selects ``Te-``.
        tumor_type: Category name (e.g. ``"glioma"``, ``"notumor"``).

    Returns:
        The conventional file name. When no four-digit number precedes the
        extension, ``"0000"`` is used as the number.
    """
    prefix = "Tr-" if folder_type == "Training" else "Te-"
    # "notumor"[0:2] is already "no", so a single rule covers every category.
    correct_prefix = f"{prefix}{tumor_type[0:2]}_"

    # Match the extension case-insensitively: callers accept ".JPG"/".Jpg"
    # files, and their number must not degrade to the "0000" fallback.
    number_match = re.search(r'(\d{4})\.jpg$', filename, flags=re.IGNORECASE)
    number = number_match.group(1) if number_match else "0000"

    return f"{correct_prefix}{number}.jpg"

def assess_data_quality(base_folder):
    """Rename misnamed ``.jpg`` files under ``base_folder`` to the naming convention.

    Walks ``<base_folder>/<Training|Testing>/<tumor type>/`` for the four
    known tumor types. Every ``.jpg`` file (extension compared
    case-insensitively) that fails ``check_naming_convention`` is moved, within
    its folder, to the name produced by ``get_correct_name``.

    A rename is skipped when the target name already belongs to a different
    file: ``shutil.move`` would otherwise replace that file and its content
    would be lost. Skipped files are reported so they can be fixed by hand.

    Args:
        base_folder: Root directory of the dataset.

    Returns:
        None. Totals (and any skipped collisions) are printed.
    """
    main_folders = ['Training', 'Testing']
    tumor_types = ['glioma', 'meningioma', 'notumor', 'pituitary']
    misnamed_images = []
    renamed_images = []
    skipped_images = []

    for main_folder in main_folders:
        for tumor_type in tumor_types:
            folder_path = os.path.join(base_folder, main_folder, tumor_type)
            if not os.path.isdir(folder_path):
                continue

            # Sorted so the outcome of collisions does not depend on the
            # platform-specific order of os.listdir().
            for file in sorted(os.listdir(folder_path)):
                if not file.lower().endswith('.jpg'):
                    continue
                if check_naming_convention(file, main_folder, tumor_type):
                    continue

                image_path = os.path.join(folder_path, file)
                misnamed_images.append(image_path)

                correct_name = get_correct_name(file, main_folder, tumor_type)
                new_path = os.path.join(folder_path, correct_name)

                # The target may exist either because it is another image
                # (must not be overwritten) or because it is this very file
                # under a case-insensitive filesystem (safe to rename).
                if os.path.exists(new_path) and not os.path.samefile(image_path, new_path):
                    skipped_images.append((image_path, new_path))
                    continue

                shutil.move(image_path, new_path)
                renamed_images.append((image_path, new_path))

    print(f"Total misnamed images: {len(misnamed_images)}")
    print(f"Total renamed images: {len(renamed_images)}")
    if skipped_images:
        print(f"Total skipped (target name already in use): {len(skipped_images)}")
        for source_path, target_path in skipped_images:
            print(f"\t{source_path} -> {target_path}")

if __name__ == "__main__":
    # Dataset root. Set the BRAIN_TUMOR_DATASET_DIR environment variable to
    # run on another machine; the original author's local path is kept as the
    # default so existing runs behave exactly as before.
    base_folder = os.environ.get(
        "BRAIN_TUMOR_DATASET_DIR",
        "C:/Users/KhanhChang/PycharmProjects/BrainScan-TL-MRI-Tumor-Classifier/brain-tumor-mri-dataset",
    )
    assess_data_quality(base_folder)

Total misnamed images: 0
Total renamed images: 0


In [14]:

def check_image_quality(image_path, min_size=128, min_ptp=50, max_ptp=250):
    """Run basic quality checks on a single image file.

    Checks are applied in this order and the first failure is reported:
    grayscale mode (``'L'``), minimum width/height, and peak-to-peak (max
    minus min) pixel value within ``[min_ptp, max_ptp]``.

    Args:
        image_path: Path of the image file to check.
        min_size: Smallest accepted width and height, in pixels.
        min_ptp: Smallest accepted peak-to-peak pixel value.
        max_ptp: Largest accepted peak-to-peak pixel value.

    Returns:
        A ``(is_valid, message)`` tuple. ``is_valid`` is ``False`` with an
        explanatory message when a check fails or when any exception is
        raised while opening/reading the image.
    """
    try:
        with Image.open(image_path) as image:
            # Convert up front, before the cheap checks: a file whose pixel
            # data cannot be read is then reported as an error rather than
            # under one of the quality reasons.
            image_array = np.array(image)

            # Check Grayscale Mode
            if image.mode != 'L':
                return False, "Not a grayscale image"

            # Check Image Resolution
            width, height = image.size
            if width < min_size or height < min_size:
                return False, "Unexpected image size"

            # Check Peak-to-peak (PTP); computed once and converted to a
            # plain int so comparison with arbitrary thresholds is safe.
            peak_to_peak = int(np.ptp(image_array))
            if peak_to_peak < min_ptp or peak_to_peak > max_ptp:
                return False, "PTP is outside the expected range"

        return True, "Image passed quality checks"
    except Exception as e:
        return False, f"Error opening image: {str(e)}"
    
def process_dataset(data_dir):
    """Quality-check every file in the category sub-folders of ``data_dir``.

    Args:
        data_dir: Directory expected to contain one sub-folder per category
            (``glioma``, ``meningioma``, ``notumor``, ``pituitary``). A
            missing sub-folder produces a warning and is skipped.

    Returns:
        A ``(results, summary)`` tuple. ``results`` holds one dict per regular
        file (``file_path``, ``category``, ``is_valid``, ``message``).
        ``summary`` maps each category, plus the aggregate key ``'total'``, to
        its ``total``/``valid``/``invalid`` counts and an ``error_reasons``
        histogram keyed by failure message.
    """
    def _empty_counts():
        return {'total': 0, 'valid': 0, 'invalid': 0, 'error_reasons': {}}

    categories = ['glioma', 'meningioma', 'notumor', 'pituitary']
    summary = {name: _empty_counts() for name in categories}
    summary['total'] = _empty_counts()
    overall = summary['total']
    results = []

    for category in categories:
        category_dir = os.path.join(data_dir, category)
        if not os.path.isdir(category_dir):
            print(f"Warning: {category} does not exist in {data_dir}")
            continue

        per_category = summary[category]
        for entry in os.listdir(category_dir):
            file_path = os.path.join(category_dir, entry)
            # Sub-directories and other non-regular entries are not counted.
            if not os.path.isfile(file_path):
                continue

            if entry.lower().endswith(('.png', '.jpg', '.jpeg')):
                is_valid, message = check_image_quality(file_path)
            else:
                is_valid, message = False, "Unsupported file format"

            # Apply the same bookkeeping to the category and to the aggregate.
            outcome = 'valid' if is_valid else 'invalid'
            for counts in (per_category, overall):
                counts['total'] += 1
                counts[outcome] += 1
                if outcome == 'invalid':
                    reasons = counts['error_reasons']
                    reasons[message] = reasons.get(message, 0) + 1

            results.append({
                'file_path': file_path,
                'category': category,
                'is_valid': is_valid,
                'message': message,
            })

    return results, summary

def display_summary(summary, dataset_name):
    """Print a validity report for one dataset summary.

    Args:
        summary: Mapping of category name to a dict with ``total``, ``valid``,
            ``invalid`` and ``error_reasons``; the key ``'total'`` holds the
            aggregate over all categories.
        dataset_name: Label used in the report headings.

    Returns:
        None. The report goes to standard output: one section per category
        (in mapping order), then the overall totals, then the aggregate error
        reasons ordered from most to least frequent.
    """
    divider = "=" * 40

    print(f"\n{dataset_name} Dataset Summary:")
    print(divider)

    # Per-category sections; the aggregate entry is reported separately below.
    for category, stats in summary.items():
        if category == 'total':
            continue
        total, valid, invalid = stats['total'], stats['valid'], stats['invalid']

        print(f"\n{category.capitalize()}:")
        print(f"\tTotal files: {total}")
        print(f"\tValid files: {valid}")
        print(f"\tInvalid files: {invalid}")
        if total <= 0:
            # Avoid dividing by zero for empty categories.
            print("\tValidity percentage: 0%")
        else:
            print(f"\tValidity percentage: {valid/total*100:.2f}%")

        category_reasons = stats['error_reasons']
        if category_reasons:
            print("\tError Reasons:")
            for reason, count in category_reasons.items():
                print(f"\t\t{reason}: {count}")

    overall = summary['total']
    total, valid, invalid = overall['total'], overall['valid'], overall['invalid']

    print(f"\n{dataset_name} Overall Summary:")
    print(divider)
    print(f"Total files across all categories: {total}")
    print(f"Total valid files: {valid}")
    print(f"Total invalid files: {invalid}")
    if total <= 0:
        print(f"Overall Validity percentage: 0%")
    else:
        print(f"Overall Validity percentage: {valid/total*100:.2f}%")

    print("\nTotal Errors Reasons Across All Categories:")
    by_frequency = sorted(overall['error_reasons'].items(), key=lambda pair: pair[1], reverse=True)
    for reason, count in by_frequency:
        print(f"\t{reason}: {count}")

def combine_summaries(summary1, summary2):
    """Add two dataset summaries together, category by category.

    Args:
        summary1: Mapping of category name to a dict with ``total``,
            ``valid``, ``invalid`` and ``error_reasons``.
        summary2: Second summary with the same layout.

    Returns:
        A new summary whose counts and error-reason tallies are the sums of
        the inputs. Categories keep the order of ``summary1``, followed by any
        that appear only in ``summary2``; error reasons follow the same rule,
        so the report order is stable from run to run. A category missing from
        one input is treated as all zeros. The inputs are not modified.
    """
    count_keys = ('total', 'valid', 'invalid')
    # Stand-in for a category that one of the inputs lacks (never mutated).
    missing = {'total': 0, 'valid': 0, 'invalid': 0, 'error_reasons': {}}

    ordered_categories = list(summary1) + [cat for cat in summary2 if cat not in summary1]

    combined = {}
    for cat in ordered_categories:
        first = summary1.get(cat, missing)
        second = summary2.get(cat, missing)

        merged = {key: first.get(key, 0) + second.get(key, 0) for key in count_keys}

        # Accumulate in insertion order instead of through a set union, whose
        # iteration order for strings varies between interpreter runs.
        reasons = {}
        for source in (first, second):
            for reason, count in source.get('error_reasons', {}).items():
                reasons[reason] = reasons.get(reason, 0) + count
        merged['error_reasons'] = reasons

        combined[cat] = merged
    return combined

if __name__ == "__main__":
    # Dataset root. Set the BRAIN_TUMOR_DATASET_DIR environment variable to
    # run on another machine; the original author's local path is kept as the
    # default so existing runs behave exactly as before.
    base_directory = os.environ.get(
        "BRAIN_TUMOR_DATASET_DIR",
        "C:/Users/KhanhChang/PycharmProjects/BrainScan-TL-MRI-Tumor-Classifier/brain-tumor-mri-dataset",
    )
    training_directory = os.path.join(base_directory, "Training")
    testing_directory = os.path.join(base_directory, "Testing")

    # Per-file results are discarded here; only the aggregated summaries are shown.
    print("Processing Training dataset:")
    _, training_summary = process_dataset(training_directory)
    display_summary(training_summary, "Training")

    print("\nProcessing Testing dataset:")
    _, testing_summary = process_dataset(testing_directory)
    display_summary(testing_summary, "Testing")

    # Sum of both splits, reported with the same layout.
    print("\nOverall Dataset Summary (Training + Testing):")
    overall_summary = combine_summaries(training_summary, testing_summary)
    display_summary(overall_summary, "Overall")

Processing Training dataset:

Training Dataset Summary:

Glioma:
	Total files: 1321
	Valid files: 58
	Invalid files: 1263
	Validity percentage: 4.39%
	Error Reasons:
		Not a grayscale image: 121
		PTP is outside the expected range: 1142

Meningioma:
	Total files: 1339
	Valid files: 5
	Invalid files: 1334
	Validity percentage: 0.37%
	Error Reasons:
		Not a grayscale image: 791
		PTP is outside the expected range: 543

Notumor:
	Total files: 1595
	Valid files: 4
	Invalid files: 1591
	Validity percentage: 0.25%
	Error Reasons:
		Not a grayscale image: 1568
		PTP is outside the expected range: 23

Pituitary:
	Total files: 1457
	Valid files: 23
	Invalid files: 1434
	Validity percentage: 1.58%
	Error Reasons:
		Not a grayscale image: 760
		PTP is outside the expected range: 674

Training Overall Summary:
Total files across all categories: 5712
Total valid files: 90
Total invalid files: 5622
Overall Validity percentage: 1.58%

Total Errors Reasons Across All Categories:
	Not a grayscale image