# Medical Image Classification - Exploratory Data Analysis

This notebook performs exploratory data analysis on the Chest X-Ray Images (Pneumonia) dataset.

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import cv2
from tqdm import tqdm

# Plot appearance: seaborn's white-grid theme and a larger default figure
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

# Hide all warnings for the rest of the session to keep cell output readable
import warnings
warnings.filterwarnings(action='ignore')

## 1. Dataset Overview

In [None]:
# Dataset locations: one root folder with a sub-folder per split
data_dir = '../data'
train_dir, test_dir = (
    os.path.join(data_dir, split_name) for split_name in ('train', 'test')
)

# Count images
def count_images(directory):
    """Count the image files in each class sub-directory of ``directory``.

    Every immediate sub-directory is treated as one class. A file is counted
    when its name ends in ``.png``, ``.jpg`` or ``.jpeg``; the comparison
    ignores case, so names such as ``scan.JPEG`` are included.

    Args:
        directory: Path to a dataset split holding one folder per class.

    Returns:
        dict mapping class (folder) name to its number of image files,
        with keys inserted in sorted order.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    counts = {}
    # os.listdir returns entries in arbitrary order; sorting keeps anything
    # built from this dict (tables, bar order) stable between runs.
    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        if not os.path.isdir(class_path):
            continue
        counts[class_name] = sum(
            1
            for filename in os.listdir(class_path)
            if filename.lower().endswith(image_extensions)
        )
    return counts

train_counts = count_images(train_dir)
test_counts = count_images(test_dir)

# Print one header per split followed by an indented line per class
for header, split_counts in (("Training Set:", train_counts),
                             ("\nTest Set:", test_counts)):
    print(header)
    for class_name, count in split_counts.items():
        print(f"  {class_name}: {count}")

## 2. Class Distribution

In [None]:
# Visualize class distribution
# Colours are looked up by class name so each class keeps the same colour in
# both panels, whatever order the count dicts happen to be in.
class_colors = {'NORMAL': 'green', 'PNEUMONIA': 'red'}

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

for ax, (split_title, split_counts) in zip(
        axes, [('Training Set', train_counts), ('Test Set', test_counts)]):
    class_names = sorted(split_counts)
    bar_colors = [class_colors.get(name, 'gray') for name in class_names]
    ax.bar(
        class_names,
        [split_counts[name] for name in class_names],
        # Fall back to matplotlib's default colour when there are no bars
        color=bar_colors or None,
    )
    ax.set_title(f'{split_title} Class Distribution', fontsize=14, fontweight='bold')
    ax.set_ylabel('Number of Images')
    ax.set_xlabel('Class')

plt.tight_layout()
plt.show()

# Calculate class imbalance ratio
if 'PNEUMONIA' in train_counts and 'NORMAL' in train_counts:
    if train_counts['NORMAL'] > 0:
        imbalance_ratio = train_counts['PNEUMONIA'] / train_counts['NORMAL']
        print(f"\nClass Imbalance Ratio (PNEUMONIA/NORMAL): {imbalance_ratio:.2f}")
    else:
        # An empty NORMAL folder would otherwise raise ZeroDivisionError
        print("\nClass Imbalance Ratio (PNEUMONIA/NORMAL): undefined (no NORMAL images)")

## 3. Image Properties Analysis

In [None]:
def analyze_images(directory, num_samples=100):
    """Collect size statistics for a sample of images under ``directory``.

    Up to ``num_samples // 2`` image files are inspected in each class
    sub-directory (the halving presumably reflects a two-class dataset).
    Files are taken in sorted name order so repeated runs sample the same
    images.

    Args:
        directory: Path to a dataset split holding one folder per class.
        num_samples: Sample budget; half of it is applied per class folder.

    Returns:
        dict of four parallel lists with one entry per inspected image:
        ``'widths'`` and ``'heights'`` (pixels), ``'aspect_ratios'``
        (width / height) and ``'file_sizes'`` (KB).
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    per_class = num_samples // 2
    properties = {
        'widths': [],
        'heights': [],
        'aspect_ratios': [],
        'file_sizes': []
    }

    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        if not os.path.isdir(class_path):
            continue

        # Filter by extension *before* slicing so stray non-image files do
        # not use up part of the per-class quota.
        image_names = [
            name for name in sorted(os.listdir(class_path))
            if name.lower().endswith(image_extensions)
        ]

        for filename in image_names[:per_class]:
            img_path = os.path.join(class_path, filename)
            # The context manager releases the file handle as soon as the
            # size has been read instead of leaving it to garbage collection.
            with Image.open(img_path) as img:
                width, height = img.size

            properties['widths'].append(width)
            properties['heights'].append(height)
            properties['aspect_ratios'].append(width / height)
            properties['file_sizes'].append(os.path.getsize(img_path) / 1024)  # KB

    return properties

print("Analyzing image properties...")
props = analyze_images(train_dir, num_samples=200)

# (label, key in props, float format, unit suffix) for each reported statistic
report_rows = [
    ("Width", 'widths', ".0f", " px"),
    ("Height", 'heights', ".0f", " px"),
    ("Aspect Ratio", 'aspect_ratios', ".2f", ""),
    ("File Size", 'file_sizes', ".1f", " KB"),
]

print("\nImage Dimensions:")
for label, key, spec, unit in report_rows:
    values = props[key]
    print(f"  {label}: {np.mean(values):{spec}} ± {np.std(values):{spec}}{unit}")

## 4. Sample Images Visualization

In [None]:
def display_samples(directory, num_samples=8):
    """Show a grid of example images with one row per class.

    Each class row shows up to ``num_samples // 2`` images, taken in sorted
    file-name order. The first image of each row is titled with the class
    name. Cells without an image are left blank.

    Args:
        directory: Path to a dataset split holding one folder per class.
        num_samples: Sample budget; half of it is the number of grid columns.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    n_cols = num_samples // 2
    classes = sorted(
        d for d in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, d))
    )

    # plt.subplots rejects a grid with zero rows or columns, so report the
    # empty case instead of failing inside matplotlib.
    if not classes or n_cols < 1:
        print("No sample images to display.")
        return

    # squeeze=False keeps `axes` two-dimensional even with a single row or
    # column, so axes[i, j] indexing below is always valid.
    fig, axes = plt.subplots(len(classes), n_cols, figsize=(16, 8), squeeze=False)

    for i, class_name in enumerate(classes):
        class_path = os.path.join(directory, class_name)
        images = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(image_extensions)
        )

        for j in range(n_cols):
            ax = axes[i, j]
            if j < len(images):
                img_path = os.path.join(class_path, images[j])
                # Close the file once matplotlib has taken the pixel data
                with Image.open(img_path) as img:
                    ax.imshow(img, cmap='gray')
                if j == 0:
                    ax.set_title(class_name, fontsize=12, fontweight='bold')
            # Hide ticks and frame on every cell, including empty ones
            ax.axis('off')

    plt.tight_layout()
    plt.show()

# Render the sample grid for the training split
print("Sample Images:")
display_samples(directory=train_dir)

## 5. Pixel Intensity Analysis

In [None]:
def analyze_pixel_intensity(directory, num_samples=50):
    """Pool grayscale pixel values from a sample of images in each class.

    Up to ``num_samples`` image files per class folder are read as grayscale,
    in sorted file-name order. Files that OpenCV cannot decode are skipped.

    Args:
        directory: Path to a dataset split holding one folder per class.
        num_samples: Maximum number of image files read per class.

    Returns:
        dict mapping class (folder) name to a flat list of pixel
        intensities gathered from all sampled images of that class.
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    intensity_data = {}

    for class_name in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, class_name)
        if not os.path.isdir(class_path):
            continue

        image_names = sorted(
            f for f in os.listdir(class_path)
            if f.lower().endswith(image_extensions)
        )

        intensities = []
        for filename in image_names[:num_samples]:
            img_path = os.path.join(class_path, filename)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            # imread returns None rather than raising for unreadable files
            if img is None:
                continue
            # tolist() converts the whole array to plain Python ints in one
            # pass; extending with the array itself would iterate it in
            # Python and create a separate NumPy scalar for every pixel.
            intensities.extend(img.ravel().tolist())

        intensity_data[class_name] = intensities

    return intensity_data

print("Analyzing pixel intensities...")
intensities = analyze_pixel_intensity(train_dir)

# Overlay one normalised histogram per class on a single set of axes
intensity_fig, intensity_ax = plt.subplots(figsize=(12, 6))
for class_name, data in intensities.items():
    intensity_ax.hist(data, bins=50, alpha=0.6, label=class_name, density=True)

intensity_ax.set_xlabel('Pixel Intensity', fontsize=12)
intensity_ax.set_ylabel('Density', fontsize=12)
intensity_ax.set_title('Pixel Intensity Distribution by Class', fontsize=14, fontweight='bold')
intensity_ax.legend()
intensity_ax.grid(True, alpha=0.3)
plt.show()

# Per-class mean and standard deviation of the pooled pixel values
print("\nPixel Intensity Statistics:")
for class_name, data in intensities.items():
    mean_value, std_value = np.mean(data), np.std(data)
    print(f"  {class_name}: Mean={mean_value:.1f}, Std={std_value:.1f}")

## 6. Summary Statistics

In [None]:
# Create summary dataframe: one row per (split, class) with its share of the split
summary_columns = ['Split', 'Class', 'Count', 'Percentage']
summary_data = []
for split, counts in [('Train', train_counts), ('Test', test_counts)]:
    # The split total is the same for every row, so compute it once
    split_total = sum(counts.values())
    for class_name, count in counts.items():
        # A split whose class folders are all empty has no meaningful share;
        # report "n/a" instead of dividing by zero.
        percentage = f"{count / split_total * 100:.1f}%" if split_total else "n/a"
        summary_data.append({
            'Split': split,
            'Class': class_name,
            'Count': count,
            'Percentage': percentage
        })

# Passing the columns explicitly keeps the header even when there are no rows
summary_df = pd.DataFrame(summary_data, columns=summary_columns)
print("\nDataset Summary:")
print(summary_df.to_string(index=False))

# NOTE: the observations below are fixed text, not derived from the numbers
# computed above; re-check them if the dataset changes.
print("\n" + "="*80)
print("Key Observations:")
print("="*80)
print("1. The dataset shows class imbalance with more pneumonia cases")
print("2. Images vary in size and require standardization for model input")
print("3. Pixel intensity distributions differ between classes")
print("4. Data augmentation will be crucial for improving generalization")
print("="*80)