# EMNIST Dataset Exploration

This notebook explores the EMNIST ByClass dataset used for handwritten character recognition.

**Dataset Information:**
- **EMNIST ByClass**: 62 classes (digits 0-9, uppercase A-Z, lowercase a-z)
- **Training samples**: 697,932 images
- **Test samples**: 116,323 images
- **Image size**: 28x28 grayscale

## 1. Import Libraries

In [None]:
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import our custom modules
from src.data.dataset import load_emnist
from src.utils.label_mapping import load_label_mapping

# Set visualization style
# Order matters: the Matplotlib style sheet is applied first, then seaborn's
# palette call runs afterwards so the "husl" colours are the ones in effect.
# NOTE(review): the 'seaborn-v0_8-*' style names only exist in Matplotlib
# releases that ship them — confirm the environment's Matplotlib version.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Simple marker that the import/config cell ran to completion.
print("✓ Libraries imported successfully")

## 2. Load Dataset

Load the EMNIST ByClass dataset using our custom loader.

In [None]:
# Fetch the train/test image and label arrays from the project loader
x_train, y_train, x_test, y_test = load_emnist()

# Lookup that is indexed by class id elsewhere in the notebook to get a character
label_mapping = load_label_mapping()

# Array shapes, one line per array
print(f"Training set shape: {x_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test set shape: {x_test.shape}")
print(f"Test labels shape: {y_test.shape}")

# Headline figures: distinct labels, per-image dimensions, raw pixel range
n_classes = len(np.unique(y_train))
print(f"\nNumber of classes: {n_classes}")

img_rows, img_cols = x_train.shape[1], x_train.shape[2]
print(f"Image dimensions: {img_rows}x{img_cols}")

pixel_min, pixel_max = x_train.min(), x_train.max()
print(f"Pixel value range: [{pixel_min}, {pixel_max}]")

## 3. Visualize Sample Images

Display sample images from different character classes to understand the data.

In [None]:
# Show the first training example of every class in a row-major grid.
N_CLASSES = 62
GRID_COLS = 8
GRID_ROWS = -(-N_CLASSES // GRID_COLS)  # ceiling division -> 8 rows for 62 classes

fig, axes = plt.subplots(GRID_ROWS, GRID_COLS, figsize=(16, 16))
fig.suptitle('EMNIST ByClass - Sample Images (62 Classes)', fontsize=16, y=0.995)

# Switch every panel off up front: panels past the last class, and panels for
# classes that turn out to be absent, then simply stay blank.
for ax in axes.flat:
    ax.axis('off')

# axes.flat walks the grid row by row, so panel k sits at (k // 8, k % 8).
for class_idx, ax in zip(range(N_CLASSES), axes.flat):
    matches = np.flatnonzero(y_train == class_idx)
    if matches.size == 0:
        # Class not present in this split: leave the panel empty instead of
        # failing with an IndexError on the "first occurrence" lookup.
        continue

    first_idx = matches[0]
    character = label_mapping[class_idx]

    ax.imshow(x_train[first_idx], cmap='gray')
    ax.set_title(f"'{character}' (idx:{class_idx})", fontsize=10)

plt.tight_layout()
plt.show()

## 4. Visualize Multiple Samples Per Class

Show variations within each class (different handwriting styles).

In [None]:
# Show several random examples for a handful of classes, one class per row.
selected_classes = [5, 10, 20, 36, 50, 61]  # '5', 'A', 'K', 'a', 'o', 'z'
samples_per_class = 10

# Seeded generator created inside the cell so re-running it (or the whole
# notebook) always draws the same examples.
rng = np.random.default_rng(42)

# squeeze=False keeps `axes` 2-D even if only one class or one sample is requested.
fig, axes = plt.subplots(
    len(selected_classes), samples_per_class, figsize=(15, 10), squeeze=False
)
fig.suptitle('Handwriting Variations Across Different Classes', fontsize=14)

for i, class_idx in enumerate(selected_classes):
    # Indices of every training image carrying this label
    class_indices = np.flatnonzero(y_train == class_idx)

    # Never ask for more samples than exist; sampling without replacement
    # would otherwise raise for a sparsely populated class.
    n_draw = min(samples_per_class, class_indices.size)
    if n_draw:
        sample_indices = rng.choice(class_indices, n_draw, replace=False)
    else:
        sample_indices = class_indices  # empty: nothing to draw for this row

    character = label_mapping[class_idx]

    for j, ax in enumerate(axes[i]):
        if j < len(sample_indices):
            ax.imshow(x_train[sample_indices[j]], cmap='gray')

        if j == 0:
            # axis('off') would also suppress the y-label, so for the labelled
            # column only the ticks and frame are hidden.
            ax.set_xticks([])
            ax.set_yticks([])
            for spine in ax.spines.values():
                spine.set_visible(False)
            ax.set_ylabel(f"'{character}'", fontsize=12, rotation=0, ha='right', va='center')
        else:
            ax.axis('off')

plt.tight_layout()
plt.show()

## 5. Class Distribution Analysis

Analyze how many samples exist for each character class.

In [None]:
# Imported at the top of the cell (rather than mid-cell) so the dependency is
# visible before any plotting code runs.
from matplotlib.patches import Patch

# Count samples per class; np.unique returns the labels in sorted order
unique_labels, counts = np.unique(y_train, return_counts=True)

# Character shown under each bar
character_labels = [label_mapping[label] for label in unique_labels]

# Label ranges: 0-9 digits, 10-35 uppercase, 36 and above lowercase
DIGIT_COLOR, UPPER_COLOR, LOWER_COLOR = 'salmon', 'lightgreen', 'lightblue'
bar_colors = [
    DIGIT_COLOR if label < 10 else UPPER_COLOR if label < 36 else LOWER_COLOR
    for label in unique_labels
]

# Face colours are passed to bar() directly. Recolouring afterwards with
# set_color() would overwrite the edge colour as well and lose the navy outline.
positions = range(len(counts))
fig, ax = plt.subplots(figsize=(20, 6))
ax.bar(positions, counts, color=bar_colors, edgecolor='navy', alpha=0.7)

ax.set_xlabel('Character Class', fontsize=12)
ax.set_ylabel('Number of Samples', fontsize=12)
ax.set_title('EMNIST ByClass - Sample Distribution', fontsize=14, pad=20)
ax.set_xticks(positions)
ax.set_xticklabels(character_labels, rotation=45, ha='right')
ax.grid(axis='y', alpha=0.3)

# Legend explaining the three colour groups
legend_elements = [
    Patch(facecolor=DIGIT_COLOR, label='Digits (0-9)'),
    Patch(facecolor=UPPER_COLOR, label='Uppercase (A-Z)'),
    Patch(facecolor=LOWER_COLOR, label='Lowercase (a-z)'),
]
ax.legend(handles=legend_elements, loc='upper right')

plt.tight_layout()
plt.show()

# Summary statistics; the rarest and most common classes are looked up once
rarest_label = unique_labels[counts.argmin()]
commonest_label = unique_labels[counts.argmax()]

print("Class Distribution Statistics:")
print(f"  Mean samples per class: {counts.mean():.0f}")
print(f"  Std deviation: {counts.std():.0f}")
print(f"  Min samples: {counts.min():,} (class '{label_mapping[rarest_label]}')")
print(f"  Max samples: {counts.max():,} (class '{label_mapping[commonest_label]}')")
print(f"  Imbalance ratio: {counts.max() / counts.min():.2f}x")

## 6. Pixel Intensity Distribution

Analyze the distribution of pixel values across the dataset.

In [None]:
# Work on a random subset so the histograms are quick to compute.
# The subset size is capped at the dataset size because sampling without
# replacement raises if more items are requested than exist.
sample_size = min(10000, len(x_train))

# Seeded generator local to this cell: the same subset (and therefore the same
# printed statistics) is produced on every run.
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(x_train), sample_size, replace=False)
sample_images = x_train[sample_indices]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: histogram over every pixel of every sampled image
axes[0].hist(sample_images.ravel(), bins=50, color='steelblue', edgecolor='black', alpha=0.7)
axes[0].set_xlabel('Pixel Value', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Distribution of Pixel Values', fontsize=14)
axes[0].grid(axis='y', alpha=0.3)

# Right panel: one value per image, the mean over its two spatial axes
avg_intensities = sample_images.mean(axis=(1, 2))
axes[1].hist(avg_intensities, bins=50, color='coral', edgecolor='darkred', alpha=0.7)
axes[1].set_xlabel('Average Pixel Intensity per Image', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_title('Distribution of Average Image Intensities', fontsize=14)
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

print(f"Pixel Value Statistics (from {sample_size:,} sampled images):")
print(f"  Global mean: {sample_images.mean():.2f}")
print(f"  Global std: {sample_images.std():.2f}")
print(f"  Min value: {sample_images.min()}")
print(f"  Max value: {sample_images.max()}")

## 7. Data Quality Check

Check for potential issues like completely black/white images or outliers.

In [None]:
# Threshold (in pixel-value units) below which an image counts as "low variance".
# NOTE(review): this and the == 255 test assume raw 0-255 pixel values — confirm
# the loader does not rescale the images.
LOW_STD_THRESHOLD = 5
MAX_EXAMPLES = 5

# Per-image standard deviation over the two spatial axes. It is computed once and
# reused below: the reduction touches every pixel of the training set, so
# repeating it for the count and again for the index lookup is wasteful.
per_image_std = x_train.std(axis=(1, 2))
low_var_mask = per_image_std < LOW_STD_THRESHOLD

# An image is all black if its brightest pixel is 0, all white if its darkest is 255
black_images = np.sum(x_train.max(axis=(1, 2)) == 0)
white_images = np.sum(x_train.min(axis=(1, 2)) == 255)
low_variance = np.sum(low_var_mask)

print("Data Quality Check:")
print(f"  Total training images: {len(x_train):,}")
print(f"  Completely black images: {black_images:,}")
print(f"  Completely white images: {white_images:,}")
print(f"  Low variance images (std < {LOW_STD_THRESHOLD}): {low_variance:,} ({100*low_variance/len(x_train):.2f}%)")

# Check label coverage against the expected 62 class ids
missing_labels = set(range(62)) - set(np.unique(y_train))
if missing_labels:
    print(f"  ⚠ Missing labels: {missing_labels}")
else:
    print("  ✓ All 62 classes are represented")

# Show up to MAX_EXAMPLES of the flagged images
if low_variance > 0:
    low_var_indices = np.flatnonzero(low_var_mask)[:MAX_EXAMPLES]

    # squeeze=False always yields a 2-D axes array, so the single-image case
    # needs no special handling.
    fig, axes = plt.subplots(1, len(low_var_indices), figsize=(15, 3), squeeze=False)
    fig.suptitle('Examples of Low Variance Images', fontsize=12)

    for ax, idx in zip(axes[0], low_var_indices):
        ax.imshow(x_train[idx], cmap='gray')
        # The value shown is a standard deviation, so the title says "Std"
        ax.set_title(f"Std: {per_image_std[idx]:.2f}")
        ax.axis('off')

    plt.tight_layout()
    plt.show()

## 8. Key Findings and Observations

### Summary:

1. **Dataset Size**: 
   - Training: 697,932 images
   - Test: 116,323 images
   - Total: 814,255 images

2. **Image Properties**:
   - Dimensions: 28x28 pixels (grayscale)
   - Pixel values: 0-255 (uint8)
   - Generally good contrast between the character strokes and the background

3. **Class Distribution**:
   - 62 classes total (10 digits + 26 uppercase + 26 lowercase)
   - Imbalanced distribution (some classes have 20x more samples than others)
   - May need to consider class weights during training

4. **Data Quality**:
   - Very few completely black/white images
   - Most images have sufficient variance (visible characters)
   - All 62 classes are represented

5. **Recommendations for Model Training**:
   - **Normalization**: Scale pixel values to [0, 1] range
   - **Class Imbalance**: Consider using class weights or data augmentation
   - **Data Augmentation**: Apply rotation, shift, zoom to increase robustness
   - **Validation Split**: Use 15% of training data for validation
   - **Target Accuracy**: Aim for ≥85% given the dataset size and quality