# EMNIST Data Preprocessing Pipeline

This notebook demonstrates the complete data preprocessing pipeline for the EMNIST character recognition system.

**Key Operations:**
1. **Normalization**: Scale pixel values from [0, 255] to [0, 1]
2. **Reshaping**: Add channel dimension for CNN input
3. **One-Hot Encoding**: Convert labels to categorical format
4. **Train/Validation Split**: 85/15 split with stratification
5. **Data Augmentation**: Rotation, shifts, and zoom transformations

## 1. Import Libraries and Load Data

In [None]:
import sys
sys.path.append('..')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Import our custom modules
from src.data.dataset import load_emnist
from src.utils.label_mapping import load_label_mapping
from src.preprocessing.preprocessing import (
    normalize_images,
    reshape_images,
    one_hot_encode_labels,
    preprocess_data,
    create_train_val_split,
    create_data_augmentation_generator,
    visualize_augmentation
)

# Plot styling shared by every figure in this notebook
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ Libraries imported successfully")

# Load the raw image/label arrays and the class-index -> character lookup
print("\nLoading EMNIST ByClass dataset...")
x_train, y_train, x_test, y_test = load_emnist()
label_mapping = load_label_mapping()

# Later cells pair x[i] with y[i], so fail fast if the arrays are misaligned
assert len(x_train) == len(y_train), "training images and labels differ in length"
assert len(x_test) == len(y_test), "test images and labels differ in length"

print("✓ Dataset loaded:")
print(f"  Training: {x_train.shape[0]:,} samples")
print(f"  Test: {x_test.shape[0]:,} samples")
print(f"  Image size: {x_train.shape[1]}x{x_train.shape[2]}")
print(f"  Classes: {len(np.unique(y_train))}")

## 2. Normalization

Normalize pixel values from [0, 255] to [0, 1] for better neural network training.

In [None]:
# Normalize the images (raw arrays are kept so before/after can be compared)
x_train_norm = normalize_images(x_train)
x_test_norm = normalize_images(x_test)

print("Before normalization:")
print(f"  Data type: {x_train.dtype}")
print(f"  Value range: [{x_train.min()}, {x_train.max()}]")
print(f"  Mean: {x_train.mean():.2f}")
print(f"  Std: {x_train.std():.2f}")

# Compute the extrema once; they are reported and then checked below
norm_min, norm_max = x_train_norm.min(), x_train_norm.max()

print("\nAfter normalization:")
print(f"  Data type: {x_train_norm.dtype}")
print(f"  Value range: [{norm_min:.4f}, {norm_max:.4f}]")
print(f"  Mean: {x_train_norm.mean():.4f}")
print(f"  Std: {x_train_norm.std():.4f}")

# The plots and the augmentation checks further down assume pixels in [0, 1]
assert norm_min >= 0.0 and norm_max <= 1.0, "normalized pixels fall outside [0, 1]"

# Visualize the effect: same sample, raw scale vs. normalized scale
sample_char = label_mapping[y_train[0]]
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Original
axes[0].imshow(x_train[0], cmap='gray', vmin=0, vmax=255)
axes[0].set_title(f"Original (range: [0, 255])\nCharacter: '{sample_char}'", fontsize=12)
axes[0].axis('off')

# Normalized
axes[1].imshow(x_train_norm[0], cmap='gray', vmin=0, vmax=1)
axes[1].set_title(f"Normalized (range: [0, 1])\nCharacter: '{sample_char}'", fontsize=12)
axes[1].axis('off')

plt.tight_layout()
plt.show()

print("\n✓ Normalization complete")

## 3. Reshaping

Add channel dimension to images for CNN input (n_samples, height, width, channels).

In [None]:
# Reshape to add channel dimension
x_train_reshaped = reshape_images(x_train_norm)
x_test_reshaped = reshape_images(x_test_norm)

print("Before reshaping:")
print(f"  Training shape: {x_train_norm.shape}")
print(f"  Test shape: {x_test_norm.shape}")

print("\nAfter reshaping:")
print(f"  Training shape: {x_train_reshaped.shape}")
print(f"  Test shape: {x_test_reshaped.shape}")

# Reshaping should only append a trailing channel axis; samples and pixels stay put
assert x_train_reshaped.shape[:-1] == x_train_norm.shape, "unexpected training shape after reshape"
assert x_test_reshaped.shape[:-1] == x_test_norm.shape, "unexpected test shape after reshape"

print("\n✓ Reshaping complete - ready for CNN input")

## 4. One-Hot Encoding

Convert integer labels to categorical one-hot encoded vectors for multi-class classification.

In [None]:
# One-hot encode the labels
num_classes = 62  # class count of EMNIST ByClass (digits + upper/lowercase letters)
y_train_encoded = one_hot_encode_labels(y_train, num_classes=num_classes)
y_test_encoded = one_hot_encode_labels(y_test, num_classes=num_classes)

# Use one fixed sample for both the printed summary and the plot
sample_idx = 0
sample_label = y_train[sample_idx]
sample_char = label_mapping[sample_label]
sample_encoded = y_train_encoded[sample_idx]

print("Before one-hot encoding:")
print(f"  Training labels shape: {y_train.shape}")
print(f"  Test labels shape: {y_test.shape}")
print(f"  Sample label: {sample_label} (character '{sample_char}')")

print("\nAfter one-hot encoding:")
print(f"  Training labels shape: {y_train_encoded.shape}")
print(f"  Test labels shape: {y_test_encoded.shape}")
print(f"  Sample encoded label shape: {sample_encoded.shape}")
print(f"  Sum of encoded vector: {sample_encoded.sum()}")

# Visualize one-hot encoding: a single bar at the sample's class index
fig, ax = plt.subplots(figsize=(15, 3))

ax.bar(range(num_classes), sample_encoded, color='steelblue', edgecolor='navy', alpha=0.7)
ax.axvline(sample_label, color='red', linestyle='--', linewidth=2, label=f"Class {sample_label} ('{sample_char}')")
ax.set_xlabel('Class Index', fontsize=12)
ax.set_ylabel('Value', fontsize=12)
ax.set_title(f"One-Hot Encoding Example - Character '{sample_char}'", fontsize=14)
ax.legend()
ax.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.show()

print("\n✓ One-hot encoding complete")

## 5. Train/Validation Split

Split the training data into 85% training and 15% validation with stratification to maintain class balance.

In [None]:
# Create train/validation split (85/15)
x_train_split, x_val, y_train_split, y_val = create_train_val_split(
    x_train_reshaped,
    y_train_encoded,
    val_size=0.15,
    random_state=42,
    stratify=True
)

# The two parts must account for every original training sample
n_total = x_train_reshaped.shape[0]
assert x_train_split.shape[0] + x_val.shape[0] == n_total, "split lost or duplicated samples"

print("Dataset split:")
print(f"  Original training: {n_total:,} samples")
print(f"  New training: {x_train_split.shape[0]:,} samples ({x_train_split.shape[0]/n_total*100:.1f}%)")
print(f"  Validation: {x_val.shape[0]:,} samples ({x_val.shape[0]/n_total*100:.1f}%)")
print(f"  Test: {x_test_reshaped.shape[0]:,} samples")

# Verify class distribution is maintained: recover integer classes from the
# one-hot rows and compare per-class proportions between the two splits
num_classes = y_train_split.shape[1]
y_train_classes = np.argmax(y_train_split, axis=1)
y_val_classes = np.argmax(y_val, axis=1)

train_dist = np.bincount(y_train_classes, minlength=num_classes) / len(y_train_classes)
val_dist = np.bincount(y_val_classes, minlength=num_classes) / len(y_val_classes)

# Plot class distributions, one panel per split
fig, axes = plt.subplots(2, 1, figsize=(18, 8))
panels = [
    (train_dist, 'skyblue', 'Training Set'),
    (val_dist, 'coral', 'Validation Set'),
]
for ax, (dist, color, split_name) in zip(axes, panels):
    ax.bar(range(num_classes), dist, color=color, alpha=0.7, label=split_name)
    ax.set_ylabel('Proportion', fontsize=11)
    ax.set_title(f'Class Distribution - {split_name}', fontsize=13)
    ax.grid(axis='y', alpha=0.3)
    ax.legend()
# Only the bottom panel carries the shared x-axis label
axes[1].set_xlabel('Class Index', fontsize=11)

plt.tight_layout()
plt.show()

# Calculate correlation between distributions; the max per-class gap is a
# stricter companion metric, since correlation alone can hide small classes
correlation = np.corrcoef(train_dist, val_dist)[0, 1]
max_gap = np.abs(train_dist - val_dist).max()
print(f"\nClass distribution correlation: {correlation:.4f}")
print(f"Max per-class proportion gap: {max_gap:.6f}")
if correlation > 0.99:
    print("✓ Train/validation split complete - distributions are balanced")
else:
    print("⚠ Warning: distributions may be imbalanced")

## 6. Data Augmentation

Configure and demonstrate data augmentation with rotation (±15°), shifts (±10%), and zoom (±10%).

In [None]:
# Create data augmentation generator.
# The settings live in one dict so the printed summary below cannot drift
# from what is actually passed to the generator.
augmentation_config = dict(
    rotation_range=15,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    fill_mode='nearest',
)
datagen = create_data_augmentation_generator(**augmentation_config)

print("Data Augmentation Configuration:")
print(f"  Rotation range: ±{augmentation_config['rotation_range']}°")
print(f"  Width shift range: ±{augmentation_config['width_shift_range']:.0%}")
print(f"  Height shift range: ±{augmentation_config['height_shift_range']:.0%}")
print(f"  Zoom range: ±{augmentation_config['zoom_range']:.0%}")
print(f"  Fill mode: {augmentation_config['fill_mode']}")

# Visualize augmentation on sample images
sample_indices = [0, 100, 200]  # arbitrary training samples; classes are not guaranteed to differ
num_augmented = 9

# squeeze=False keeps `axes` two-dimensional even with a single sample row
fig, axes = plt.subplots(
    len(sample_indices),
    num_augmented + 1,
    figsize=(20, 3 * len(sample_indices)),
    squeeze=False,
)
fig.suptitle('Data Augmentation Examples', fontsize=16, y=0.98)

for row_idx, sample_idx in enumerate(sample_indices):
    original_image = x_train_split[sample_idx]
    character = label_mapping[np.argmax(y_train_split[sample_idx])]

    # Show original. vmin/vmax pin every panel to the same [0, 1] gray scale
    # so original and augmented images are directly comparable.
    axes[row_idx, 0].imshow(original_image[:, :, 0], cmap='gray', vmin=0, vmax=1)
    axes[row_idx, 0].set_title(f"Original\n'{character}'", fontsize=11)
    axes[row_idx, 0].axis('off')

    # Generate and show augmented samples
    augmented_samples = visualize_augmentation(original_image, datagen, num_samples=num_augmented)

    for col_idx, aug_image in enumerate(augmented_samples):
        aug_ax = axes[row_idx, col_idx + 1]
        aug_ax.imshow(aug_image[:, :, 0], cmap='gray', vmin=0, vmax=1)
        aug_ax.set_title(f"Aug {col_idx + 1}", fontsize=10)
        aug_ax.axis('off')

plt.tight_layout()
plt.show()

print("\n✓ Data augmentation configured and visualized")

## 7. Validation of Augmented Data

Verify that augmented images maintain valid pixel ranges and shapes.

In [None]:
# Generate a batch of augmented data for validation
sample_batch_size = 100
sample_batch = x_train_split[:sample_batch_size]

# Only one batch is needed, so pull the first item instead of looping
augmented_batch = next(iter(
    datagen.flow(sample_batch, batch_size=sample_batch_size, shuffle=False)
))

# Extrema are reported and then reused for the range check
batch_min, batch_max = augmented_batch.min(), augmented_batch.max()

print("Augmented Data Validation:")
print(f"  Batch size: {augmented_batch.shape[0]}")
print(f"  Image shape: {augmented_batch.shape[1:]}")
print(f"  Pixel value range: [{batch_min:.4f}, {batch_max:.4f}]")
print(f"  Mean: {augmented_batch.mean():.4f}")
print(f"  Std: {augmented_batch.std():.4f}")

# Check for any invalid values
has_nan = np.isnan(augmented_batch).any()
has_inf = np.isinf(augmented_batch).any()
in_valid_range = (batch_min >= 0.0) and (batch_max <= 1.0)

print("\nData Quality Checks:")
print(f"  Contains NaN: {has_nan}")
print(f"  Contains Inf: {has_inf}")
print(f"  Pixels in [0, 1]: {in_valid_range}")

if not has_nan and not has_inf and in_valid_range:
    print("\n✓ All augmented data is valid!")
else:
    print("\n⚠ Warning: Augmented data may have issues")

## 8. Summary and Next Steps

### Preprocessing Pipeline Summary:

1. ✅ **Normalization**: Scaled pixel values from [0, 255] to [0, 1]
2. ✅ **Reshaping**: Added channel dimension (28, 28) → (28, 28, 1)
3. ✅ **One-Hot Encoding**: Converted labels to 62-class categorical vectors
4. ✅ **Train/Val Split**: Created 85/15 split with stratification
5. ✅ **Data Augmentation**: Configured rotation (±15°), shifts (±10%), zoom (±10%)

### Final Dataset Shapes:

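- **Training Set**: 85% of the original training samples (exact count printed by the final cell)
- **Validation Set**: 15% of the original training samples (exact count printed by the final cell)
- **Test Set**: the full EMNIST test split (exact count printed by the final cell)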

### Key Statistics:

- **Pixel Value Range**: [0.0, 1.0] ✓
- **Image Shape**: (28, 28, 1) ✓
- **Label Shape**: (62,) one-hot encoded ✓
- **Class Distribution**: Balanced across train/val splits ✓
- **Augmented Data**: All transformations produce valid images ✓

### Next Phase: Model Development

The preprocessing pipeline is complete. Data is ready for:
- CNN model architecture design
- Model training with data augmentation
- Performance evaluation on validation set

In [None]:
# Display final statistics for the arrays produced by the cells above
banner = "=" * 60
print(banner)
print("PREPROCESSING PIPELINE COMPLETE")
print(banner)

print("\n📊 Final Dataset Shapes:")
print(f"   Training:   {x_train_split.shape[0]:>8,} samples → {x_train_split.shape}")
print(f"   Validation: {x_val.shape[0]:>8,} samples → {x_val.shape}")
print(f"   Test:       {x_test_reshaped.shape[0]:>8,} samples → {x_test_reshaped.shape}")

print("\n📊 Label Shapes:")
print(f"   Training labels:   {y_train_split.shape}")
print(f"   Validation labels: {y_val.shape}")
print(f"   Test labels:       {y_test_encoded.shape}")

print("\n✅ Data Quality:")
print(f"   Pixel range:       [{x_train_split.min():.4f}, {x_train_split.max():.4f}]")
print(f"   Mean pixel value:  {x_train_split.mean():.4f}")
print(f"   No NaN values:     {not np.isnan(x_train_split).any()}")
print(f"   No Inf values:     {not np.isinf(x_train_split).any()}")

# Static recap of the augmentation settings chosen in the augmentation section
print("\n🔄 Augmentation Ready:")
print("   Generator configured: ✓")
print("   Rotation: ±15°")
print("   Shifts: ±10%")
print("   Zoom: ±10%")

print("\n" + banner)
print("Ready for Phase 4: Model Development")
print(banner)