# Manual Image Inspection - Find Mislabeled Data
**Following the archaeologist hint: Inspect artifacts to find wrong labels**

## What to look for:
1. **Wrong numerals** - e.g., "V" image in the "III" folder
2. **Blank/corrupted images**
3. **Images that look completely different** from others in the class
4. **Ambiguous or unclear numerals**

## Instructions:
1. Upload dataset.zip
2. Run cells to generate visual grids
3. Carefully inspect each class
4. Note down filenames of mislabeled images
5. Delete them (the delete cell keeps a backup copy of each removed file) and retrain

In [None]:
from google.colab import files
import zipfile
import os

# Ask for the archive through Colab's upload widget. The loop below opens each
# key of the returned mapping as a path, so the uploaded files are expected to
# be present in the working directory under those names.
print("Upload dataset.zip:")
uploaded = files.upload()

# Unpack every uploaded archive into the current working directory.
for filename in uploaded:
    with zipfile.ZipFile(filename) as zip_ref:
        zip_ref.extractall(path='.')

print("✓ Extracted")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import random
from IPython.display import display, HTML

# Seed Python's global `random` generator with a fixed value so that sampling
# done through `random` in this notebook starts from the same state each run.
random.seed(42)

## Inspect Each Class - Look for Mislabeled Images

In [None]:
def inspect_class_interactive(class_path, class_name, n_samples=100):
    """Display a random sample of one class's images for manual inspection.

    Images are drawn in batches of 30 on grids that are 10 columns wide. Each
    tile is titled with the first 12 characters of its filename so suspicious
    images can be noted down. A file that cannot be opened or rendered is shown
    as an ``ERROR`` tile that still carries its filename, so corrupted files
    can be identified as well.

    Args:
        class_path: Directory holding the images of a single class.
        class_name: Class label; shown upper-cased in the headings.
        n_samples: Maximum number of images to sample and display.
    """
    # os.listdir returns entries in arbitrary order; sorting first makes the
    # seeded random sample reproducible across machines and filesystems.
    images = sorted(f for f in os.listdir(class_path) if not f.startswith('.'))

    print(f"\n{'='*70}")
    print(f"CLASS: {class_name.upper()} ({len(images)} total images)")
    print(f"{'='*70}")
    print("\nLook for images that DON'T match this numeral!\n")

    samples = random.sample(images, min(n_samples, len(images)))

    batch_size = 30
    n_cols = 10
    for batch_start in range(0, len(samples), batch_size):
        batch = samples[batch_start:batch_start + batch_size]
        n_rows = (len(batch) + n_cols - 1) // n_cols

        # squeeze=False always yields a 2-D array of axes, so a single-row
        # batch needs no special-casing before flattening.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 2*n_rows),
                                 squeeze=False)
        axes = axes.flatten()

        for ax, img_name in zip(axes, batch):
            img_path = os.path.join(class_path, img_name)
            try:
                # The context manager releases the file handle once the image
                # has been handed to matplotlib.
                with Image.open(img_path) as img:
                    ax.imshow(img, cmap='gray')
            except Exception:
                # Best-effort display: flag the tile instead of aborting the
                # whole grid. Axes coordinates keep the marker centred
                # regardless of the data limits.
                ax.text(0.5, 0.5, 'ERROR', ha='center', va='center',
                        transform=ax.transAxes)
            # Title every tile (including failed ones) with the short filename.
            ax.set_title(img_name[:12], fontsize=6)
            ax.axis('off')

        # Hide the grid cells that have no image.
        for ax in axes[len(batch):]:
            ax.axis('off')

        plt.suptitle(f'Class {class_name.upper()} - Batch {batch_start//batch_size + 1}',
                     fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

# Discover the class folders (non-hidden sub-directories of the training set)
# and walk through them one by one.
base_path = 'dataset/train'
classes = sorted(
    d for d in os.listdir(base_path)
    if not d.startswith('.') and os.path.isdir(os.path.join(base_path, d))
)

# Banner and instructions, emitted as a single newline-joined block.
print("\n".join((
    "\n" + "="*70,
    "STARTING MANUAL INSPECTION",
    "="*70,
    "\nInstructions:",
    "  1. Look at each grid carefully",
    "  2. Find images that clearly show the WRONG numeral",
    "  3. Note down their filenames (shown in image titles)",
    "  4. We'll delete them in the next step\n",
)))

# Show up to 100 sampled images for every class, in sorted class order.
for class_name in classes:
    class_path = os.path.join(base_path, class_name)
    inspect_class_interactive(class_path, class_name, n_samples=100)

## Side-by-Side Comparison
See all classes together to spot inconsistencies

In [None]:
def compare_all_classes(base_path='dataset/train', n_per_class=25):
    """Show a random sample of every class side by side, one class per row.

    Each row holds up to ``n_per_class`` randomly sampled images of one class
    and is labelled with the upper-cased class name to the left of its first
    tile. Images that cannot be opened or rendered are marked ``ERROR``.

    Args:
        base_path: Directory whose non-hidden sub-directories are the classes.
        n_per_class: Number of columns, i.e. maximum images shown per class.

    Returns:
        None. Prints a notice and returns early when no class folder exists.
    """
    classes = sorted(d for d in os.listdir(base_path)
                     if os.path.isdir(os.path.join(base_path, d)) and not d.startswith('.'))

    # plt.subplots rejects a zero-row grid, so bail out with a clear message.
    if not classes:
        print(f"No class folders found in {base_path!r}")
        return

    # squeeze=False keeps `axes` two-dimensional even with a single class or a
    # single column, so row/column indexing is always valid.
    fig, axes = plt.subplots(len(classes), n_per_class,
                             figsize=(25, 2.5*len(classes)), squeeze=False)

    for row, class_name in zip(axes, classes):
        class_path = os.path.join(base_path, class_name)
        # Sorted listing makes the seeded sample reproducible across machines.
        images = sorted(f for f in os.listdir(class_path) if not f.startswith('.'))
        samples = random.sample(images, min(n_per_class, len(images)))

        for ax, img_name in zip(row, samples):
            img_path = os.path.join(class_path, img_name)
            try:
                # Close the file handle as soon as matplotlib has the image.
                with Image.open(img_path) as img:
                    ax.imshow(img, cmap='gray')
            except Exception:
                # Best-effort display: mark the tile rather than abort the grid.
                ax.text(0.5, 0.5, 'ERROR', ha='center', va='center',
                        transform=ax.transAxes)

        # Switch off axis decorations for used, failed and unused tiles alike.
        for ax in row:
            ax.axis('off')

        # Row label in axes coordinates: placed just left of the first tile and
        # vertically centred, independent of the image's pixel size.
        row[0].text(-0.1, 0.5, class_name.upper(),
                    transform=row[0].transAxes,
                    fontsize=16, fontweight='bold',
                    ha='right', va='center',
                    bbox=dict(boxstyle='round', facecolor='yellow', alpha=0.8))

    plt.suptitle('All Classes - Look for images that don\'t match their row!',
                 fontsize=16, fontweight='bold', y=0.995)
    plt.tight_layout()
    plt.show()

# Section banner, emitted as a single newline-joined block.
print("\n".join((
    "\n" + "="*70,
    "SIDE-BY-SIDE COMPARISON",
    "="*70,
    "Look for images that clearly don't match their row label\n",
)))

# One row per class, 25 sampled images per row.
compare_all_classes(base_path='dataset/train', n_per_class=25)

## Analyze Class Confusion
Check which classes look most similar (likely sources of mislabeling)

In [None]:
# Static cheat-sheet of numeral pairs that are easy to confuse.
print("\n".join((
    "\n" + "="*70,
    "LIKELY MISLABELING PATTERNS",
    "="*70,
    "\nThese numerals might be confused with each other:",
    "\n  I (1 line)    ↔  II (2 lines) - check for extra/missing lines",
    "  II (2 lines)  ↔  III (3 lines) - count lines carefully",
    "  III (3 lines) ↔  VIII (8) - VIII has V + III",
    "  IV (4)        ↔  VI (6) - check orientation of V",
    "  V (5)         ↔  X (10) - X is two V's",
    "  VI (6)        ↔  VII (7) - count I's after V",
    "  VII (7)       ↔  VIII (8) - check if it's V or VIII base",
    "  IX (9)        ↔  XI - but XI not in dataset",
)))

# Follow-up steps for the reader.
print("\n".join((
    "\n" + "="*70,
    "WHAT TO DO NOW:",
    "="*70,
    "\n1. From the grids above, identify filenames of mislabeled images",
    "2. Write them down or take screenshots",
    "3. In the next cell, we'll delete them",
    "\nExample: If you see 'III' in the 'ii' folder, that's mislabeled!",
)))

## Delete Mislabeled Images
**Enter the filenames you identified above**

In [None]:
# List mislabeled images you found
# Format: {"class_folder": ["filename1.png", "filename2.png"]}

mislabeled_images = {
    # Example:
    # "i": ["ac123.png", "ac456.png"],
    # "ii": ["ab789.png"],
    # Add your findings here
}

# Remove the listed images from the training set, copying each one into a
# per-class backup folder first so a wrong deletion can be undone.
import shutil

deleted_count = 0
requested_count = 0  # how many filenames were listed, found or not
backup_dir = 'mislabeled_backup'
os.makedirs(backup_dir, exist_ok=True)

for class_name, img_names in mislabeled_images.items():
    class_path = os.path.join('dataset/train', class_name)
    backup_class = os.path.join(backup_dir, class_name)

    for img_name in img_names:
        requested_count += 1
        img_path = os.path.join(class_path, img_name)

        # Only regular files can be copied and removed; anything else
        # (missing path, directory) is reported and skipped.
        if not os.path.isfile(img_path):
            print(f"Not found: {class_name}/{img_name}")
            continue

        # Backup first, creating the class folder only when it is needed
        os.makedirs(backup_class, exist_ok=True)
        shutil.copy(img_path, backup_class)

        # Delete
        os.remove(img_path)
        deleted_count += 1
        print(f"Deleted: {class_name}/{img_name}")

print(f"\n✓ Deleted {deleted_count} mislabeled images")
if deleted_count:
    # Only claim a backup exists when something was actually backed up.
    print(f"✓ Backups saved to: {backup_dir}/")

if requested_count == 0:
    print("\n⚠️  No images specified! Add mislabeled filenames to the dictionary above.")
elif deleted_count == 0:
    # Files were listed but none matched: point at the likely cause instead of
    # claiming that nothing was specified.
    print("\n⚠️  None of the listed files were found. Check the folder names and filenames.")

## Quick Statistics After Cleaning

In [None]:
def count_images(base_path):
    """Return a mapping of class folder name to its number of visible entries.

    Only non-hidden sub-directories of ``base_path`` are treated as classes,
    and within each class every entry whose name does not start with a dot is
    counted. Keys are inserted in sorted order of the folder names.
    """
    def is_visible(name):
        return not name.startswith('.')

    counts = {}
    for entry in sorted(os.listdir(base_path)):
        folder = os.path.join(base_path, entry)
        if not is_visible(entry) or not os.path.isdir(folder):
            continue
        counts[entry] = sum(1 for f in os.listdir(folder) if is_visible(f))
    return counts

# Section banner, emitted as a single newline-joined block.
print("\n".join((
    "\n" + "="*70,
    "CLEANED DATASET STATISTICS",
    "="*70,
)))

# Per-class image counts for the training set, followed by the grand total.
train_counts = count_images('dataset/train')
for class_name, count in train_counts.items():
    print(f"  {class_name}: {count}")
print(f"  Total: {sum(train_counts.values())}")

print("\n✓ Now proceed to training with the cleaned dataset!")

## Next: Train with Cleaned Data

After removing mislabeled images:
1. Use the cleaned dataset for training
2. Apply minimal augmentation (cleaned data doesn't need much)
3. The overfitting gap should reduce significantly!

The key insight: **Wrong labels cause overfitting** — the model memorizes the label noise in the training data, and that memorized noise does not carry over to the validation data.