# Roman Numeral Recognition - FINAL IMPROVED SOLUTION
**Aggressive cleaning + Enhanced augmentation + Fixed errors**

## Instructions:
1. Upload to Google Colab
2. Runtime → Change runtime type → GPU (T4)
3. Upload dataset.zip
4. Run all cells
5. Download trained weights

## Improvements:
- More aggressive outlier removal (60% vs 30%)
- Better augmentation (9 types vs 5)
- More training images (300 per class vs 280)
- Fixed filename error for weights saving

In [None]:
# Upload dataset
from google.colab import files
import zipfile
import os

# Ask for the dataset archive and unpack every uploaded zip file into the
# current working directory.
print("Upload your dataset.zip file:")
uploaded = files.upload()

extracted_any = False
for filename in uploaded:
    # A stray non-archive upload is skipped instead of aborting the cell
    # with zipfile.BadZipFile.
    if not zipfile.is_zipfile(filename):
        print(f"Skipping {filename}: not a zip archive")
        continue
    print(f"Extracting {filename}...")
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall('.')
    extracted_any = True

if extracted_any:
    print("\n✓ Dataset extracted!")
else:
    print("\nNo zip archive was uploaded; nothing was extracted.")

In [None]:
# Imports
import shutil
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance, ImageFilter, ImageOps
import random
import tensorflow as tf
from tensorflow import keras

# Pin the three random number generators the notebook relies on
# (NumPy, Python's random module, TensorFlow).
np.random.seed(42)
random.seed(42)
tf.random.set_seed(123)

# Report the TensorFlow version and the GPUs TensorFlow can see.
print(
    "TensorFlow:",
    tf.__version__,
)
print(
    "GPU:",
    tf.config.list_physical_devices('GPU'),
)

## Enhanced Data Cleaning Functions

In [None]:
def count_images(base_path):
    """Count the non-hidden entries inside each class folder.

    Args:
        base_path: Directory whose sub-directories are the classes.

    Returns:
        A dict mapping each non-hidden sub-directory name (in sorted order)
        to the number of entries in it whose name does not start with '.'.
    """
    totals = {}
    for entry in sorted(os.listdir(base_path)):
        # Hidden entries (e.g. '.ipynb_checkpoints') are never classes.
        if entry.startswith('.'):
            continue
        folder = os.path.join(base_path, entry)
        if not os.path.isdir(folder):
            continue
        totals[entry] = sum(
            1 for member in os.listdir(folder) if not member.startswith('.')
        )
    return totals

def get_advanced_features(img_path):
    """Compute seven grayscale intensity statistics for one image.

    Args:
        img_path: Path of the image file to analyse.

    Returns:
        A list ``[mean, std, median, 25th percentile, 75th percentile,
        mean absolute difference between horizontally adjacent pixels,
        max - min]`` computed on the grayscale pixels, or ``None`` when the
        image cannot be read.
    """
    try:
        # The context manager releases the file handle once pixels are loaded.
        with Image.open(img_path) as img:
            # Work in float: on raw uint8 pixels np.diff wraps around
            # (10 - 20 becomes 246), which inflates the edge feature.
            img_array = np.asarray(img.convert('L'), dtype=np.float64)
    except Exception as exc:
        # Deliberately best effort: callers skip images that return None.
        # Unlike a bare ``except:``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        print(f"Skipping unreadable image {img_path}: {exc}")
        return None

    # np.diff works along the last axis, i.e. between neighbouring columns.
    diffs = np.abs(np.diff(img_array))
    # A one-pixel-wide image has no neighbours; report zero edge intensity
    # rather than the NaN that the mean of an empty array would give.
    edge_intensity = np.mean(diffs) if diffs.size else 0.0

    return [
        np.mean(img_array),
        np.std(img_array),
        np.median(img_array),
        np.percentile(img_array, 25),
        np.percentile(img_array, 75),
        edge_intensity,  # edge intensity
        np.max(img_array) - np.min(img_array),  # contrast
    ]

def find_outliers_advanced(base_path, iqr_factor=1.0):
    """Flag per-class outlier images using IQR bounds on intensity features.

    For every non-hidden class folder under ``base_path`` the features from
    ``get_advanced_features`` are collected for each readable image. An image
    is flagged when any of its features lies below ``Q1 - iqr_factor * IQR``
    or above ``Q3 + iqr_factor * IQR`` of that class.

    Args:
        base_path: Directory whose sub-directories are the classes.
        iqr_factor: Multiplier applied to the inter-quartile range. The
            default of 1.0 is tighter than the conventional 1.5.

    Returns:
        A dict mapping class name to the list of flagged image paths. A class
        with no readable images maps to an empty list.
    """
    outliers_info = {}

    for class_name in sorted(os.listdir(base_path)):
        class_path = os.path.join(base_path, class_name)
        if not os.path.isdir(class_path) or class_name.startswith('.'):
            continue

        images = [f for f in os.listdir(class_path) if not f.startswith('.')]
        features = []
        image_paths = []

        for img_name in images:
            img_path = os.path.join(class_path, img_name)
            feat = get_advanced_features(img_path)
            if feat is not None:
                features.append(feat)
                image_paths.append(img_path)

        # Without any readable image the feature matrix would be a 1-D empty
        # array and the axis=1 reduction below would raise.
        if not features:
            outliers_info[class_name] = []
            print(f"Class {class_name}: 0 outliers / {len(images)}")
            continue

        features = np.asarray(features, dtype=float)
        Q1 = np.percentile(features, 25, axis=0)
        Q3 = np.percentile(features, 75, axis=0)
        IQR = Q3 - Q1

        # An image is an outlier if ANY single feature leaves the bounds.
        outlier_mask = np.any(
            (features < (Q1 - iqr_factor * IQR))
            | (features > (Q3 + iqr_factor * IQR)),
            axis=1
        )
        outlier_indices = np.where(outlier_mask)[0]
        outliers_info[class_name] = [image_paths[i] for i in outlier_indices]

        print(f"Class {class_name}: {len(outlier_indices)} outliers / {len(images)}")

    return outliers_info

In [None]:
def create_cleaned_dataset(base_path, outliers_dict, output_path, removal_rate=0.6):
    """Copy a class-per-folder dataset, randomly dropping flagged outliers.

    Every non-hidden file is copied to the matching class folder under
    ``output_path``, except that a file whose path is listed in
    ``outliers_dict`` is skipped with probability ``removal_rate``. The
    removal is decided independently per image, so the fraction actually
    removed only approximates ``removal_rate``.

    Args:
        base_path: Source directory whose sub-directories are the classes.
        outliers_dict: Mapping of class name to flagged image paths. Paths
            are compared against ``os.path.join(base_path, class, file)``.
        output_path: Destination directory, created if missing.
        removal_rate: Probability of dropping each flagged image.

    Returns:
        ``output_path``.
    """
    os.makedirs(output_path, exist_ok=True)
    removed = 0
    kept = 0

    for class_name in sorted(os.listdir(base_path)):
        class_path = os.path.join(base_path, class_name)
        if not os.path.isdir(class_path) or class_name.startswith('.'):
            continue

        output_class_path = os.path.join(output_path, class_name)
        os.makedirs(output_class_path, exist_ok=True)
        outlier_set = set(outliers_dict.get(class_name, []))

        # os.listdir order is arbitrary; sorting ties each random draw to the
        # same file on every machine so a seeded run is reproducible.
        for img_name in sorted(os.listdir(class_path)):
            if img_name.startswith('.'):
                continue
            img_path = os.path.join(class_path, img_name)

            if img_path in outlier_set and random.random() < removal_rate:
                removed += 1
                continue

            shutil.copy(img_path, output_class_path)
            kept += 1

    print(f"\nCleaned: Kept {kept}, Removed {removed}")
    return output_path

## Enhanced Augmentation

In [None]:
def _white_fill(img):
    """Return the fill value that renders as white for ``img``'s mode."""
    # Pillow reads a bare integer on a multi-band image as a packed pixel,
    # so 255 on an RGB image paints red rather than white.
    multi_band_white = {
        'LA': (255, 255),
        'RGB': (255, 255, 255),
        'RGBA': (255, 255, 255, 255),
    }
    # Single-band and unlisted modes keep the plain 255 used previously.
    return multi_band_white.get(img.mode, 255)


def augment_image_improved(img, aug_type):
    """Apply one randomly parameterised augmentation to a PIL image.

    Supported ``aug_type`` values: 'rotate_small' (random angle in
    [-20, 20] degrees), 'rotate_medium' (one of -10, -5, 5, 10 degrees),
    'brightness', 'contrast', 'sharpness', 'blur', 'shift' (up to 4 pixels
    per axis), 'zoom' (scale in [0.9, 1.1], output keeps the input size) and
    'combined' ('rotate_small' followed by 'brightness' or 'contrast').
    Note that, despite the names, 'rotate_small' covers a wider angle range
    than 'rotate_medium'.

    Args:
        img: Source PIL image.
        aug_type: Name of the augmentation to apply.

    Returns:
        The augmented image, or ``img`` itself for an unknown ``aug_type``.
    """
    if aug_type == 'rotate_small':
        return img.rotate(random.randint(-20, 20), fillcolor=_white_fill(img))
    elif aug_type == 'rotate_medium':
        return img.rotate(random.choice([-10, -5, 5, 10]), fillcolor=_white_fill(img))
    elif aug_type == 'brightness':
        return ImageEnhance.Brightness(img).enhance(random.uniform(0.6, 1.4))
    elif aug_type == 'contrast':
        return ImageEnhance.Contrast(img).enhance(random.uniform(0.7, 1.4))
    elif aug_type == 'sharpness':
        return ImageEnhance.Sharpness(img).enhance(random.uniform(0.5, 2.0))
    elif aug_type == 'blur':
        return img.filter(ImageFilter.GaussianBlur(radius=random.uniform(0.3, 2.0)))
    elif aug_type == 'shift':
        shift_x, shift_y = random.randint(-4, 4), random.randint(-4, 4)
        return img.transform(
            img.size,
            Image.AFFINE,
            (1, 0, shift_x, 0, 1, shift_y),
            fillcolor=_white_fill(img),
        )
    elif aug_type == 'zoom':
        scale = random.uniform(0.9, 1.1)
        w, h = img.size
        # Clamp so a very small image can never be resized to zero pixels.
        new_w, new_h = max(1, int(w * scale)), max(1, int(h * scale))
        resized = img.resize((new_w, new_h), Image.LANCZOS)
        if scale > 1:
            # Zoom in: crop the centre back to the original size.
            left, top = (new_w - w) // 2, (new_h - h) // 2
            return resized.crop((left, top, left + w, top + h))
        else:
            # Zoom out: centre the smaller image on a white canvas.
            new_img = Image.new(img.mode, (w, h), _white_fill(img))
            new_img.paste(resized, ((w - new_w) // 2, (h - new_h) // 2))
            return new_img
    elif aug_type == 'combined':
        img = augment_image_improved(img, 'rotate_small')
        img = augment_image_improved(img, random.choice(['brightness', 'contrast']))
        return img
    return img

def balance_and_augment_improved(cleaned_path, output_path, target_per_class=300):
    """Copy each class and top it up with augmented images.

    Every non-hidden file of every class folder under ``cleaned_path`` is
    copied to ``output_path``. A class with fewer than ``target_per_class``
    images then receives augmented copies of randomly chosen originals, saved
    as ``aug_<index>_<original name>``, until it reaches the target. Classes
    already at or above the target are copied unchanged. A class without any
    image is reported and left empty.

    Args:
        cleaned_path: Source directory whose sub-directories are the classes.
        output_path: Destination directory, created if missing.
        target_per_class: Minimum number of images wanted per class.
    """
    os.makedirs(output_path, exist_ok=True)
    aug_types = ['rotate_small', 'rotate_medium', 'brightness', 'contrast',
                 'sharpness', 'blur', 'shift', 'zoom', 'combined']

    for class_name in sorted(os.listdir(cleaned_path)):
        class_path = os.path.join(cleaned_path, class_name)
        if not os.path.isdir(class_path) or class_name.startswith('.'):
            continue

        output_class_path = os.path.join(output_path, class_name)
        os.makedirs(output_class_path, exist_ok=True)
        # Sorted so a seeded random.choice picks the same source images no
        # matter what order the filesystem lists them in.
        images = sorted(f for f in os.listdir(class_path) if not f.startswith('.'))

        # Copy originals
        for img_name in images:
            shutil.copy(os.path.join(class_path, img_name), output_class_path)

        # random.choice raises IndexError on an empty list, so a class with
        # nothing to augment from is skipped explicitly.
        if not images:
            print(f"Class {class_name}: no images found, nothing to augment")
            continue

        # Augment
        needed = max(target_per_class - len(images), 0)
        for i in range(needed):
            img_name = random.choice(images)
            # Saving inside the context keeps the source readable for lazily
            # loaded pixels and closes the file afterwards.
            with Image.open(os.path.join(class_path, img_name)) as img:
                aug_img = augment_image_improved(img, random.choice(aug_types))
                aug_img.save(os.path.join(output_class_path, f"aug_{i}_{img_name}"))

        # Report the real resulting size; a class above the target is not shrunk.
        print(f"Class {class_name}: {len(images)} → {len(images) + needed} (+{needed})")

## Data Preparation Pipeline

In [None]:
def _banner(title):
    """Print a step title framed by two 60-character rules."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)


# Start from empty output folders. The cleaning and augmentation steps only
# add files, so re-running this cell with a different removal_rate or
# target_per_class would otherwise mix files from the earlier run into the
# new training set.
for stale_dir in ('cleaned', 'augmented'):
    if os.path.isdir(stale_dir):
        shutil.rmtree(stale_dir)

_banner("STEP 1: Detect Outliers")
outliers = find_outliers_advanced('dataset/train')

_banner("STEP 2: Clean Dataset (Remove 60% of outliers)")
cleaned = create_cleaned_dataset('dataset/train', outliers, 'cleaned/train', removal_rate=0.6)

_banner("STEP 3: Balance & Augment (Target: 300 per class)")
balance_and_augment_improved(cleaned, 'augmented/train', target_per_class=300)

_banner("STEP 4: Copy Validation Set")
# The validation images are copied as they are: no cleaning, no augmentation.
shutil.copytree('dataset/val', 'augmented/val', dirs_exist_ok=True)

# Create data_original, the folder the training cells read from.
if os.path.exists('data_original'):
    shutil.rmtree('data_original')
shutil.copytree('augmented', 'data_original')

print("\n✓ Data preparation complete!")
print(f"Train: {sum(count_images('data_original/train').values())}")
print(f"Val: {sum(count_images('data_original/val').values())}")

## Model Training

In [None]:
# Load datasets
batch_size = 8
directory = "./data_original"


def _load_split(split):
    """Build the labelled image dataset for one sub-folder of ``directory``."""
    return tf.keras.preprocessing.image_dataset_from_directory(
        directory + "/" + split,
        labels="inferred",
        label_mode="categorical",
        # Explicit class list fixes the label order to the numeral value
        # rather than the alphabetical folder order.
        class_names=["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"],
        shuffle=True,
        seed=123,
        batch_size=batch_size,
        image_size=(32, 32),
    )


# Both splits are loaded with exactly the same options.
train = _load_split("train")
valid = _load_split("val")

for caption, split_dataset in (("Training", train), ("Validation", valid)):
    print(f"{caption} batches: {split_dataset.cardinality().numpy()}")

In [None]:
# Build model
# Backbone: a randomly initialised ResNet50 (weights=None) without its
# classification head, truncated at the "conv2_block3_out" layer.
base_model = tf.keras.applications.ResNet50(
    input_shape=(32, 32, 3),
    include_top=False,
    weights=None,
)
base_model = tf.keras.Model(
    base_model.inputs, outputs=[base_model.get_layer("conv2_block3_out").output]
)

inputs = tf.keras.Input(shape=(32, 32, 3))
x = tf.keras.applications.resnet.preprocess_input(inputs)
x = base_model(x)
# Depending on the Keras version, a model built with a one-element outputs
# list returns either that list or the bare tensor. Unwrap only in the list
# case: indexing a bare tensor with [0] would slice away the batch dimension.
if isinstance(x, (list, tuple)):
    x = x[0]
x = tf.keras.layers.GlobalAveragePooling2D()(x)
# Ten raw logits, one per class; the loss below applies the softmax itself
# (from_logits=True).
x = tf.keras.layers.Dense(10)(x)
model = tf.keras.Model(inputs, x)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Train with early stopping
# Keep on disk only the weights of the epoch with the best validation accuracy.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_model.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Stop once validation accuracy has not improved for 10 consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=10,
    restore_best_weights=True,
    verbose=1
)

print("\n" + "="*60)
print("TRAINING STARTED")
print("="*60 + "\n")

history = model.fit(
    train,
    validation_data=valid,
    epochs=40,  # Increased from 30
    callbacks=[checkpoint, early_stop],
)

# Load best weights so the evaluation and the saved submission both use the
# checkpointed epoch.
model.load_weights("best_model.weights.h5")

# Final evaluation
loss, acc = model.evaluate(valid)

print("\n" + "="*60)
print(f"FINAL VALIDATION ACCURACY: {acc*100:.2f}%")
print("="*60)

if acc >= 0.93:
    print("\n🎉 BONUS! Achieved ≥93% accuracy!")
    print("Expected score: 100 points")
elif acc >= 0.90:
    # Linear interpolation: 70 points at 90% accuracy up to 100 points at 93%.
    score = 70 + ((acc - 0.90) / 0.03) * 30
    print("\n✓ SUCCESS! Achieved ≥90% accuracy!")
    print(f"Expected score: ~{score:.0f} points")
else:
    print(f"\n⚠ Need {(0.90-acc)*100:.2f}% more to reach 90%")
    print("Try: increase removal_rate to 0.7, or target_per_class to 350")

# Save weights (FIXED: proper filename)
model.save_weights("submission.weights.h5")
print("\nWeights saved to: submission.weights.h5 and best_model.weights.h5")

In [None]:
# Plot training history
def _draw_panel(position, metric, title, y_label, targets=()):
    """Draw the train/validation curves of one metric in a 1x2 subplot grid.

    ``targets`` is a sequence of ``(level, colour, caption)`` tuples, each
    drawn as a dashed horizontal reference line.
    """
    plt.subplot(1, 2, position)
    plt.plot(history.history[metric], label='Train', linewidth=2)
    plt.plot(history.history['val_' + metric], label='Validation', linewidth=2)
    for level, colour, caption in targets:
        plt.axhline(y=level, color=colour, linestyle='--', label=caption, alpha=0.7)
    plt.title(title, fontsize=14, fontweight='bold')
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend()
    plt.grid(True, alpha=0.3)


plt.figure(figsize=(14, 5))

# Left panel: accuracy with the 90% and 93% reference lines.
_draw_panel(
    1,
    'accuracy',
    'Model Accuracy',
    'Accuracy',
    targets=((0.90, 'r', '90% Target'), (0.93, 'g', '93% Bonus')),
)

# Right panel: loss.
_draw_panel(2, 'loss', 'Model Loss', 'Loss')

plt.tight_layout()
plt.show()

best_val_pct = max(history.history['val_accuracy']) * 100
last_train_pct = history.history['accuracy'][-1] * 100
print(f"\nBest validation accuracy: {best_val_pct:.2f}%")
print(f"Final training accuracy: {last_train_pct:.2f}%")

## Download Weights

In [None]:
# Download trained weights
from google.colab import files

# Send both weight files written by the training cell to the browser.
print("Downloading weights...")
for weights_file in ('best_model.weights.h5', 'submission.weights.h5'):
    files.download(weights_file)
print("\n✓ Download complete!")
print("Submit 'best_model.weights.h5' for evaluation")

## If Accuracy < 90%: Try These Adjustments

1. **More aggressive cleaning:** Change `removal_rate=0.6` to `0.7` or `0.8`
2. **More augmentation:** Change `target_per_class=300` to `350` or `400`
3. **More epochs:** Change `epochs=40` to `50` or `60`
4. **Manual inspection:** Review the image paths flagged by `find_outliers_advanced` (the `outliers` dictionary from Step 1) and manually remove the bad images

Then re-run the notebook from the Data Preparation Pipeline cell onward, so the data is regenerated and the model is retrained with the adjusted parameters.