# EMERGENCY FIX - Overfitting Issue
**New Strategy: Less aggressive cleaning + Strong regularization + Model improvements**

## Problem:
- Training: 98%+
- Validation: 53-68%
- Gap: 30-45 percentage points

## Root Cause:
Overly aggressive cleaning removed good data, leaving noisy samples that the model memorizes instead of learning features that generalize.

## New Approach:
1. **Minimal cleaning** (remove only extreme outliers)
2. **Moderate augmentation** (avoid unrealistic samples)
3. **Strong regularization** (dropout, L2 weight decay)
4. **Longer training** with early stopping

In [None]:
# Upload dataset
from google.colab import files
import zipfile
import os

print("Upload dataset.zip:")
uploaded = files.upload()

# Every key of the upload result is opened as a zip archive and unpacked into
# the current working directory.
for filename in uploaded:
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall('.')

# The check mark had been mis-decoded (UTF-8 read as cp1252) and printed as
# garbage; it is restored to the intended glyph here.
print("✓ Extracted")

In [None]:
import shutil
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image, ImageEnhance, ImageFilter
import random
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow's, NumPy's and Python's random number generators so that
# augmentation choices and training runs are repeatable.
tf.random.set_seed(123)
np.random.seed(42)
random.seed(42)

# Record the TensorFlow build and the GPUs it can see in the notebook output.
print(f"TF: {tf.__version__}")
print(f"GPU: {tf.config.list_physical_devices('GPU')}")

## MINIMAL CLEANING - Remove only EXTREME outliers

In [None]:
def count_images(base_path):
    """Count the visible entries inside each class folder of *base_path*.

    Args:
        base_path: Directory whose immediate sub-directories are class folders.

    Returns:
        Dict mapping each class-folder name (in sorted order) to the number of
        entries it contains. Names starting with '.' are ignored at both
        levels, and top-level entries that are not directories are skipped.
    """
    def _visible(name):
        return not name.startswith('.')

    totals = {}
    for label in sorted(filter(_visible, os.listdir(base_path))):
        folder = os.path.join(base_path, label)
        if not os.path.isdir(folder):
            continue
        totals[label] = sum(1 for entry in os.listdir(folder) if _visible(entry))
    return totals

def get_image_stats(img_path):
    """Return ``[mean, std]`` of an image's grayscale pixel values.

    Args:
        img_path: Path of the image file to read.

    Returns:
        A two-element list ``[mean, std]`` computed over the image after
        conversion to mode ``'L'``, or ``None`` when the file cannot be
        opened or decoded.
    """
    try:
        # The context manager releases the file handle as soon as the pixels
        # have been copied into the array.
        with Image.open(img_path) as img:
            arr = np.array(img.convert('L'))
    except Exception:
        # Deliberately best-effort: an unreadable file is reported as None so
        # the caller can skip it. Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return None
    return [np.mean(arr), np.std(arr)]

def find_extreme_outliers_only(base_path, percentile=95):
    """Flag, per class, the images whose brightness statistics are most atypical.

    Each readable image is described by the ``[mean, std]`` pair returned by
    ``get_image_stats``. Within a class, an image is flagged when its Euclidean
    distance from the per-class median of those pairs is strictly greater than
    the given percentile of all distances in that class.

    Args:
        base_path: Directory whose immediate sub-directories are class folders.
        percentile: Distance percentile (0-100) used as the cut-off. The
            default of 95 flags roughly the top 5% of each class.

    Returns:
        Dict mapping each class-folder name to the list of flagged image
        paths. A class with no readable images maps to an empty list.
    """
    outliers_info = {}

    for class_name in sorted(os.listdir(base_path)):
        class_path = os.path.join(base_path, class_name)
        if not os.path.isdir(class_path) or class_name.startswith('.'):
            continue

        features = []
        paths = []
        for img_name in os.listdir(class_path):
            if img_name.startswith('.'):
                continue
            img_path = os.path.join(class_path, img_name)
            feat = get_image_stats(img_path)
            if feat is not None:
                features.append(feat)
                paths.append(img_path)

        if not features:
            # Without this guard the array below would be 1-D and the
            # axis=1 reduction would raise for an empty/unreadable class.
            outliers_info[class_name] = []
            print(f"Class {class_name}: no readable images, nothing flagged")
            continue

        features = np.array(features)
        median = np.median(features, axis=0)

        # Euclidean distance of every image from the class median.
        distances = np.sqrt(np.sum((features - median) ** 2, axis=1))

        # Strict '>' means images tied with the threshold are kept.
        threshold = np.percentile(distances, percentile)
        extreme_indices = np.where(distances > threshold)[0]

        outliers_info[class_name] = [paths[i] for i in extreme_indices]
        print(
            f"Class {class_name}: {len(extreme_indices)} extreme outliers "
            f"(top {100 - percentile:g}%)"
        )

    return outliers_info

# Scan the raw training split; the result maps class name -> flagged paths.
print("Finding EXTREME outliers only...")
outliers = find_extreme_outliers_only(base_path='dataset/train')

In [None]:
def minimal_clean(base_path, outliers_dict, output_path):
    """Copy a class-folder dataset to *output_path*, leaving out flagged files.

    Args:
        base_path: Source directory whose sub-directories are class folders.
        outliers_dict: Maps class name to a collection of file paths to drop.
            Paths are compared as strings against
            ``os.path.join(base_path, class_name, file_name)``.
        output_path: Destination directory; created if missing, with one
            sub-directory per class.

    Returns:
        *output_path*, unchanged.
    """
    os.makedirs(output_path, exist_ok=True)
    n_copied = 0
    n_skipped = 0

    for label in sorted(os.listdir(base_path)):
        src_dir = os.path.join(base_path, label)
        if label.startswith('.') or not os.path.isdir(src_dir):
            continue

        dst_dir = os.path.join(output_path, label)
        os.makedirs(dst_dir, exist_ok=True)
        flagged = frozenset(outliers_dict.get(label, []))

        candidates = (
            os.path.join(src_dir, entry)
            for entry in os.listdir(src_dir)
            if not entry.startswith('.')
        )
        for src_file in candidates:
            # Every flagged path is dropped; everything else is copied as-is.
            if src_file in flagged:
                n_skipped += 1
            else:
                shutil.copy(src_file, dst_dir)
                n_copied += 1

    print(f"\nMinimal cleaning: Kept {n_copied}, Removed {n_skipped}")
    return output_path

# Write the filtered training split to cleaned/train; `cleaned` holds that path.
cleaned = minimal_clean(
    base_path='dataset/train',
    outliers_dict=outliers,
    output_path='cleaned/train',
)

## MODERATE AUGMENTATION - Realistic transforms only

In [None]:
# White fill value per image mode. An integer fill on a multi-band image is
# interpreted by Pillow as a packed colour (255 -> red on RGB), so multi-band
# modes need an explicit per-band tuple to get white.
_WHITE_FILL_BY_MODE = {
    'RGB': (255, 255, 255),
    'RGBA': (255, 255, 255, 255),
    'LA': (255, 255),
}


def _white_fill(img):
    """Return the fill value that paints white for *img*'s mode (255 if unknown)."""
    return _WHITE_FILL_BY_MODE.get(img.mode, 255)


def moderate_augment(img, aug_type):
    """Apply one small, randomly parameterised transform to a PIL image.

    Args:
        img: Source PIL image; it is not modified.
        aug_type: One of ``'rotate'`` (-12..12 degrees), ``'brightness'``
            (factor 0.8..1.2), ``'contrast'`` (factor 0.85..1.15) or
            ``'shift'`` (affine translation of up to 2 px per axis).

    Returns:
        A new transformed image, or *img* itself when *aug_type* is not
        recognised. Pixels exposed by rotation or shifting are filled with
        white.
    """
    if aug_type == 'rotate':
        return img.rotate(random.randint(-12, 12), fillcolor=_white_fill(img))
    elif aug_type == 'brightness':
        return ImageEnhance.Brightness(img).enhance(random.uniform(0.8, 1.2))
    elif aug_type == 'contrast':
        return ImageEnhance.Contrast(img).enhance(random.uniform(0.85, 1.15))
    elif aug_type == 'shift':
        shift_x, shift_y = random.randint(-2, 2), random.randint(-2, 2)
        return img.transform(
            img.size,
            Image.AFFINE,
            (1, 0, shift_x, 0, 1, shift_y),
            fillcolor=_white_fill(img),
        )
    return img

def balance_moderate(cleaned_path, output_path, target=260):
    """Copy each class folder and top it up to *target* images by augmentation.

    All visible files of every class are copied to ``output_path/<class>``.
    If a class has fewer than *target* files, the shortfall is filled with
    augmented copies of randomly chosen originals, saved as
    ``aug_<i>_<original name>``. Classes already at or above *target* are
    only copied.

    Args:
        cleaned_path: Source directory whose sub-directories are class folders.
        output_path: Destination directory; created if missing.
        target: Minimum number of images each class should end up with.

    Returns:
        None. Results are written to disk and a summary line is printed per
        class.
    """
    os.makedirs(output_path, exist_ok=True)
    aug_types = ['rotate', 'brightness', 'contrast', 'shift']

    for class_name in sorted(os.listdir(cleaned_path)):
        class_path = os.path.join(cleaned_path, class_name)
        if not os.path.isdir(class_path) or class_name.startswith('.'):
            continue

        output_class_path = os.path.join(output_path, class_name)
        os.makedirs(output_class_path, exist_ok=True)
        images = [f for f in os.listdir(class_path) if not f.startswith('.')]

        # Copy originals
        for img_name in images:
            shutil.copy(os.path.join(class_path, img_name), output_class_path)

        if not images:
            # Nothing to augment from; random.choice([]) would raise.
            print(f"Class {class_name}: no source images, skipping augmentation")
            continue

        # Moderate augmentation: only as many extras as are actually missing.
        needed = max(0, target - len(images))
        for i in range(needed):
            img_name = random.choice(images)
            # Close each source file once its augmented copy has been written.
            with Image.open(os.path.join(class_path, img_name)) as img:
                aug_img = moderate_augment(img, random.choice(aug_types))
                aug_img.save(os.path.join(output_class_path, f"aug_{i}_{img_name}"))

        # Report the real final count (a class may already exceed the target).
        print(f"Class {class_name}: {len(images)} → {len(images) + needed} (+{needed})")

# Build augmented/train from the cleaned split, topping classes up to 260 images.
print("\nBalancing with MODERATE augmentation...")
balance_moderate(cleaned_path=cleaned, output_path='augmented/train', target=260)

In [None]:
# Copy val and create data_original
# The validation split is copied as-is next to the augmented training split.
shutil.copytree('dataset/val', 'augmented/val', dirs_exist_ok=True)

# Rebuild data_original from scratch so nothing from an earlier run lingers.
if os.path.exists('data_original'):
    shutil.rmtree('data_original')
shutil.copytree('augmented', 'data_original')

# The check mark had been mis-decoded (UTF-8 read as cp1252); restored here.
print("\n✓ Data ready")
print(f"Train: {sum(count_images('data_original/train').values())}")
print(f"Val: {sum(count_images('data_original/val').values())}")

## IMPROVED MODEL with Strong Regularization

In [None]:
# Load the train and validation splits as batched image datasets.
batch_size = 8

# Both splits are read with exactly the same options; only the directory differs.
_split_options = dict(
    labels="inferred",
    label_mode="categorical",
    class_names=["i", "ii", "iii", "iv", "v", "vi", "vii", "viii", "ix", "x"],
    shuffle=True,
    seed=123,
    batch_size=batch_size,
    image_size=(32, 32),
)

train = tf.keras.preprocessing.image_dataset_from_directory(
    "data_original/train", **_split_options
)
valid = tf.keras.preprocessing.image_dataset_from_directory(
    "data_original/val", **_split_options
)

n_train_batches = train.cardinality().numpy()
n_valid_batches = valid.cardinality().numpy()
print(f"Batches - Train: {n_train_batches}, Val: {n_valid_batches}")

In [None]:
# Model with STRONG REGULARIZATION
# Backbone: a randomly initialised ResNet50, truncated at "conv2_block3_out".
base_model = tf.keras.applications.ResNet50(
    input_shape=(32, 32, 3),
    include_top=False,
    weights=None,
)
base_model = tf.keras.Model(
    base_model.inputs, outputs=[base_model.get_layer("conv2_block3_out").output]
)

inputs = tf.keras.Input(shape=(32, 32, 3))
x = tf.keras.applications.resnet.preprocess_input(inputs)
x = base_model(x)
# The backbone was built with a one-element output list. Depending on the
# Keras version the call may hand back that list or the bare tensor; only
# index when it really is a sequence, otherwise [0] would slice the batch axis.
if isinstance(x, (list, tuple)):
    x = x[0]
x = tf.keras.layers.GlobalAveragePooling2D()(x)

# ADD STRONG REGULARIZATION
x = tf.keras.layers.Dropout(0.5)(x)  # Drop 50% of the pooled features during training
x = tf.keras.layers.Dense(128, kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)  # L2 penalty on the kernel
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Activation('relu')(x)
x = tf.keras.layers.Dropout(0.4)(x)  # Second dropout before the classifier
x = tf.keras.layers.Dense(10)(x)  # Output logits, one per class (no softmax)

model = tf.keras.Model(inputs, x)

# Use learning rate schedule: multiply the rate by 0.9 every 1000 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=0.001,
    decay_steps=1000,
    decay_rate=0.9
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=lr_schedule),
    # from_logits=True matches the final Dense layer, which has no activation.
    loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

print("\nModel with strong regularization:")
model.summary()

In [None]:
# Train with early stopping
# Keep only the weights of the epoch with the best validation accuracy.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_model.weights.h5",
    monitor="val_accuracy",
    mode="max",
    save_best_only=True,
    save_weights_only=True,
    verbose=1
)

# Stop after 15 epochs without improvement and roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_accuracy",
    patience=15,
    restore_best_weights=True,
    verbose=1
)

# NOTE: ReduceLROnPlateau is intentionally not used. The optimizer is compiled
# with an ExponentialDecay schedule, and Keras refuses to overwrite the
# learning rate of a schedule-driven optimizer, so the callback would raise a
# TypeError the first time validation accuracy plateaued. The schedule alone
# handles learning-rate decay.

print("\n" + "="*60)
print("TRAINING with Regularization")
print("="*60)

history = model.fit(
    train,
    validation_data=valid,
    epochs=60,
    callbacks=[checkpoint, early_stop],
    verbose=1
)

model.load_weights("best_model.weights.h5")
loss, acc = model.evaluate(valid)

# The evaluated weights come from the best validation epoch, so compare
# against the training accuracy logged for that same epoch rather than the
# last epoch run (which can be up to `patience` epochs later).
# ModelCheckpoint saves on strict improvement, i.e. the first maximum.
best_epoch = int(np.argmax(history.history['val_accuracy']))
train_acc = history.history['accuracy'][best_epoch]

print("\n" + "="*60)
print(f"FINAL VALIDATION ACCURACY: {acc*100:.2f}%")
print(f"TRAINING ACCURACY (best epoch {best_epoch + 1}): {train_acc*100:.2f}%")
print(f"OVERFITTING GAP: {(train_acc - acc)*100:.2f}%")
print("="*60)

# Status glyphs had been mis-decoded (UTF-8 read as cp1252); restored here.
if acc >= 0.93:
    print("\n🎉 BONUS! ≥93%")
elif acc >= 0.90:
    print("\n✓ SUCCESS! ≥90%")
else:
    print(f"\n⚠ Need {(0.90-acc)*100:.2f}% more")

model.save_weights("submission.weights.h5")
print("\nSaved: submission.weights.h5")

In [None]:
# Plot
plt.figure(figsize=(14, 5))

# Left panel: train vs. validation accuracy, with the 90% goal as a dashed line.
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Train', linewidth=2)
plt.plot(history.history['val_accuracy'], label='Val', linewidth=2)
plt.axhline(y=0.90, color='r', linestyle='--', alpha=0.7)
plt.title('Accuracy - Overfitting Check', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.grid(True, alpha=0.3)

# Right panel: train vs. validation loss.
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Train', linewidth=2)
plt.plot(history.history['val_loss'], label='Val', linewidth=2)
plt.title('Loss', fontsize=14, fontweight='bold')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend()
plt.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Show gap over time (per-epoch train accuracy minus validation accuracy)
gap = [train_acc - val_acc for train_acc, val_acc in zip(history.history['accuracy'], history.history['val_accuracy'])]
plt.figure(figsize=(10, 4))
plt.plot(gap, linewidth=2, color='red')
plt.title('Overfitting Gap (Train - Val)', fontsize=14, fontweight='bold')
plt.ylabel('Accuracy Gap')
plt.xlabel('Epoch')
plt.axhline(y=0.1, color='orange', linestyle='--', label='10% gap', alpha=0.7)
plt.axhline(y=0.2, color='red', linestyle='--', label='20% gap', alpha=0.7)
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# gap[-1] is the gap at the last epoch that was run.
# Status glyphs had been mis-decoded (UTF-8 read as cp1252); restored here.
print(f"\nFinal gap: {gap[-1]*100:.2f}%")
if gap[-1] < 0.15:
    print("✓ Good generalization!")
else:
    print("⚠ Still overfitting - consider more regularization")

In [None]:
# Download
from google.colab import files
# Hand both weight files to the Colab download helper.
files.download('best_model.weights.h5')
files.download('submission.weights.h5')
# The check mark had been mis-decoded (UTF-8 read as cp1252); restored here.
print("✓ Downloaded")