In [1]:
import os
import numpy as np
from PIL import Image
from scipy.ndimage import rotate, shift, zoom
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.base import clone

In [2]:
import kagglehub

# Fetch the latest published version of the Reduced MNIST dataset through
# kagglehub; the call returns the local directory the files were placed in.
dataset_handle = "mohamedgamal07/reduced-mnist"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/mohamedgamal07/reduced-mnist?dataset_version_number=1...


100%|██████████| 9.62M/9.62M [00:00<00:00, 90.0MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/mohamedgamal07/reduced-mnist/versions/1


In [3]:
# --------------------------
# Data Loading Functions
# --------------------------
def _load_image_folder(root_dir):
    """Read every image below ``root_dir`` into arrays of pixels and labels.

    Each sub-directory of ``root_dir`` is treated as one class; its name is
    converted with ``int()`` to obtain the label. Entries of ``root_dir`` that
    are not directories are ignored.

    Returns:
        (images, labels): ``images`` holds one grayscale float32 array per file,
        scaled to [0, 1]; ``labels`` holds the matching integer labels.

    Raises:
        ValueError: if a sub-directory name cannot be parsed as an integer.
    """
    images, labels = [], []
    for label in sorted(os.listdir(root_dir)):
        label_dir = os.path.join(root_dir, label)
        if not os.path.isdir(label_dir):
            continue
        # os.listdir returns names in arbitrary order; sorting keeps the sample
        # order (and any seeded sampling built on top of it) reproducible.
        for img_name in sorted(os.listdir(label_dir)):
            img_path = os.path.join(label_dir, img_name)
            with Image.open(img_path) as img:
                # 'L' = 8-bit grayscale; dividing by 255 maps pixels to [0, 1].
                img_array = np.array(img.convert('L'), dtype=np.float32) / 255.0
            images.append(img_array)
            labels.append(int(label))
    return np.array(images), np.array(labels)


def load_data(train_dir, test_dir):
    """Load MNIST-style image folders for the training and the test split.

    Args:
        train_dir: directory with one integer-named sub-directory per class.
        test_dir: directory with the same layout, holding the test images.

    Returns:
        Tuple ``(train_images, train_labels, test_images, test_labels)`` of
        NumPy arrays; pixel values are float32 in [0, 1].
    """
    train_images, train_labels = _load_image_folder(train_dir)
    test_images, test_labels = _load_image_folder(test_dir)
    return train_images, train_labels, test_images, test_labels

In [4]:
# --------------------------
# Data Preparation Functions
# --------------------------
def create_subset(images, labels, n_per_class=40):
    """Draw a class-balanced random subset of the data.

    For each digit class 0-9, ``n_per_class`` samples are drawn without
    replacement using NumPy's global random state.

    Args:
        images: array of samples, indexed along axis 0.
        labels: 1-D array of integer class labels aligned with ``images``.
        n_per_class: number of samples to draw from every class.

    Returns:
        Tuple ``(subset_images, subset_labels, subset_indices)`` where
        ``subset_indices`` are the positions in ``images``/``labels`` of exactly
        the returned samples, so callers can remove them from the remaining pool.

    Raises:
        ValueError: (from ``np.random.choice``) if a class has fewer than
        ``n_per_class`` samples.
    """
    selected_indices = np.concatenate([
        np.random.choice(np.where(labels == class_label)[0], n_per_class, replace=False)
        for class_label in range(10)
    ])
    # The returned indices must be the sampled ones (not merely the first
    # n_per_class of each class); otherwise the caller deletes the wrong rows.
    return images[selected_indices], labels[selected_indices], selected_indices


In [5]:
# --------------------------
# Augmentation Functions
# --------------------------
def random_rotation(image):
    """Rotate ``image`` by an angle drawn uniformly from [-5, 5) degrees.

    The output keeps the input shape (``reshape=False``); pixels that fall
    outside the source are filled by repeating the nearest edge value.
    """
    theta = np.random.uniform(-5, 5)
    rotated = rotate(image, theta, reshape=False, mode='nearest')
    return rotated

def random_translation(image):
    """Shift ``image`` by a random integer offset of -2..2 pixels per axis.

    Areas uncovered by the shift are filled with 0.0.
    """
    offsets = np.random.randint(-2, 3, size=2)
    col_offset, row_offset = offsets
    # scipy expects the offset per axis, i.e. (rows, columns).
    return shift(image, (row_offset, col_offset), mode='constant', cval=0.0)

def random_noise(image):
    """Add zero-mean Gaussian noise and clip the result to [0, 1].

    The noise standard deviation is itself drawn uniformly from [0.02, 0.08).
    """
    noise_std = np.random.uniform(0.02, 0.08)
    noisy = image + np.random.normal(0, noise_std, image.shape)
    return np.clip(noisy, 0, 1)

def random_scaling(image):
    """Zoom a 2-D ``image`` by a random factor in [0.9, 1.1) and restore its size.

    After zooming, an enlarged image is centre-cropped and a shrunken image is
    zero-padded so that the result always has the same shape as the input
    (previously the target size was hard-coded to 28x28).

    Args:
        image: 2-D array.

    Returns:
        2-D array with ``image.shape``.
    """
    target_h, target_w = image.shape
    scale = np.random.uniform(0.9, 1.1)
    scaled = zoom(image, scale, mode='nearest')
    h, w = scaled.shape
    if scale >= 1:
        # Zoomed output is at least as large as the input: crop around the centre.
        top, left = (h - target_h) // 2, (w - target_w) // 2
        return scaled[top:top + target_h, left:left + target_w]
    # Zoomed output is at most as large as the input: pad back with zeros,
    # putting any odd leftover pixel on the bottom/right side.
    pad_top = (target_h - h) // 2
    pad_left = (target_w - w) // 2
    return np.pad(
        scaled,
        ((pad_top, target_h - h - pad_top), (pad_left, target_w - w - pad_left)),
        mode='constant',
    )

def augment_data(images, labels, n_augment=5):
    """Build an augmented data set from ``images``/``labels``.

    For every input image the output contains the untouched original followed
    by ``n_augment`` randomly perturbed copies. Rotation, translation and
    scaling are each applied with probability 0.5 (in that order); Gaussian
    noise is always applied last.

    Returns:
        (augmented_images, augmented_labels) as NumPy arrays, each with
        ``len(images) * (1 + n_augment)`` entries.
    """
    out_images, out_labels = [], []
    for original, label in zip(images, labels):
        # Keep the clean sample next to its perturbed variants.
        out_images.append(original)
        out_labels.append(label)
        for _ in range(n_augment):
            candidate = original.copy()
            for transform in (random_rotation, random_translation, random_scaling):
                if np.random.rand() > 0.5:
                    candidate = transform(candidate)
            out_images.append(random_noise(candidate))
            out_labels.append(label)
    return np.array(out_images), np.array(out_labels)


In [None]:
# --------------------------
# Model Training Functions (with regularization)
# --------------------------
def train_svm(X_train, y_train, sample_weights=None, C=0.01, class_weight=None):
    """Fit and return a linear-kernel SVC with probability estimates enabled.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        sample_weights: optional per-sample weights forwarded to ``fit``.
        C: regularisation parameter passed to ``SVC``.
        class_weight: forwarded to ``SVC`` unchanged.

    Returns:
        The fitted ``SVC`` instance.
    """
    classifier = SVC(
        kernel='linear',
        C=C,
        class_weight=class_weight,
        probability=True,
        random_state=42,
    )
    classifier.fit(X_train, y_train, sample_weight=sample_weights)
    return classifier

In [7]:
# --------------------------
# Enhanced Pseudo-labeling
# --------------------------
def get_high_confidence_samples(model, X_full, mask, confidence_threshold=0.9, margin_threshold=0.2):
    """Select unlabeled samples the model predicts with high confidence and a clear margin.

    Args:
        model: fitted classifier exposing ``predict_proba`` (and, normally,
            ``classes_``).
        X_full: 2-D feature matrix of all candidate samples.
        mask: boolean array over the rows of ``X_full``; ``True`` marks rows
            that are already labeled and must be skipped.
        confidence_threshold: minimum top-class probability.
        margin_threshold: minimum gap between the two highest probabilities.

    Returns:
        Tuple ``(indices, X_selected, pseudo_labels, confidence)``:
        row indices into ``X_full``, the selected rows, the predicted class
        labels and the top-class probability for each selected row. All four
        are empty when nothing is unlabeled.
    """
    unlabeled_indices = np.where(~mask)[0]
    if len(unlabeled_indices) == 0:
        # Typed/shaped empties so callers can stack them without special cases.
        return (
            np.array([], dtype=int),
            X_full[:0],
            np.array([], dtype=int),
            np.array([], dtype=float),
        )

    X_unlabeled = X_full[unlabeled_indices]
    probs = model.predict_proba(X_unlabeled)

    # Confidence = highest class probability; margin = gap to the runner-up.
    sorted_probs = np.sort(probs, axis=1)
    confidence = sorted_probs[:, -1]
    margin = sorted_probs[:, -1] - sorted_probs[:, -2]

    combined_mask = (confidence >= confidence_threshold) & (margin >= margin_threshold)

    # argmax yields a column position of predict_proba; translate it through
    # model.classes_ so the pseudo-label is the real class value even when the
    # classes are not exactly 0..k-1.
    top_columns = np.argmax(probs[combined_mask], axis=1)
    classes = getattr(model, "classes_", None)
    pseudo_labels = np.asarray(classes)[top_columns] if classes is not None else top_columns

    return (
        unlabeled_indices[combined_mask],  # Original indices
        X_unlabeled[combined_mask],        # Data
        pseudo_labels,                     # Pseudo-labels
        confidence[combined_mask]          # Confidence scores
    )

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import numpy as np
import copy

def iterative_training_pipeline(confidence_threshold=0.9,
                               margin_threshold=0.2,
                               patience=3,
                               max_iterations=30,
                               train_dir=None,
                               test_dir=None):
    """Run iterative pseudo-labeling with a linear SVM on Reduced MNIST.

    Starting from a small balanced labeled subset (plus augmentations), the
    loop repeatedly trains an SVM, scores it on the test set, and moves
    high-confidence predictions from the unlabeled pool into the training set,
    weighted by their confidence.

    Args:
        confidence_threshold: minimum top-class probability for a pseudo-label.
        margin_threshold: minimum gap between the two highest probabilities.
        patience: number of consecutive non-improving iterations before stopping.
        max_iterations: hard cap on the number of training rounds.
        train_dir: training image folder; defaults to the kagglehub cache path.
        test_dir: test image folder; defaults to the kagglehub cache path.

    Returns:
        Tuple ``(best_model, X_labeled, y_labeled, X_unlabeled_flat, y_unlabeled,
        pipeline_human_time, full_manual_time, stopped_early, test_accuracies)``.
        ``X_labeled``/``y_labeled`` are the final (flattened) training set
        including pseudo-labels; the two time values are in seconds.
    """
    # ----------------------------
    # Data Loading and Preparation
    # ----------------------------
    if train_dir is None:
        train_dir = "/root/.cache/kagglehub/datasets/mohamedgamal07/reduced-mnist/versions/1/Reduced MNIST Data/Reduced Trainging data"
    if test_dir is None:
        test_dir = "/root/.cache/kagglehub/datasets/mohamedgamal07/reduced-mnist/versions/1/Reduced MNIST Data/Reduced Testing data"
    X_train_full, y_train_full, X_test, y_test = load_data(train_dir, test_dir)

    # Flatten test set for evaluation
    X_test_flat = X_test.reshape(len(X_test), -1)

    # ----------------------------
    # Initial Labeled Subset & Unlabeled Pool
    # ----------------------------
    # Create an initial labeled subset (default: 40 examples per class)
    X_subset, y_subset, subset_indices = create_subset(X_train_full, y_train_full)

    # Remove the initial subset from training data to form the unlabeled pool
    X_unlabeled = np.delete(X_train_full, subset_indices, axis=0)
    y_unlabeled = np.delete(y_train_full, subset_indices, axis=0)

    # Flatten initial subset and unlabeled pool for training
    X_subset_flat = X_subset.reshape(len(X_subset), -1)
    X_unlabeled_flat = X_unlabeled.reshape(len(X_unlabeled), -1)

    # Data augmentation on the initial labeled subset
    X_augmented, y_augmented = augment_data(X_subset, y_subset)
    X_augmented_flat = X_augmented.reshape(len(X_augmented), -1)

    # ----------------------------
    # Initialize Pseudo-labeled Storage
    # ----------------------------
    num_features = X_subset_flat.shape[1]
    X_pseudo = np.zeros((0, num_features))
    y_pseudo = np.zeros(0, dtype=int)
    weights = np.zeros(0)

    # NOTE(review): early stopping and best-model selection use the *test* set,
    # so the reported test accuracy is optimistically biased; a held-out
    # validation split would be the unbiased choice.
    best_test_acc = 0.0
    best_model = None
    patience_counter = 0
    iteration = 0

    # Timing:
    # Pipeline human time is constant (only initial labeling is manually done);
    # 10 seconds per manually labeled image is assumed.
    pipeline_human_time = len(y_subset) * 10  # seconds
    # Full manual time: time to label all images manually (in training set)
    total_images = X_train_full.shape[0]
    full_manual_time = total_images * 10  # seconds

    # List to store test accuracies for each iteration
    test_accuracies = []

    # ----------------------------
    # Iterative Training Loop
    # ----------------------------
    while iteration < max_iterations:
        iteration += 1

        # Combine current labeled data: initial subset, augmented data, and pseudo-labeled data.
        # NOTE(review): augment_data already includes the originals, so the
        # initial subset appears twice here (i.e. it is effectively up-weighted).
        X_labeled_flat = np.vstack([X_subset_flat, X_augmented_flat, X_pseudo])
        y_labeled = np.concatenate([y_subset, y_augmented, y_pseudo])

        # Human-labeled/augmented samples get weight 1; pseudo-labels are
        # weighted by the confidence they were accepted with.
        sample_weights = np.concatenate([
            np.ones(len(X_subset_flat) + len(X_augmented_flat)),
            weights
        ])

        model = train_svm(X_labeled_flat, y_labeled,
                          sample_weights=sample_weights,
                          C=0.01,
                          class_weight='balanced')

        # Evaluate on the test set for early stopping
        test_preds = model.predict(X_test_flat)
        test_acc = accuracy_score(y_test, test_preds)
        test_accuracies.append(test_acc)

        print(f"\nIteration {iteration}:")
        print(f"Test Accuracy: {test_acc:.4f} | Best Test Acc: {best_test_acc:.4f} | Patience: {patience_counter}/{patience}")

        # Early stopping check based on test set accuracy
        if test_acc > best_test_acc:
            best_test_acc = test_acc
            best_model = copy.deepcopy(model)  # Preserve the fitted model
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Stopping early - no improvement for {patience} iterations")
            break

        # --------------
        # High-Confidence Pseudo-labeling from Unlabeled Pool
        # --------------
        # The pool only ever holds unlabeled rows, so the mask is all False.
        dummy_mask = np.zeros(len(X_unlabeled_flat), dtype=bool)
        new_indices, X_new, y_new, conf_new = get_high_confidence_samples(
            model, X_unlabeled_flat, dummy_mask,
            confidence_threshold=confidence_threshold,
            margin_threshold=margin_threshold
        )

        if len(new_indices) > 0:
            # Update pseudo-labeled storage
            X_pseudo = np.vstack([X_pseudo, X_new])
            y_pseudo = np.concatenate([y_pseudo, y_new])
            weights = np.concatenate([weights, conf_new])

            # Remove high-confidence samples from the unlabeled pool
            X_unlabeled_flat = np.delete(X_unlabeled_flat, new_indices, axis=0)
            y_unlabeled = np.delete(y_unlabeled, new_indices, axis=0)

            print(f"New pseudo-labeled samples: {len(new_indices)}")
        else:
            print("No new high-confidence samples found.")
            break

    print(f"\nTraining completed after {iteration} iterations")
    print(f"Final Test Accuracy: {best_test_acc:.4f}")

    # Construct the final labeled set (flattened). Features and labels are both
    # rebuilt here so they stay aligned even when the loop ended by hitting
    # max_iterations right after new pseudo-labels were added (or never ran).
    X_labeled = np.vstack([X_subset_flat, X_augmented_flat, X_pseudo])
    y_labeled = np.concatenate([y_subset, y_augmented, y_pseudo])

    stopped_early = (patience_counter >= patience)
    return best_model, X_labeled, y_labeled, X_unlabeled_flat, y_unlabeled, pipeline_human_time, full_manual_time, stopped_early, test_accuracies


In [9]:
# Run the full pseudo-labeling pipeline with its default thresholds and keep
# every returned artefact for the reporting cell below.
(
    best_model,
    X_labeled,
    y_labeled,
    X_unlabeled,
    y_unlabeled,
    pipeline_time,
    full_manual_time,
    stopped_early,
    test_accuracies,
) = iterative_training_pipeline()


Iteration 1:
Test Accuracy: 0.9030 | Best Test Acc: 0.0000 | Patience: 0/3
New pseudo-labeled samples: 6161

Iteration 2:
Test Accuracy: 0.9080 | Best Test Acc: 0.9030 | Patience: 0/3
New pseudo-labeled samples: 790

Iteration 3:
Test Accuracy: 0.9160 | Best Test Acc: 0.9080 | Patience: 0/3
New pseudo-labeled samples: 294

Iteration 4:
Test Accuracy: 0.9205 | Best Test Acc: 0.9160 | Patience: 0/3
New pseudo-labeled samples: 133

Iteration 5:
Test Accuracy: 0.9225 | Best Test Acc: 0.9205 | Patience: 0/3
New pseudo-labeled samples: 67

Iteration 6:
Test Accuracy: 0.9250 | Best Test Acc: 0.9225 | Patience: 0/3
New pseudo-labeled samples: 44

Iteration 7:
Test Accuracy: 0.9245 | Best Test Acc: 0.9250 | Patience: 0/3
New pseudo-labeled samples: 35

Iteration 8:
Test Accuracy: 0.9240 | Best Test Acc: 0.9250 | Patience: 1/3
New pseudo-labeled samples: 19

Iteration 9:
Test Accuracy: 0.9235 | Best Test Acc: 0.9250 | Patience: 2/3
Stopping early - no improvement for 3 iterations

Training comp

In [11]:
# ---- Per-iteration table ----
header = f"{'Iteration':<10} {'Test Accuracy':<15} {'Pipeline Human Time (sec)':<30} {'Pipeline Human Time (hrs)':<30}"
print(header)
print("-" * len(header))
# Human effort does not grow with iterations (only the initial subset is
# labeled by hand), so the same value is shown on every row.
time_hr = pipeline_time / 3600.0
for i, acc in enumerate(test_accuracies, start=1):
    print(f"{i:<10} {acc:<15.4f} {pipeline_time:<30} {time_hr:<30.2f}")

# ---- Cost of labeling everything by hand, for comparison ----
full_manual_time_hours = full_manual_time / 3600.0
print("\nEstimated Full Manual Labeling Time:")
print(f"{full_manual_time} seconds ({full_manual_time_hours:.2f} hours)")

# -----------------------------------------
# Final Performance Evaluation
# -----------------------------------------
# Accuracy of the best model on the final labeled set; note that the reference
# labels here include pseudo-labels, not only ground truth.
X_labeled_flat = X_labeled.reshape(X_labeled.shape[0], -1)
labeled_preds = best_model.predict(X_labeled_flat)
labeled_acc = accuracy_score(y_labeled, labeled_preds)

# Accuracy on whatever is still left in the unlabeled pool (true labels known).
unlabeled_acc = None
if X_unlabeled.shape[0] > 0:
    X_unlabeled_flat = X_unlabeled.reshape(X_unlabeled.shape[0], -1)
    unlabeled_preds = best_model.predict(X_unlabeled_flat)
    unlabeled_acc = accuracy_score(y_unlabeled, unlabeled_preds)

print("\n---------- Final Performance Summary ----------")
print(f"Final Labeled Training Accuracy: {labeled_acc:.4f}")
if unlabeled_acc is None:
    print("No remaining unlabeled samples.")
else:
    print(f"Final Unlabeled (Auto-Labeled) Accuracy: {unlabeled_acc:.4f}")
print(f"Number of Remaining Unlabeled Samples: {X_unlabeled.shape[0]}")
print(f"Early Stopping Triggered: {stopped_early}")
print("------------------------------------------------")


Iteration  Test Accuracy   Pipeline Human Time (sec)      Pipeline Human Time (hrs)     
----------------------------------------------------------------------------------------
1          0.9030          4000                           1.11                          
2          0.9080          4000                           1.11                          
3          0.9160          4000                           1.11                          
4          0.9205          4000                           1.11                          
5          0.9225          4000                           1.11                          
6          0.9250          4000                           1.11                          
7          0.9245          4000                           1.11                          
8          0.9240          4000                           1.11                          
9          0.9235          4000                           1.11                          

Estimated Full Manua