In [18]:
import os
import numpy as np
from PIL import Image
from scipy.ndimage import rotate, shift, zoom
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.base import clone
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA


In [19]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mohamedgamal07/reduced-mnist")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/mohamedgamal07/reduced-mnist/versions/1


In [20]:
# --------------------------
# Data Loading Functions
# --------------------------
def load_data(train_dir, test_dir):
    """Load MNIST data from directories"""
    # Load training data
    images, labels = [], []
    for label in sorted(os.listdir(train_dir)):
        label_dir = os.path.join(train_dir, label)
        if os.path.isdir(label_dir):
            for img_name in os.listdir(label_dir):
                img_path = os.path.join(label_dir, img_name)
                with Image.open(img_path) as img:
                    img_array = np.array(img.convert('L'), dtype=np.float32) / 255.0
                    images.append(img_array)
                    labels.append(int(label))

    # Load test data
    test_images, test_labels = [], []
    for label in sorted(os.listdir(test_dir)):
        label_dir = os.path.join(test_dir, label)
        if os.path.isdir(label_dir):
            for img_name in os.listdir(label_dir):
                img_path = os.path.join(label_dir, img_name)
                with Image.open(img_path) as img:
                    img_array = np.array(img.convert('L'), dtype=np.float32) / 255.0
                    test_images.append(img_array)
                    test_labels.append(int(label))

    return (
        np.array(images), np.array(labels),
        np.array(test_images), np.array(test_labels)
)

In [21]:
train_dir = "/root/.cache/kagglehub/datasets/mohamedgamal07/reduced-mnist/versions/1/Reduced MNIST Data/Reduced Trainging data"
test_dir= "/root/.cache/kagglehub/datasets/mohamedgamal07/reduced-mnist/versions/1/Reduced MNIST Data/Reduced Testing data"

# Load data
X_train, y_train, X_test, y_test = load_data(train_dir, test_dir)

In [22]:
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)

print("Training data shape:", X_train_flat.shape)
print("Test data shape:", X_test_flat.shape)



Training data shape: (10000, 784)
Test data shape: (2000, 784)


In [23]:
kmeans = KMeans(n_clusters=100, random_state=42)
cluster_labels = kmeans.fit_predict(X_train_flat)

In [24]:
import numpy as np
from collections import Counter
initial_labels = np.zeros(10000)
majority_labels = np.zeros(100)
for i in range(100):
    idx = np.where(cluster_labels == i)[0]
    sample_idx = np.random.choice(idx, 5, replace=False)
    sample_labels = y_train[sample_idx]  # Simulate manual labeling
    majority_label = Counter(sample_labels).most_common(1)[0][0]
    initial_labels[idx] = majority_label
    majority_labels[i] = majority_label

print(len(majority_labels))

100


In [25]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
svm = SVC(kernel='rbf', probability=True, random_state=42)  # Add probability=True
svm.fit(X_train_flat, initial_labels)
test_pred = svm.predict(X_test_flat)
accuracy_iter1 = accuracy_score(y_test, test_pred)

In [26]:
print(accuracy_iter1)

0.936


In [27]:
pred_labels = svm.predict(X_train_flat)
accuracy_training1 = accuracy_score(y_train,pred_labels)
print(accuracy_training1)

0.8967


In [30]:
# --------------------------
# Automated Iteration Loop with Patience
# --------------------------
max_iterations = 10
patience = 2  # Number of iterations to wait without improvement
accuracy_history = []
training_accuracy_history = []
patience_counter = 0

# Initialize with first iteration results
current_svm = svm
current_labels = pred_labels
accuracy_history.append(accuracy_iter1)
training_accuracy_history.append(accuracy_training1)

for iteration in range(1, max_iterations+1):
    # Train new SVM with current pseudo-labels
    new_svm = SVC(kernel='rbf', probability=True, random_state=42)
    new_svm.fit(X_train_flat, current_labels)

    # Predict and calculate accuracies
    test_pred = new_svm.predict(X_test_flat)
    train_pred = new_svm.predict(X_train_flat)

    # Store metrics
    iter_test_acc = accuracy_score(y_test, test_pred)
    iter_train_acc = accuracy_score(y_train, train_pred)

    # Update tracking
    accuracy_history.append(iter_test_acc)
    training_accuracy_history.append(iter_train_acc)

    # Check for improvement
    improvement = iter_test_acc - accuracy_history[-2]
    if abs(improvement) < 0.0001:
        patience_counter += 1
        print(f"No significant improvement ({improvement:.4f}), patience counter: {patience_counter}/{patience}")
    else:
        patience_counter = 0  # Reset counter if improvement occurs

    # Update for next iteration
    current_labels = train_pred

    # Print progress
    print(f"Iteration {iteration}:")
    print(f" - Training Accuracy: {iter_train_acc:.4f}")
    print(f" - Test Accuracy: {iter_test_acc:.4f}")

    # Early stopping check
    if patience_counter >= patience:
        print(f"Early stopping triggered after {patience} iterations without improvement")
        break

# --------------------------
# Final Results
# --------------------------
print("\nFinal Results:")
for i, (train_acc, test_acc) in enumerate(zip(training_accuracy_history, accuracy_history)):
    print(f"Iter {i}: Train={train_acc:.4f}, Test={test_acc:.4f}")

No significant improvement (0.0000), patience counter: 1/2
Iteration 1:
 - Training Accuracy: 0.9005
 - Test Accuracy: 0.9360
Iteration 2:
 - Training Accuracy: 0.9028
 - Test Accuracy: 0.9390
Iteration 3:
 - Training Accuracy: 0.9036
 - Test Accuracy: 0.9425
Iteration 4:
 - Training Accuracy: 0.9043
 - Test Accuracy: 0.9415
Iteration 5:
 - Training Accuracy: 0.9047
 - Test Accuracy: 0.9410
Iteration 6:
 - Training Accuracy: 0.9048
 - Test Accuracy: 0.9405
No significant improvement (0.0000), patience counter: 1/2
Iteration 7:
 - Training Accuracy: 0.9049
 - Test Accuracy: 0.9405
Iteration 8:
 - Training Accuracy: 0.9050
 - Test Accuracy: 0.9400
No significant improvement (0.0000), patience counter: 1/2
Iteration 9:
 - Training Accuracy: 0.9051
 - Test Accuracy: 0.9400
No significant improvement (0.0000), patience counter: 2/2
Iteration 10:
 - Training Accuracy: 0.9051
 - Test Accuracy: 0.9400
Early stopping triggered after 2 iterations without improvement

Final Results:
Iter 0: Train

In [2]:
# Define the constants for the calculations
samples_per_cluster = 5      # Number of samples manually checked per cluster
num_clusters = 100           # Total number of clusters
seconds_per_sample = 10      # Time taken to check one sample in seconds

# Calculate the manual time for Pipeline 1
time_pipeline1_seconds = samples_per_cluster * num_clusters * seconds_per_sample
time_pipeline1_hours = time_pipeline1_seconds / 3600

# Calculate the manual time for full manual labeling
total_images = 10000         # Total number of images
time_full_seconds = total_images * seconds_per_sample
time_full_hours = time_full_seconds / 3600

# Print the results
print("Pipeline 1 Manual Pipeline Time:")
print(f"{time_pipeline1_seconds} seconds ({time_pipeline1_hours:.2f} hours)")
print("\nFull Manual Labeling Time:")
print(f"{time_full_seconds} seconds ({time_full_hours:.2f} hours)")


Pipeline 1 Manual Pipeline Time:
5000 seconds (1.39 hours)

Full Manual Labeling Time:
100000 seconds (27.78 hours)
