In [1]:
import tensorflow as tf
from tensorflow.keras import datasets, layers, models
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter


# Load the CIFAR-10 train/test arrays through the Keras dataset helper.
# NOTE(review): the following cell rebinds X_train / y_train / X_test / y_test
# (rescaling and slicing them), so re-run this cell before re-running that one.
(X_train, y_train), (X_test, y_test) = datasets.cifar10.load_data()

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
[1m170498071/170498071[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [2]:
'''
  This cell's script is generated from DeepSeek

'''
# Scale pixel values by 1/255 and store them as float32.
# NOTE(review): X_train / X_test are rebound in place, so running this cell a
# second time without re-running the loading cell rescales the data again.
X_train, X_test = (images.astype('float32') / 255.0 for images in (X_train, X_test))

print(f"Original shapes:")
print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

# Requested number of samples per split
num_training, num_validation, num_test, num_dev = 49000, 1000, 1000, 500

# Report what is requested versus what is available
print(f"\nChecking data requirements:")
print(f"Need {num_training + num_validation} train+val samples, have {len(X_train)}")
print(f"Need {num_test} test samples, have {len(X_test)}")

# Too few training images: keep the validation size and shrink the training split
if num_training + num_validation > len(X_train):
    print(f"ERROR: Not enough training data!")
    num_training = len(X_train) - num_validation
    print(f"Adjusting num_training to {num_training}")

# Too few test images: fall back to every available test sample
if num_test > len(X_test):
    print(f"ERROR: Not enough test data!")
    num_test = len(X_test)
    print(f"Adjusting num_test to {num_test}")

# Validation split: the num_validation samples that follow the first num_training
X_val, y_val = (
    arr[num_training:num_training + num_validation] for arr in (X_train, y_train)
)

# Training split: the leading num_training samples (rebinds X_train / y_train)
X_train, y_train = X_train[:num_training], y_train[:num_training]

# Test split: the leading num_test samples of the test data
X_test, y_test = X_test[:num_test], y_test[:num_test]

# Development split: num_dev distinct training samples drawn with a fixed seed.
# It is a subset of the training split, not a disjoint hold-out set.
np.random.seed(42)
dev_indices = np.random.choice(num_training, num_dev, replace=False)
X_dev, y_dev = X_train[dev_indices], y_train[dev_indices]

# Shapes of every split, with the label arrays also shown flattened to 1-D
print('\n' + '='*50, 'FINAL DATA SPLITS:', '='*50, sep='\n')
print(
    f'X_train shape: {X_train.shape}',
    f'y_train shape: {y_train.shape} -> {y_train.flatten().shape} (flattened)',
    f'X_val shape:   {X_val.shape}',
    f'y_val shape:   {y_val.shape} -> {y_val.flatten().shape} (flattened)',
    f'X_test shape:  {X_test.shape}',
    f'y_test shape:  {y_test.shape} -> {y_test.flatten().shape} (flattened)',
    f'X_dev shape:   {X_dev.shape}',
    f'y_dev shape:   {y_dev.shape} -> {y_dev.flatten().shape} (flattened)',
    sep='\n',
)

# NOTE(review): the checks below only inspect the dev indices (their range and
# uniqueness); nothing here compares the splits against each other.
print('\n' + '='*50, 'VERIFICATION:', '='*50, sep='\n')
print(
    f"Dev indices range: [{min(dev_indices)}, {max(dev_indices)}]",
    f"Training samples: {num_training}",
    f"All dev indices in training range: {max(dev_indices) < num_training}",
    sep='\n',
)

# Number of distinct indices drawn for the dev split
print(
    f"\nUnique samples in each set:",
    f"Training: {len(np.unique(dev_indices))} unique dev indices out of {num_dev}",
    sep='\n',
)

# Split sizes; the grand total adds train + val + test (dev is drawn from train)
print(
    f"\nData Statistics:",
    f"Total training samples: {len(X_train)}",
    f"Total validation samples: {len(X_val)}",
    f"Total test samples: {len(X_test)}",
    f"Development samples: {len(X_dev)}",
    f"Grand total: {len(X_train) + len(X_val) + len(X_test)}",
    sep='\n',
)

# Per-class sample count and share of the training labels
print(f"\nClass distribution in training set:")
unique, counts = np.unique(y_train, return_counts=True)
for cls, count in zip(unique, counts):
    print(f"  Class {cls}: {count} samples ({count/len(y_train)*100:.1f}%)")

print('\n' + '='*50, 'FLATTENED VERSIONS (for linear classifiers):', '='*50, sep='\n')

# One row per sample: every axis after the first is collapsed into a single one
X_train_flat, X_val_flat, X_test_flat, X_dev_flat = (
    images.reshape(images.shape[0], -1)
    for images in (X_train, X_val, X_test, X_dev)
)

print(
    f"X_train_flat: {X_train_flat.shape}",
    f"X_val_flat:   {X_val_flat.shape}",
    f"X_test_flat:  {X_test_flat.shape}",
    f"X_dev_flat:   {X_dev_flat.shape}",
    sep='\n',
)

# Bundle every split in one dict; label arrays are stored flattened to 1-D
data_splits = {
    'X_train': X_train,
    'y_train': y_train.flatten(),
    'X_val': X_val,
    'y_val': y_val.flatten(),
    'X_test': X_test,
    'y_test': y_test.flatten(),
    'X_dev': X_dev,
    'y_dev': y_dev.flatten(),
    'X_train_flat': X_train_flat,
    'X_val_flat': X_val_flat,
    'X_test_flat': X_test_flat,
    'X_dev_flat': X_dev_flat,
    'dev_indices': dev_indices,
}

print('\n' + '='*50, 'READY FOR TRAINING!', '='*50, sep='\n')
print(
    "You can now use:",
    "- X_train, y_train for training",
    "- X_val, y_val for validation during training",
    "- X_test, y_test for final evaluation",
    "- X_dev, y_dev for quick development/debugging",
    sep='\n',
)

Original shapes:
X_train: (50000, 32, 32, 3), y_train: (50000, 1)
X_test: (10000, 32, 32, 3), y_test: (10000, 1)

Checking data requirements:
Need 50000 train+val samples, have 50000
Need 1000 test samples, have 10000

FINAL DATA SPLITS:
X_train shape: (49000, 32, 32, 3)
y_train shape: (49000, 1) -> (49000,) (flattened)
X_val shape:   (1000, 32, 32, 3)
y_val shape:   (1000, 1) -> (1000,) (flattened)
X_test shape:  (1000, 32, 32, 3)
y_test shape:  (1000, 1) -> (1000,) (flattened)
X_dev shape:   (500, 32, 32, 3)
y_dev shape:   (500, 1) -> (500,) (flattened)

VERIFICATION:
Dev indices range: [4, 48936]
Training samples: 49000
All dev indices in training range: True

Unique samples in each set:
Training: 500 unique dev indices out of 500

Data Statistics:
Total training samples: 49000
Total validation samples: 1000
Total test samples: 1000
Development samples: 500
Grand total: 51000

Class distribution in training set:
  Class 0: 4913 samples (10.0%)
  Class 1: 4881 samples (10.0%)
  Class

In [3]:
'''
  This cell's script is generated from DeepSeek

'''
# Pull the flattened feature matrices out of the splits dictionary
X_train_flat, X_val_flat, X_test_flat = (
    data_splits[key] for key in ('X_train_flat', 'X_val_flat', 'X_test_flat')
)

# Rebind the label names to the flattened label arrays stored in the dictionary
y_train, y_val, y_test = (
    data_splits[key] for key in ('y_train', 'y_val', 'y_test')
)

print(
    "Data shapes for training:",
    f"X_train_flat: {X_train_flat.shape}",
    f"y_train: {y_train.shape} (should be 1D)",
    f"X_val_flat: {X_val_flat.shape}",
    f"y_val: {y_val.shape} (should be 1D)",
    sep='\n',
)

# 2. STANDARDIZE THE DATA (CRITICAL STEP!)
# Softmax classifiers need standardized features for gradient descent to work well
def standardize_features(X_train, X_val, X_test=None):
    """Z-score every split using per-feature statistics of the training split.

    Args:
        X_train: Training features; the mean and standard deviation along
            axis 0 are computed from this array only.
        X_val: Validation features, transformed with the training statistics.
        X_test: Optional test features, transformed the same way.

    Returns:
        ``(X_train_std, X_val_std)``, or
        ``(X_train_std, X_val_std, X_test_std)`` when ``X_test`` is given.
    """
    center = np.mean(X_train, axis=0)
    scale = np.std(X_train, axis=0)

    # Features with zero spread are divided by 1 instead of 0
    constant_features = scale == 0
    scale[constant_features] = 1

    splits = [X_train, X_val]
    if X_test is not None:
        splits.append(X_test)

    # Every split is shifted and scaled with the *training* statistics
    return tuple((split - center) / scale for split in splits)

print("\nStandardizing features...")
X_train_std, X_val_std, X_test_std = standardize_features(
    X_train_flat, X_val_flat, X_test=X_test_flat
)

# Overall mean / std of each standardized split (taken over all entries)
print(
    f"After standardization:",
    f"X_train_std - Mean: {np.mean(X_train_std):.6f}, Std: {np.std(X_train_std):.6f}",
    f"X_val_std - Mean: {np.mean(X_val_std):.6f}, Std: {np.std(X_val_std):.6f}",
    f"X_test_std - Mean: {np.mean(X_test_std):.6f}, Std: {np.std(X_test_std):.6f}",
    sep='\n',
)

Data shapes for training:
X_train_flat: (49000, 3072)
y_train: (49000,) (should be 1D)
X_val_flat: (1000, 3072)
y_val: (1000,) (should be 1D)

Standardizing features...
After standardization:
X_train_std - Mean: -0.000000, Std: 1.000003
X_val_std - Mean: 0.013165, Std: 0.994525
X_test_std - Mean: 0.017657, Std: 0.994975


In [4]:
class SoftmaxClassifier:
    """Linear softmax classifier trained with mini-batch gradient descent.

    The model is a single weight matrix ``W`` of shape
    ``(num_features, num_classes)`` with no bias term; class scores are
    ``X.dot(W)``. The loss is the mean cross-entropy of the softmax
    probabilities plus ``0.5 * reg_strength * sum(W ** 2)``.

    Labels are used as column indices into the score matrix, so they must be
    integer class ids. They may be passed as a 1-D vector or as an ``(N, 1)``
    column; both are flattened internally.
    """

    def __init__(self, learning_rate=1e-2, reg_strength=1e-4, num_iters=1000, batch_size=200, verbose=True):
        """Store the hyperparameters; the weights are created by ``train``.

        Args:
            learning_rate: Step size of each gradient update.
            reg_strength: Coefficient of the L2 penalty on ``W``.
            num_iters: Number of mini-batch updates performed by ``train``.
            batch_size: Samples drawn (without replacement) per update.
            verbose: When True, ``train`` prints its configuration and progress.
        """
        self.learning_rate = learning_rate
        self.reg_strength = reg_strength
        self.num_iters = num_iters
        self.batch_size = batch_size
        self.verbose = verbose
        self.W = None
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

    @staticmethod
    def _as_label_vector(y):
        """Return the labels as a 1-D array.

        An ``(N, 1)`` label column would otherwise broadcast against the
        ``(N,)`` predictions / row indices and silently produce ``(N, N)``
        results.
        """
        return np.asarray(y).reshape(-1)

    def _softmax(self, scores):
        """Row-wise softmax of a 2-D score matrix."""
        # Shifting each row by its maximum keeps exp() from overflowing and
        # does not change the resulting probabilities.
        exp_scores = np.exp(scores - np.max(scores, axis=1, keepdims=True))
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def _loss_grad(self, X, y):
        """Compute the regularized loss and its gradient with respect to ``W``.

        Args:
            X: Feature matrix, one row per sample.
            y: Integer class id per row of ``X`` (1-D or ``(N, 1)``).

        Returns:
            ``(loss, dW)`` where ``dW`` has the same shape as ``W``.
        """
        y = self._as_label_vector(y)
        num_train = X.shape[0]
        rows = np.arange(num_train)

        # Forward pass
        scores = X.dot(self.W)
        probs = self._softmax(scores)

        # Cross-entropy of the true class; the 1e-8 guards against log(0)
        correct_logprobs = -np.log(probs[rows, y] + 1e-8)
        data_loss = np.sum(correct_logprobs) / num_train
        reg_loss = 0.5 * self.reg_strength * np.sum(self.W * self.W)
        loss = data_loss + reg_loss

        # Gradient of the mean cross-entropy w.r.t. the scores: probs - one_hot(y)
        dscores = probs.copy()
        dscores[rows, y] -= 1
        dscores /= num_train

        dW = X.T.dot(dscores) + self.reg_strength * self.W

        return loss, dW

    def train(self, X_train, y_train, X_val=None, y_val=None):
        """Fit ``W`` from scratch with ``num_iters`` mini-batch updates.

        Each call re-initializes the weights and clears the recorded
        histories. The loss of every update is appended to ``loss_history``.
        On the first 10 iterations and on every 100th one, the accuracy on the
        current mini-batch (measured after the update) is appended to
        ``train_acc_history`` and, when validation data is given, the
        validation accuracy to ``val_acc_history``.

        Args:
            X_train: 2-D training feature matrix.
            y_train: Integer class ids for ``X_train`` (1-D or ``(N, 1)``).
            X_val: Optional validation features.
            y_val: Optional validation labels; used only together with ``X_val``.

        Returns:
            ``self``, so calls can be chained.

        Raises:
            ValueError: If the training set is empty.
        """
        y_train = self._as_label_vector(y_train)
        num_train, dim = X_train.shape
        if num_train == 0:
            raise ValueError("Cannot train on an empty training set")

        # Size W so that the largest label id is a valid column even when
        # some smaller ids do not occur in the training labels.
        num_classes = max(len(np.unique(y_train)), int(np.max(y_train)) + 1)

        has_val = X_val is not None and y_val is not None
        if has_val:
            y_val = self._as_label_vector(y_val)

        # Initialize weights with small random values
        self.W = 0.001 * np.random.randn(dim, num_classes)

        # The weights start over, so histories of earlier runs no longer apply
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Sampling without replacement cannot draw more rows than exist
        batch_size = min(self.batch_size, num_train)

        if self.verbose:
            print(f"Training info:")
            print(f"  Samples: {num_train}, Features: {dim}, Classes: {num_classes}")
            print(f"  Learning rate: {self.learning_rate}")
            print(f"  Batch size: {batch_size}")
            print(f"  Regularization: {self.reg_strength}")

        loss = None  # stays None when num_iters is 0
        for it in range(self.num_iters):
            # Mini-batch
            batch_indices = np.random.choice(num_train, batch_size, replace=False)
            X_batch = X_train[batch_indices]
            y_batch = y_train[batch_indices]

            # Compute loss and gradient
            loss, dW = self._loss_grad(X_batch, y_batch)
            self.loss_history.append(loss)

            # Update weights
            self.W -= self.learning_rate * dW

            # Track accuracy on the first 10 iterations and every 100th one
            if it % 100 == 0 or it < 10:
                train_acc = self.score(X_batch, y_batch)
                self.train_acc_history.append(train_acc)

                if has_val:
                    val_acc = self.score(X_val, y_val)
                    self.val_acc_history.append(val_acc)

                # Print progress
                if self.verbose and it % 100 == 0:
                    msg = f"Iteration {it:4d}/{self.num_iters}: loss = {loss:.4f}, train_acc = {train_acc:.4f}"
                    if has_val:
                        msg += f", val_acc = {val_acc:.4f}"
                    print(msg)

        if self.verbose:
            if loss is None:
                print("Training completed. No iterations were run.")
            else:
                print(f"Training completed. Final loss: {loss:.4f}")
        return self

    def predict(self, X):
        """Return the index of the highest-scoring class for each row of ``X``."""
        scores = X.dot(self.W)
        return np.argmax(scores, axis=1)

    def predict_proba(self, X):
        """Return the softmax class probabilities for each row of ``X``."""
        scores = X.dot(self.W)
        return self._softmax(scores)

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        ``y`` may be 1-D or an ``(N, 1)`` column.
        """
        y = self._as_label_vector(y)
        pred = self.predict(X)
        return np.mean(pred == y)

In [5]:
# Section banner for the training run
print("\n" + "="*60, "Training Softmax Classifier", "="*60, sep="\n")

# Hyperparameters used for this run
smc = SoftmaxClassifier(
    learning_rate=1e-2,
    reg_strength=1e-4,
    num_iters=1000,
    batch_size=200,
    verbose=True,
)

# Fit on the standardized training split while tracking validation accuracy
smc.train(X_train_std, y_train, X_val=X_val_std, y_val=y_val)

print("\n" + "="*60, "Final Evaluation", "="*60, sep="\n")

# Accuracy of the trained model on the full train / validation / test splits
train_acc, val_acc, test_acc = (
    smc.score(features, labels)
    for features, labels in (
        (X_train_std, y_train),
        (X_val_std, y_val),
        (X_test_std, y_test),
    )
)

print(
    f"Training accuracy:   {train_acc:.4f} ({train_acc*100:.2f}%)",
    f"Validation accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)",
    f"Test accuracy:       {test_acc:.4f} ({test_acc*100:.2f}%)",
    sep="\n",
)


Training Softmax Classifier
Training info:
  Samples: 49000, Features: 3072, Classes: 10
  Learning rate: 0.01
  Batch size: 200
  Regularization: 0.0001
Iteration    0/1000: loss = 2.3068, train_acc = 0.3550, val_acc = 0.2190
Iteration  100/1000: loss = 1.7701, train_acc = 0.4650, val_acc = 0.3890
Iteration  200/1000: loss = 1.7570, train_acc = 0.4600, val_acc = 0.3960
Iteration  300/1000: loss = 1.7461, train_acc = 0.4450, val_acc = 0.3860
Iteration  400/1000: loss = 1.8297, train_acc = 0.4200, val_acc = 0.3860
Iteration  500/1000: loss = 1.5433, train_acc = 0.5300, val_acc = 0.3860
Iteration  600/1000: loss = 1.7174, train_acc = 0.5250, val_acc = 0.3930
Iteration  700/1000: loss = 1.8510, train_acc = 0.3850, val_acc = 0.4020
Iteration  800/1000: loss = 1.6829, train_acc = 0.4450, val_acc = 0.4040
Iteration  900/1000: loss = 1.7462, train_acc = 0.5000, val_acc = 0.4210
Training completed. Final loss: 1.6307

Final Evaluation
Training accuracy:   0.4217 (42.17%)
Validation accuracy: 