# CAP4453 Robot Vision: Two‑Layer Neural Network

This notebook provides a skeleton implementation of a two‑layer fully connected neural network using PyTorch. The network has one hidden layer with a ReLU non‑linearity followed by a linear output layer. You will load the CIFAR‑10 or MNIST dataset and compare the performance of this network to the linear classifier.


In [1]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import numpy as np
import matplotlib.pyplot as plt

# Choose device (GPU if available, else CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Seed torch's global RNG so that a Restart & Run All repeats the same
# train/val split and starts from the same random state. Without this the
# validation set changes on every run and hyperparameter results are not
# comparable between sessions.
seed = 0
torch.manual_seed(seed)

# Select dataset: 'cifar10' or 'mnist'
dataset_name = 'cifar10'

# Both datasets use the same preprocessing: convert each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

if dataset_name.lower() == 'cifar10':
    dataset_cls = datasets.CIFAR10
    input_dim = 32*32*3
elif dataset_name.lower() == 'mnist':
    dataset_cls = datasets.MNIST
    input_dim = 28*28
else:
    raise ValueError('Unknown dataset')
num_classes = 10

# Keep the full training set under its own name so that `train_dataset`
# always refers to the 90% subset produced by the split below.
full_train_dataset = dataset_cls(root='./data', train=True, download=True, transform=transform)
test_dataset = dataset_cls(root='./data', train=False, download=True, transform=transform)

# Split train into train/val (90% / 10%) with a dedicated, seeded generator
# so the split does not depend on how much randomness was consumed earlier.
train_size = int(0.9 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    full_train_dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(seed),
)

batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

print(f"Train samples: {len(train_dataset)}, Validation samples: {len(val_dataset)}, Test samples: {len(test_dataset)}")


Using device: cpu
Files already downloaded and verified


  entry = pickle.load(f, encoding="latin1")


Files already downloaded and verified
Train samples: 45000, Validation samples: 5000, Test samples: 10000


In [None]:
# Define a two‑layer fully connected network with ReLU activation
import torch.nn as nn

class TwoLayerNet(nn.Module):
    """Two-layer fully connected classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of features per sample once the input is flattened.
        hidden_dim: Width of the hidden layer.
        num_classes: Number of output scores produced per sample.

    The three sizes are passed straight to ``nn.Linear`` and therefore must be
    integers (not lists or floats).
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return the raw class scores for a batch.

        Args:
            x: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened and must multiply to ``input_dim``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` with unnormalised scores.
        """
        # reshape yields the same result as view for contiguous input but
        # also accepts non-contiguous tensors (e.g. transposed or
        # channels-last batches), where view would raise a RuntimeError.
        x = x.reshape(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Hyperparameter combinations to test
hidden_dims = [50, 100, 256, 512]
learning_rates = [1e-4, 1e-3, 1e-2]
weight_decays = [0, 1e-5, 1e-4, 1e-3]
num_epochs = 15


def _train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over `loader`.

    Returns:
        (mean loss per sample, accuracy in percent) measured on the batches
        as they were seen during this pass.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its size so that a smaller final batch
        # does not skew the per-sample average.
        running_loss += loss.item() * images.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    return running_loss / total, 100. * correct / total


def _evaluate_accuracy(model, loader, device):
    """Return the accuracy (in percent) of `model` on `loader`, without gradients."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = outputs.max(1)
            total += labels.size(0)
            correct += predicted.eq(labels).sum().item()
    return 100. * correct / total


# Store results for all experiments
all_results = []
best_val_acc = 0
best_config = {}

print("=" * 80)
print("HYPERPARAMETER SEARCH FOR TWO-LAYER NETWORK")
print("=" * 80)

# Test all combinations
for hidden_dim in hidden_dims:
    for lr in learning_rates:
        for wd in weight_decays:
            print(f"\n{'='*80}")
            print(f"Testing: hidden_dim={hidden_dim}, lr={lr}, weight_decay={wd}")
            print(f"{'='*80}")

            # Fresh model, loss function and optimizer for every configuration
            model = TwoLayerNet(input_dim, hidden_dim, num_classes).to(device)
            criterion = nn.CrossEntropyLoss()
            optimizer = torch.optim.SGD(model.parameters(), lr=lr, weight_decay=wd)

            # Per-epoch metric history for this configuration
            train_losses = []
            train_accs = []
            val_accs = []

            for epoch in range(num_epochs):
                train_loss, train_acc = _train_one_epoch(model, train_loader, criterion, optimizer, device)
                val_acc = _evaluate_accuracy(model, val_loader, device)

                train_losses.append(train_loss)
                train_accs.append(train_acc)
                val_accs.append(val_acc)

                print(f"Epoch {epoch+1}/{num_epochs}, Train loss: {train_loss:.4f}, Train acc: {train_acc:.2f}%, Val acc: {val_acc:.2f}%")

            # Configurations are ranked by the accuracy after the last epoch
            final_val_acc = val_accs[-1]

            # Store results
            result = {
                'hidden_dim': hidden_dim,
                'learning_rate': lr,
                'weight_decay': wd,
                'final_val_acc': final_val_acc,
                'train_losses': train_losses,
                'train_accs': train_accs,
                'val_accs': val_accs
            }
            all_results.append(result)

            # Track best configuration (same fields as the stored result,
            # plus 'val_acc' which the reporting code below reads)
            if final_val_acc > best_val_acc:
                best_val_acc = final_val_acc
                best_config = dict(result, val_acc=final_val_acc)

            print(f"\nFinal validation accuracy: {final_val_acc:.2f}%")

# Fail with an explicit message instead of a KeyError on the empty dict when
# nothing was selected (e.g. an empty grid or no run above 0% accuracy).
if not best_config:
    raise RuntimeError(
        "Hyperparameter search selected no configuration: "
        "check that the grids are non-empty and inspect the logs above."
    )

print("\n" + "=" * 80)
print("BEST CONFIGURATION FOUND:")
print("=" * 80)
print(f"Hidden Dimension: {best_config['hidden_dim']}")
print(f"Learning Rate: {best_config['learning_rate']}")
print(f"Weight Decay: {best_config['weight_decay']}")
print(f"Best Validation Accuracy: {best_config['val_acc']:.2f}%")
print("=" * 80)

TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)
 * (tuple of ints size, *, torch.memory_format memory_format = None, Tensor out = None, torch.dtype dtype = None, torch.layout layout = None, torch.device device = None, bool pin_memory = False, bool requires_grad = False)


### Next steps

- Experiment with different hidden dimensions, learning rates and regularization strengths. Record the validation-set performance for each hyper‑parameter setting in the report (e.g., with a screenshot or a copy of the training/validation log).
- After choosing hyper‑parameters, train on the combined train+validation set and report the test accuracy.
- Plot training and validation accuracies over epochs.
- Compare the performance of the two‑layer network with the linear classifier.


In [None]:
# Display summary of all experiments
import pandas as pd

print("\n" + "=" * 80)
print("SUMMARY OF ALL EXPERIMENTS")
print("=" * 80)

# Naming the columns up front keeps the frame well-formed (and sortable)
# even when all_results is empty.
summary_columns = ['Hidden Dim', 'Learning Rate', 'Weight Decay', 'Final Val Acc']
df_results = pd.DataFrame([{
    'Hidden Dim': r['hidden_dim'],
    'Learning Rate': r['learning_rate'],
    'Weight Decay': r['weight_decay'],
    'Final Val Acc': r['final_val_acc']
} for r in all_results], columns=summary_columns)

# Sort by validation accuracy (descending) while the column is still numeric.
# Sorting the formatted strings would order them lexicographically, which
# ranks e.g. "9.80%" above "52.10%".
df_results = df_results.sort_values(by='Final Val Acc', ascending=False)

# Format for display only after the ranking is fixed.
df_results['Final Val Acc'] = df_results['Final Val Acc'].map('{:.2f}%'.format)
print(df_results.to_string(index=False))

In [None]:
# Plot training curves for best configuration: accuracy on the left, loss on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
acc_ax, loss_ax = axes

# Accuracy panel: one line per recorded accuracy history
accuracy_curves = [
    ('train_accs', 'b-', 'Train Accuracy', 'o'),
    ('val_accs', 'r-', 'Validation Accuracy', 's'),
]
for history_key, line_format, curve_label, curve_marker in accuracy_curves:
    history = best_config[history_key]
    # Epochs are numbered from 1 on the x-axis
    acc_ax.plot(range(1, len(history) + 1), history,
                line_format, label=curve_label, marker=curve_marker)
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy (%)')
acc_ax.set_title('Two-Layer Network: Training and Validation Accuracy')
acc_ax.legend()
acc_ax.grid(True, alpha=0.3)

# Loss panel: training loss only
loss_history = best_config['train_losses']
loss_ax.plot(range(1, len(loss_history) + 1), loss_history,
             'g-', label='Train Loss', marker='o')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Two-Layer Network: Training Loss')
loss_ax.legend()
loss_ax.grid(True, alpha=0.3)

plt.tight_layout()
# Save before show so the written file contains the rendered figure
plt.savefig('two_layer_net_training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

# Recap of the configuration whose curves are shown above
final_val_accuracy = best_config['val_accs'][-1]
print(f"\nBest Configuration:")
print(f"Hidden Dimension: {best_config['hidden_dim']}")
print(f"Learning Rate: {best_config['learning_rate']}")
print(f"Weight Decay: {best_config['weight_decay']}")
print(f"Final Validation Accuracy: {final_val_accuracy:.2f}%")

## Final Evaluation on Test Set

Now train with the best hyperparameters on the combined train+validation set and evaluate on the test set.

In [None]:
# Combine train and validation sets
from torch.utils.data import ConcatDataset

combined_dataset = ConcatDataset([train_dataset, val_dataset])
# Use the notebook-wide batch_size (instead of a second hard-coded 128) so the
# final run uses the same batch size the hyperparameters were selected with.
combined_loader = DataLoader(combined_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

print("=" * 80)
print("FINAL TRAINING ON TRAIN+VALIDATION SET")
print("=" * 80)
print(f"Using best hyperparameters:")
print(f"  Hidden Dimension: {best_config['hidden_dim']}")
print(f"  Learning Rate: {best_config['learning_rate']}")
print(f"  Weight Decay: {best_config['weight_decay']}")
print(f"\nTraining on {len(combined_dataset)} samples...")
print("=" * 80)

# Initialize final model with best hyperparameters
final_model = TwoLayerNet(input_dim, best_config['hidden_dim'], num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(final_model.parameters(),
                            lr=best_config['learning_rate'],
                            weight_decay=best_config['weight_decay'])

# Train on combined dataset
# NOTE(review): the search above ranks configurations after num_epochs
# epochs, while this run uses 20 — confirm the longer schedule is intended.
num_epochs_final = 20
for epoch in range(num_epochs_final):
    final_model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for images, labels in combined_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = final_model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its size to get a per-sample average
        running_loss += loss.item() * images.size(0)
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    train_loss = running_loss / total
    train_acc = 100. * correct / total
    print(f"Epoch {epoch+1}/{num_epochs_final}, Train loss: {train_loss:.4f}, Train acc: {train_acc:.2f}%")

# Evaluate on test set
print("\n" + "=" * 80)
print("FINAL TEST SET EVALUATION")
print("=" * 80)

final_model.eval()
test_correct = 0
test_total = 0
# No gradients are needed for evaluation
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = final_model(images)
        _, predicted = outputs.max(1)
        test_total += labels.size(0)
        test_correct += predicted.eq(labels).sum().item()

test_acc = 100. * test_correct / test_total
print(f"\n✓ FINAL TEST ACCURACY: {test_acc:.2f}%")
print("=" * 80)