In [1]:
import torch
from torchvision import datasets, transforms
import torch.nn as nn
import torch.nn.functional as F

# Preprocessing: ToTensor converts every image to a tensor with values in [0, 1].
transform = transforms.Compose([transforms.ToTensor()])

# Fetch (when not already under ./data) and open both FashionMNIST splits:
# the training split first, then the held-out test split.
train_dataset, test_dataset = (
    datasets.FashionMNIST(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

# Mini-batch loaders of 64 samples; only the training split is reshuffled.
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=64)
test_loader = torch.utils.data.DataLoader(test_dataset, shuffle=False, batch_size=64)

# Sanity check: draw a single training batch and show the shape of its first image.
images, labels = next(iter(train_loader))
print(images[0].size())

torch.Size([1, 28, 28])


In [2]:
# Define the CNN model
class CNN(nn.Module):
    """Two-block convolutional classifier for 1x28x28 Fashion MNIST images.

    Layer stack (shapes shown for a 28x28 input):
        conv 3x3, 1 -> 32, padding 1, ReLU, max-pool 2x2   -> 32 x 14 x 14
        conv 3x3, 32 -> 64, padding 1, ReLU, max-pool 2x2  -> 64 x 7 x 7
        flatten, linear 3136 -> 128, ReLU, dropout, linear 128 -> num_classes

    Args:
        num_classes: Number of output logits (10 classes in Fashion MNIST).
        dropout_p: Drop probability applied after the first fully connected layer.
    """

    def __init__(self, num_classes: int = 10, dropout_p: float = 0.25):
        super().__init__()
        # First convolutional block
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.pool1 = nn.MaxPool2d(2, 2)

        # Second convolutional block
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool2 = nn.MaxPool2d(2, 2)

        # Fully connected head; 64 * 7 * 7 is the feature count left after two
        # 2x2 poolings of a 28x28 image.
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, num_classes)

        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Compute raw (unnormalised) class logits.

        Args:
            x: Image batch of shape (batch, 1, 28, 28). A single unbatched
                image of shape (1, 28, 28) is also accepted and treated as a
                batch of one.

        Returns:
            Tensor of shape (batch, num_classes).

        Raises:
            RuntimeError: If the spatial size of ``x`` does not yield the
                64 * 7 * 7 features expected by ``fc1``.
        """
        # Promote an unbatched (C, H, W) image to a batch of one so the
        # per-sample flatten below keeps working for it.
        if x.dim() == 3:
            x = x.unsqueeze(0)

        # First conv block
        x = self.pool1(F.relu(self.conv1(x)))

        # Second conv block
        x = self.pool2(F.relu(self.conv2(x)))

        # Flatten everything except the batch dimension. The previous
        # view(-1, 64 * 7 * 7) silently changed the batch size when the input
        # was not 28x28 (one 56x56 image became 4 rows); flattening per sample
        # makes fc1 raise a shape error instead.
        x = x.flatten(start_dim=1)

        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

In [3]:
import matplotlib.pyplot as plt
import time
import wandb
import random
import math

# Start a Weights & Biases run for this experiment and register the
# hyperparameters with it.
wandb.init(
    config={
        "epochs": 10,
        "batch_size": 64,
        "lr": 0.001,
        "model_type": "CNN",
        "dataset": "FashionMNIST",
        "optimizer": "Adam",
    },
    project="fashion-mnist-cnn",
)

# Short alias through which the training cell reads the hyperparameters back
# (config.lr, config.epochs, ...).
config = wandb.config

wandb: ERROR Failed to detect the name of this notebook. You can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: gsayantan1999 to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


In [None]:

# Train the CNN and score it on the test split after every epoch.
#
# Uses names from earlier cells: CNN, train_loader, test_loader, config.
# Leaves behind for later cells: model, train_losses, val_losses, train_accs,
# val_accs and all_preds / all_labels / all_images (filled in the last epoch).


def _train_one_epoch(model, loader, criterion, optimizer, device, epoch, n_steps):
    """Run one optimisation pass over ``loader``.

    Every 20th batch the batch loss and a fractional epoch counter are sent to
    wandb. ``n_steps`` is the number of batches per epoch used for that counter.

    Returns:
        (mean loss per sample, accuracy) over the samples seen in this pass.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for step, (images, labels) in enumerate(loader):
        images, labels = images.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size to
        # get a correct per-sample average even when the last batch is smaller.
        batch_loss = loss.item()
        loss_sum += batch_loss * images.size(0)
        n_correct += (outputs.argmax(dim=1) == labels).sum().item()
        n_seen += labels.size(0)

        # Log only every 20th step to avoid slowing training down.
        if step % 20 == 0:
            wandb.log({
                "train/batch_loss": batch_loss,
                "train/epoch": epoch + (step + 1) / n_steps
            })

    return loss_sum / n_seen, n_correct / n_seen


def _evaluate(model, loader, criterion, device, keep_predictions=False):
    """Score ``model`` on ``loader`` with gradient tracking disabled.

    Returns:
        (mean loss per sample, accuracy, preds, labels, images). The three
        lists stay empty unless ``keep_predictions`` is true; then ``preds``
        and ``labels`` cover every sample of ``loader`` (so a confusion matrix
        built from them reflects the whole split) while ``images`` holds only
        the first batch, which is enough for a gallery of examples.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    preds, targets, sample_images = [], [], []

    with torch.no_grad():
        for batch_idx, (images, labels) in enumerate(loader):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            predicted = outputs.argmax(dim=1)

            loss_sum += criterion(outputs, labels).item() * images.size(0)
            n_correct += (predicted == labels).sum().item()
            n_seen += labels.size(0)

            if keep_predictions:
                preds.extend(predicted.cpu().numpy())
                targets.extend(labels.cpu().numpy())
                if batch_idx == 0:
                    sample_images.extend(images.cpu().numpy())

    return loss_sum / n_seen, n_correct / n_seen, preds, targets, sample_images


# Initialize the model and move it to the GPU if one is available
model = CNN()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"Using device: {device}")

# Have wandb track the model (log="all", every 100 batches)
wandb.watch(model, log="all", log_freq=100)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

# Number of epochs
num_epochs = config.epochs

# Per-epoch history, consumed by the plotting / summary cell
train_losses = []
val_losses = []
train_accs = []
val_accs = []

# Predictions from the final epoch's validation pass; defined up front so the
# names exist even if no epoch runs.
all_preds = []
all_labels = []
all_images = []

# Batches per epoch, taken from the loader itself so the fractional epoch
# logged to wandb stays correct even if the loader's batch size differs from
# config.batch_size.
n_steps_per_epoch = len(train_loader)

# Training loop
for epoch in range(num_epochs):
    start_time = time.time()

    epoch_train_loss, epoch_train_acc = _train_one_epoch(
        model, train_loader, criterion, optimizer, device, epoch, n_steps_per_epoch
    )
    train_losses.append(epoch_train_loss)
    train_accs.append(epoch_train_acc)

    # Predictions are only kept on the last epoch, for visualisation.
    epoch_val_loss, epoch_val_acc, all_preds, all_labels, all_images = _evaluate(
        model, test_loader, criterion, device,
        keep_predictions=(epoch == num_epochs - 1),
    )
    val_losses.append(epoch_val_loss)
    val_accs.append(epoch_val_acc)

    # Measure once so the logged and the printed duration agree.
    epoch_time = time.time() - start_time

    # Log to wandb
    wandb.log({
        "train/epoch_loss": epoch_train_loss,
        "train/accuracy": epoch_train_acc,
        "val/epoch_loss": epoch_val_loss,
        "val/accuracy": epoch_val_acc,
        "epoch": epoch + 1,
        "time_per_epoch": epoch_time
    })

    # Print statistics
    print(f"Epoch {epoch+1}/{num_epochs} - {epoch_time:.1f}s - "
          f"Train loss: {epoch_train_loss:.4f}, Train acc: {epoch_train_acc:.4f} - "
          f"Val loss: {epoch_val_loss:.4f}, Val acc: {epoch_val_acc:.4f}")

    # NOTE: no per-epoch checkpoint upload here. An earlier version called
    # wandb.save() on a checkpoint every 5 epochs, which aborted training on
    # Windows with "WinError 1314: A required privilege is not held by the
    # client" while linking the file into the wandb run directory.

Using device: cpu
Epoch 1/10 - 61.5s - Train loss: 0.4992, Train acc: 0.8173 - Val loss: 0.3472, Val acc: 0.8737
Epoch 2/10 - 64.4s - Train loss: 0.3191, Train acc: 0.8827 - Val loss: 0.2981, Val acc: 0.8932
Epoch 3/10 - 49.6s - Train loss: 0.2741, Train acc: 0.8995 - Val loss: 0.2617, Val acc: 0.9052
Epoch 4/10 - 51.2s - Train loss: 0.2422, Train acc: 0.9115 - Val loss: 0.2567, Val acc: 0.9055
Epoch 5/10 - 53.6s - Train loss: 0.2194, Train acc: 0.9190 - Val loss: 0.2357, Val acc: 0.9162


OSError: [WinError 1314] A required privilege is not held by the client: 'c:\\Users\\sayantghosh\\Desktop\\ML-DL-Code-Scratch\\Deep_Learning_Fundamentals-Notebooks\\fashion_mnist_cnn_epoch_5.pt' -> 'c:\\Users\\sayantghosh\\Desktop\\ML-DL-Code-Scratch\\Deep_Learning_Fundamentals-Notebooks\\wandb\\run-20250510_021512-i1kqe4iv\\files\\fashion_mnist_cnn_epoch_5.pt'

In [None]:
import shutil


def _plot_curves(ax, train_values, val_values, metric):
    """Draw the per-epoch training and validation curves of ``metric`` on ``ax``."""
    for split, values in (("Training", train_values), ("Validation", val_values)):
        # Epochs are numbered from 1, matching the "Epoch k/N" training printout.
        ax.plot(range(1, len(values) + 1), values, label=f"{split} {metric}", marker='o')
    ax.set_title(f"Training and Validation {metric}")
    ax.set_xlabel('Epochs')
    ax.set_ylabel(metric)
    ax.legend()
    ax.grid(True)


# Log the final model
final_model_path = "fashion_mnist_cnn_final.pt"
torch.save(model.state_dict(), final_model_path)
try:
    wandb.save(final_model_path)
except OSError as err:
    # On Windows accounts without the symlink privilege wandb.save() fails
    # with "WinError 1314" while linking the file into the run directory.
    # Copying the file there by hand puts it at the same destination; wandb
    # is expected to sync files found in the run directory when the run ends.
    print(f"wandb.save failed ({err}); copying the model into the wandb run directory instead.")
    shutil.copy(final_model_path, wandb.run.dir)

# Log confusion matrix and example predictions; skipped when the training
# cell did not reach its final epoch and therefore collected no predictions.
if len(all_preds) > 0:
    from sklearn.metrics import confusion_matrix
    import numpy as np

    # Map FashionMNIST class indices to names
    class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
                  'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

    # Raw count matrix, kept in the notebook namespace for inspection; the
    # wandb chart below is built from the labels/predictions directly.
    cm = confusion_matrix(all_labels, all_preds)
    wandb.log({"confusion_matrix": wandb.plot.confusion_matrix(
        probs=None,
        y_true=all_labels,
        preds=all_preds,
        class_names=class_names)
    })

    # Log up to 25 example predictions, each captioned with true vs. predicted class
    example_images = [wandb.Image(img.reshape(28, 28),
                                caption=f"True: {class_names[true]}, Pred: {class_names[pred]}")
                      for img, true, pred in zip(all_images[:25], all_labels[:25], all_preds[:25])]
    wandb.log({"examples": example_images})

# Plot training and validation metrics side by side: loss left, accuracy right
fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 5))
_plot_curves(loss_ax, train_losses, val_losses, "Loss")
_plot_curves(acc_ax, train_accs, val_accs, "Accuracy")
fig.tight_layout()

# Log the figure to wandb
wandb.log({"training_curves": wandb.Image(fig)})

# Final results and summary metrics; guarded so an empty history (training
# never completed an epoch) does not raise IndexError before the run is closed.
if train_losses and val_losses:
    print(f"Final training accuracy: {train_accs[-1]:.4f}")
    print(f"Final validation accuracy: {val_accs[-1]:.4f}")

    wandb.summary['final_train_loss'] = train_losses[-1]
    wandb.summary['final_train_accuracy'] = train_accs[-1]
    wandb.summary['final_val_loss'] = val_losses[-1]
    wandb.summary['final_val_accuracy'] = val_accs[-1]
else:
    print("No completed epochs; skipping final summary metrics.")

# Close wandb run
wandb.finish()