# The classic cats vs. dogs problem

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Subset, Dataset

In [2]:
# Check if GPU is available for faster training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cpu


https://www.cs.toronto.edu/~kriz/cifar.html

In [3]:
# Data Preparation

# We use the CIFAR-10 dataset, but we will filter it to only include cats and dogs.
# In CIFAR-10, the class indices are:
#   3: cat
#   5: dog

# Define transforms: convert images to tensors and normalize them.
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize each channel
])

# Download CIFAR-10 training and test sets
train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset  = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

# Function to filter the dataset: only keep indices with label 3 (cat) or 5 (dog)
def filter_cat_dog(dataset):
    indices = [i for i, (_, label) in enumerate(dataset) if label in [3, 5]]
    return Subset(dataset, indices)

# Filter training and test datasets
train_subset = filter_cat_dog(train_dataset)
test_subset  = filter_cat_dog(test_dataset)

In [4]:
# Create a custom dataset to re-map the labels:
#   Original label 3 (cat) -> 0
#   Original label 5 (dog) -> 1

class CatDogDataset(Dataset):
    def __init__(self, subset):
        self.subset = subset

    def __getitem__(self, index):
        img, label = self.subset[index]
        # Remap labels: cat -> 0, dog -> 1
        label = 0 if label == 3 else 1
        return img, label

    def __len__(self):
        return len(self.subset)

In [5]:
# Wrap the subsets into our custom dataset
train_data = CatDogDataset(train_subset)
test_data  = CatDogDataset(test_subset)

In [6]:
print(train_data)

<__main__.CatDogDataset object at 0x158a4cd10>


In [7]:
# Create DataLoaders for training and testing
batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader  = DataLoader(test_data, batch_size=batch_size, shuffle=False)

A Convolutional Neural Network (CNN) is a specialized type of neural network particularly effective for processing data that has a grid-like structure, such as images. These layers use filters (kernels) to scan the image and extract features like edges, textures, and shapes. They slide over the input, performing convolution operations that capture local patterns.

They are very efficient at recognizing patterns and hierarchies in images. Early layers capture simple features (like edges), and deeper layers capture more complex patterns (like shapes or even objects).


In [8]:
# Define the CNN Architecture

# This CNN is designed for 32x32 images (the CIFAR-10 image size)

class CatDogCNN(nn.Module):
    def __init__(self):
        super(CatDogCNN, self).__init__()
        # Convolutional layers
        self.conv_layers = nn.Sequential(
            # First convolution block
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1),  # 32x32x32
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),                                # 32x16x16

            # Second convolution block
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1),   # 64x16x16
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),                                # 64x8x8

            # Third convolution block
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),  # 128x8x8
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)                                 # 128x4x4
        )
        # Fully connected layers
        self.fc_layers = nn.Sequential(
            nn.Linear(128 * 4 * 4, 256),  # Flattened size: 128 channels * 4 * 4 pixels
            nn.ReLU(),
            nn.Linear(256, 2)             # Output layer: 2 classes (cat and dog)
        )

    def forward(self, x):
        # Pass input through the convolutional layers
        x = self.conv_layers(x)
        # Flatten the output for the fully connected layers
        x = x.view(x.size(0), -1)
        # Pass through the fully connected layers
        x = self.fc_layers(x)
        return x

In [9]:
# Instantiate the model and move it to the device (GPU or CPU)
model = CatDogCNN().to(device)
print(model)

CatDogCNN(
  (conv_layers): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (fc_layers): Sequential(
    (0): Linear(in_features=2048, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=2, bias=True)
  )
)


In [10]:
# Training Setup

# We use CrossEntropyLoss for multi-class classification (even though we have 2 classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [11]:
# Training Function

def train(model, train_loader, criterion, optimizer, device, num_epochs=10):
    model.train()  # Set model to training mode
    for epoch in range(num_epochs):
        running_loss = 0.0
        for images, labels in train_loader:
            # Move data to the device
            images, labels = images.to(device), labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        avg_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

In [12]:
# Evaluation Function

def evaluate(model, test_loader, device):
    model.eval()  # Set model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            # Move data to device
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            # Get the predicted class with the highest score
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = 100 * correct / total
    print(f"Test Accuracy: {accuracy:.2f}%")
    return accuracy

In [13]:
# Train and Evaluate the Model

num_epochs = 10  # For demonstration, we train for 10 epochs. In practice, you may train longer.
train(model, train_loader, criterion, optimizer, device, num_epochs=num_epochs)
evaluate(model, test_loader, device)

Epoch [1/10], Loss: 0.6406
Epoch [2/10], Loss: 0.5556
Epoch [3/10], Loss: 0.5055
Epoch [4/10], Loss: 0.4475
Epoch [5/10], Loss: 0.4041
Epoch [6/10], Loss: 0.3495
Epoch [7/10], Loss: 0.2861
Epoch [8/10], Loss: 0.2346
Epoch [9/10], Loss: 0.1788
Epoch [10/10], Loss: 0.1334
Test Accuracy: 76.70%


76.7