In [1]:
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import struct  # For reading binary data
import matplotlib.pyplot as plt
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import functional as F_transform

In [2]:
#Step 2: Define paths to your ubyte files
TRAIN_IMAGES_PATH = 'train-images.idx3-ubyte'
TRAIN_LABELS_PATH = 'train-labels.idx1-ubyte' 
TEST_IMAGES_PATH = 't10k-images.idx3-ubyte' 
TEST_LABELS_PATH = 't10k-labels.idx1-ubyte' 

In [3]:
#Step 3: Data Loading Function

def read_idx(filename):
    """Reads an IDX file and returns a NumPy array."""
    with open(filename, 'rb') as f:
        magic, size = struct.unpack(">II", f.read(8))
        if magic == 2051: # images
            rows, cols = struct.unpack(">II", f.read(8))
            data = np.fromfile(f, dtype=np.dtype(np.uint8).newbyteorder('>'))
            data = data.reshape((size, rows, cols))
        elif magic == 2049:  # labels
            data = np.fromfile(f, dtype=np.dtype(np.uint8).newbyteorder('>'))
        else:
            raise ValueError("Invalid magic number: {}".format(magic))
        return data

In [4]:
#Step 4: CNN Model Definition

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)  # 1 input channel (grayscale), 32 output channels
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # 2x2 max pooling
        self.fc1 = nn.Linear(64 * 7 * 7, 128)  # Fully connected layer
        self.fc2 = nn.Linear(128, 10)  # 10 output classes (digits 0-9)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 64 * 7 * 7)  # Flatten the feature map
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [5]:
#Step 5: Device Configuration

device = torch.device("cuda")

In [6]:
#Step 6: Dataset for CNN

class MNISTUByteDataset(Dataset):
    def __init__(self, images_path, labels_path, transform=None):
        self.images = read_idx(images_path)
        self.labels = read_idx(labels_path)
        self.transform = transform

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]
        image = np.expand_dims(image, axis=0) # Add channel dimension (1 for grayscale)
        image = image.astype(np.float32) / 255.0 # Normalize

        image = torch.from_numpy(image)  # Convert to Tensor

        #No need to normalize if it is already between 0 and 1
        # if self.transform:
        #     image = self.transform(image)  # Apply transformations (if any)

        return image, torch.tensor(label, dtype=torch.long)

In [7]:
#Step 7: Dataset for Faster R-CNN

class MNISTObjectDetectionDataset(Dataset):
    def __init__(self, images_path, labels_path):
        self.images = read_idx(images_path)
        self.labels = read_idx(labels_path)

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = self.images[idx]
        label = self.labels[idx]

        # Convert image to float32 and normalize
        image = image.astype(np.float32) / 255.0
        image = torch.from_numpy(image)
        image = image.unsqueeze(0)  # Add channel dimension (C, H, W)

        # Create bounding box (very simple: entire image as the bounding box)
        # In a real object detection scenario, you'd have to do a better job with this.
        # You'd ideally want a box that surrounds the digit.
        #bounding_box = [0, 0, image.shape[2], image.shape[1]]  # xmin, ymin, xmax, ymax
        #Better box:
        bounding_box = [5, 5, image.shape[2]-5, image.shape[1]-5] #Give a box close to the actual image

        boxes = torch.tensor([bounding_box], dtype=torch.float32)  # Wrap the bounding box in a tensor
        labels = torch.tensor([label], dtype=torch.int64)  # Also wrap the labels in a tensor

        # Create the target dictionary (required by Faster R-CNN)
        target = {
            "boxes": boxes,
            "labels": labels
        }

        return image, target

In [8]:
#Step 8: Create Dataloaders

# Create CNN Datasets
train_dataset_cnn = MNISTUByteDataset(TRAIN_IMAGES_PATH, TRAIN_LABELS_PATH)
test_dataset_cnn = MNISTUByteDataset(TEST_IMAGES_PATH, TEST_LABELS_PATH)

# Create CNN Data Loaders
batch_size_cnn = 64  # Adjust as needed
train_loader_cnn = DataLoader(train_dataset_cnn, batch_size=batch_size_cnn, shuffle=True)
test_loader_cnn = DataLoader(test_dataset_cnn, batch_size=batch_size_cnn, shuffle=False)

#Faster-RCNN:
# Create Faster R-CNN Datasets
train_dataset_frcnn = MNISTObjectDetectionDataset(TRAIN_IMAGES_PATH, TRAIN_LABELS_PATH)
test_dataset_frcnn = MNISTObjectDetectionDataset(TEST_IMAGES_PATH, TEST_LABELS_PATH)

# Create Faster R-CNN Data Loaders
batch_size_frcnn = 4  # Adjust as needed (lower batch size due to memory constraints)
train_loader_frcnn = DataLoader(train_dataset_frcnn, batch_size=batch_size_frcnn, shuffle=True)
test_loader_frcnn = DataLoader(test_dataset_frcnn, batch_size=batch_size_frcnn, shuffle=False)

In [9]:
#Step 9: Instantiate Models

# Instantiate CNN model
cnn_model = CNN()
cnn_model.to(device, dtype=torch.float16)

# Instantiate Faster R-CNN model
num_classes = 11  # 10 digits + background
frcnn_model = fasterrcnn_resnet50_fpn_v2(weights=True)
in_features = frcnn_model.roi_heads.box_predictor.cls_score.in_features
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
frcnn_model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)
frcnn_model.to(device, dtype=torch.float16)

print("CNN Model Architecture:")
print(cnn_model)

print("\nFaster R-CNN Model Architecture:")
print(frcnn_model)



CNN Model Architecture:
CNN(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=3136, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=10, bias=True)
)

Faster R-CNN Model Architecture:
FasterRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(800,), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=0.0)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
 

In [10]:
#Step 10: Training Function for CNN

def train_cnn(model, train_loader, criterion, optimizer, device, num_epochs=5):
    model.train()
    train_losses = []
    val_accs = []
    start_time = time.time()

    for epoch in range(num_epochs):
        running_loss = 0.0
        for i, data in enumerate(train_loader, 0):
            inputs, labels = data[0].to(device, dtype=torch.float16), data[1].to(device, dtype=torch.float16)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % 200 == 199:
                print(f'[{epoch + 1}, {i + 1:5d}] CNN loss: {running_loss / 200:.3f}')
                train_losses.append(running_loss / 200)
                running_loss = 0.0

        # Evaluate on the test set after each epoch
        model.eval()  # Set the model to evaluation mode
        correct = 0
        total = 0

        with torch.no_grad():  # Disable gradient calculation for evaluation
            for data in test_loader_cnn:
                images, labels = data[0].to(device, dtype=torch.float16), data[1].to(device, dtype=torch.float16)
                outputs = model(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

        accuracy = 100 * correct / total
        val_accs.append(accuracy)
        print(f'Epoch {epoch + 1}, CNN Test Accuracy: {accuracy:.2f}%')
        model.train()  # Set the model to training mode again

    training_time = time.time() - start_time
    print('Finished CNN Training')
    return train_losses, val_accs, training_time

In [11]:
#Step 11: Training Function for faster CNN (R-CNN)

def train_frcnn(model, train_loader, optimizer, device, num_epochs):
    model.train()
    losses = []
    
    for images, targets in train_loader:
        images = [img.to(device, dtype=torch.float16) for img in images]  # Ensure images is a list

        new_targets = []
        for i in range(len(images)):
            new_targets.append({
                "boxes": targets["boxes"][i].view(-1, 4).to(device, dtype=torch.float16),  # Ensure shape [N, 4]
                "labels": targets["labels"][i].view(-1).to(device, dtype=torch.float16)  # Ensure labels are 1D
            })

        optimizer.zero_grad()
        loss_dict = model(images, new_targets)  # Pass correctly formatted targets
        loss = sum(loss for loss in loss_dict.values())

        loss.backward()
        optimizer.step()

        losses.append(loss.item())

    return losses


In [12]:
#Step 12: Evaluation Function for CNN

def evaluate_cnn(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0
    all_predicted = []
    all_labels = []

    with torch.no_grad():
        for data in test_loader:
            images, labels = data[0].to(device, dtype=torch.float16), data[1].to(device, dtype=torch.float16)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            all_predicted.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = 100 * correct / total
    f1 = f1_score(all_labels, all_predicted, average='weighted')
    confusion_mat = confusion_matrix(all_labels, all_predicted)

    return accuracy, f1, confusion_mat

In [13]:
#Step 13: Evaluation Function for faster CNN (R-CNN)

def evaluate_frcnn(model, test_loader, device):
    model.eval()
    all_predicted = []
    all_labels = []

    with torch.no_grad():
        for images, targets in test_loader:
            images = images.to(device, dtype=torch.float16)
            targets = [{k: v.to(device, dtype=torch.float16) for k, v in t.items()} for t in targets]

            outputs = model(images)

            for i, output in enumerate(outputs):
                boxes = output['boxes']
                labels = output['labels']
                scores = output['scores']

                # Filter predictions based on a confidence threshold
                confidence_threshold = 0.5  # Adjust as needed
                filtered_indices = scores > confidence_threshold

                predicted_labels = labels[filtered_indices].cpu().numpy()
                true_labels = targets[i]['labels'].cpu().numpy()

                all_predicted.extend(predicted_labels)
                all_labels.extend(true_labels)

    # Calculate metrics
    accuracy = np.mean(np.array(all_predicted) == np.array(all_labels)) * 100 if all_labels else 0
    f1 = f1_score(all_labels, all_predicted, average='weighted', zero_division=0) if all_labels else 0
    confusion_mat = confusion_matrix(all_labels, all_predicted) if all_labels else np.array([[0]])  # Handle empty case

    return accuracy, f1, confusion_mat

In [14]:
#Step 14: Training & Evaluation loops

from sklearn.metrics import f1_score, confusion_matrix

# CNN Training
cnn_criterion = nn.CrossEntropyLoss()
cnn_optimizer = optim.Adam(cnn_model.parameters(), lr=0.001)
cnn_train_losses, cnn_val_accs, cnn_training_time = train_cnn(cnn_model, train_loader_cnn, cnn_criterion, cnn_optimizer, device, num_epochs=5)

# Faster R-CNN Training
frcnn_params = [p for p in frcnn_model.parameters() if p.requires_grad]
frcnn_optimizer = optim.Adam(frcnn_params, lr=0.0005) # Reduce learning rate

frcnn_train_losses, frcnn_training_time = train_frcnn(frcnn_model, train_loader_frcnn, frcnn_optimizer, device, num_epochs=1)

# CNN Evaluation
cnn_accuracy, cnn_f1_score, cnn_confusion_mat = evaluate_cnn(cnn_model, test_loader_cnn, device)
print("\nCNN Results:")
print(f"  Accuracy: {cnn_accuracy:.2f}%")
print(f"  F1-score: {cnn_f1_score:.3f}")
print(f"  Training Time: {cnn_training_time:.2f} seconds")

# Faster R-CNN Evaluation
frcnn_accuracy, frcnn_f1_score, frcnn_confusion_mat = evaluate_frcnn(frcnn_model, test_loader_frcnn, device)
print("\nFaster R-CNN Results:")
print(f"  Accuracy: {frcnn_accuracy:.2f}%")
print(f"  F1-score: {frcnn_f1_score:.3f}")
print(f"  Training Time: {frcnn_training_time:.2f} seconds")

[1,   200] CNN loss: 0.514
[1,   400] CNN loss: 0.148
[1,   600] CNN loss: 0.102
[1,   800] CNN loss: 0.087
Epoch 1, CNN Test Accuracy: 98.17%
[2,   200] CNN loss: 0.053
[2,   400] CNN loss: 0.054
[2,   600] CNN loss: 0.053
[2,   800] CNN loss: 0.044
Epoch 2, CNN Test Accuracy: 98.68%
[3,   200] CNN loss: 0.034
[3,   400] CNN loss: 0.036
[3,   600] CNN loss: 0.039
[3,   800] CNN loss: 0.036
Epoch 3, CNN Test Accuracy: 98.85%
[4,   200] CNN loss: 0.024
[4,   400] CNN loss: 0.022
[4,   600] CNN loss: 0.026
[4,   800] CNN loss: 0.031
Epoch 4, CNN Test Accuracy: 98.86%
[5,   200] CNN loss: 0.018
[5,   400] CNN loss: 0.019
[5,   600] CNN loss: 0.022
[5,   800] CNN loss: 0.020
Epoch 5, CNN Test Accuracy: 99.19%
Finished CNN Training


OutOfMemoryError: CUDA out of memory. Tried to allocate 158.00 MiB. GPU 0 has a total capacity of 1.82 GiB of which 146.00 MiB is free. Including non-PyTorch memory, this process has 1.67 GiB memory in use. Of the allocated memory 1.54 GiB is allocated by PyTorch, and 55.69 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)