## Imports

In [36]:
! pip install torch




[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [37]:
! pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

Looking in indexes: https://download.pytorch.org/whl/cu118



[notice] A new release of pip is available: 24.0 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [38]:
import os
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, datasets
from PIL import Image

In [39]:
import torch.nn as nn
import torch.nn.functional as F

In [40]:
import torch.optim as optim

## Data Loading

In [41]:
# defining necesarry paths
data_dir = "CUB_200_2011\\CUB_200_2011\\images"
train_test_split_file = "CUB_200_2011\\CUB_200_2011\\train_test_split.txt"

In [42]:
class BirdDataset(Dataset):
    def __init__(self, root_dir, split_file, transform=None, train=True):
        self.root_dir = os.path.join(root_dir, "images")  # Point to images folder
        self.transform = transform
        self.image_paths = []
        self.labels = []
        
        # Read images.txt to map image_ids to filenames
        image_id_to_filename = {}
        with open(os.path.join(root_dir, "images.txt"), 'r') as f:
            for line in f:
                img_id, filename = line.strip().split()
                image_id_to_filename[int(img_id)] = filename
        
        # Read train_test_split.txt
        with open(split_file, 'r') as f:
            for line in f:
                img_id, is_train = line.strip().split()
                if int(is_train) == (1 if train else 0):
                    filename = image_id_to_filename[int(img_id)]
                    self.image_paths.append(os.path.join(self.root_dir, filename))
                    # Get class_id from image_class_labels.txt
                    class_id = int(filename.split('.')[0].split('_')[0]) - 1  # Convert to 0-199
                    self.labels.append(class_id)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        image = Image.open(img_path).convert('RGB')
        label = self.labels[idx]
        
        if self.transform:
            image = self.transform(image)
            
        return image, label

In [43]:
# Resizing the images for faster training
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Larger input size
    transforms.RandomHorizontalFlip(),  # Data augmentation
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [44]:
# making train and test dataset as per the txt file 1 -> training // 0 -> testing
# Create datasets
# train_dataset = BirdDataset(data_dir, train_test_split_file, transform=transform, train=True)
# test_dataset = BirdDataset(data_dir, train_test_split_file, transform=transform, train=False)
# Define paths (adjust to your actual structure)
root_dir = 'CUB_200_2011\CUB_200_2011'  # Main folder containing images.txt, etc.
split_file = os.path.join(root_dir, "train_test_split.txt")

# Create datasets
train_dataset = BirdDataset(root_dir, split_file, transform=transform, train=True)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

  root_dir = 'CUB_200_2011\CUB_200_2011'  # Main folder containing images.txt, etc.


In [45]:
test_dataset = BirdDataset(root_dir, split_file, transform=transform, train=False)

In [46]:
# Data Loading
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

## Baseline Model

Baseline Model is a simple VGG inspired model
A VGG Model is is a convolutional neural network (CNN) architecture known for its simplicity and effectiveness in image recognition tasks.

VGG Architecture in our case:
We have 5 convolutional layers
1 pooling layer
2 fully connected layers


In [50]:
class BaselineModel(nn.Module):
     def __init__(self, num_classes=200):
        super(BaselineModel, self).__init__()
        self.features = nn.Sequential(
            # Block 1
            nn.Conv2d(3, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            # Block 2
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            # Block 3
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
            
            # Block 4
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(2, 2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(256 * 14 * 14, 512),  # 224x224 → 14x14 after 4 maxpool layers
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(512, num_classes)
        )

     def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

## Training the Model

In [51]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
model = BaselineModel(num_classes=200).to(device)
# Use AdamW optimizer (better than vanilla Adam)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)  # LR decay

# Train for 20 epochs
for epoch in range(20):
    model.train()
    running_loss = 0.0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels) 
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    scheduler.step()
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}")

Epoch 1, Loss: 6.3363
Epoch 2, Loss: 5.2998
Epoch 3, Loss: 5.2996
Epoch 4, Loss: 5.3008
Epoch 5, Loss: 5.3001
Epoch 6, Loss: 5.2981
Epoch 7, Loss: 5.2983
Epoch 8, Loss: 5.2986
Epoch 9, Loss: 5.2982
Epoch 10, Loss: 5.2988
Epoch 11, Loss: 5.2976
Epoch 12, Loss: 5.2979


## Model Evaluation


In [None]:
def evaluate(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total

In [None]:
# Evaluate on training set
train_accuracy = evaluate(model, train_loader, device)
print(f"Train Accuracy: {train_accuracy:.2f}%")

# Evaluate on test set
test_accuracy = evaluate(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.2f}%")

Train Accuracy: 4.89%
Test Accuracy: 3.56%
