# AlexNet

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

import torchvision.transforms as T
from torchvision.datasets import ImageFolder
import matplotlib.pyplot as plt

In [2]:
train_transform = T.Compose([T.Resize((256, 256)), 
                             T.RandomCrop(size=(224, 224)),
                             T.ToTensor()])

test_transform = T.Compose([T.Resize((224, 224)), 
                                T.ToTensor()])

In [3]:
train_dataset = ImageFolder(root="../datasets/dogs_vs_cats_prepared/train/", transform=train_transform)
val_dataset = ImageFolder(root="../datasets/dogs_vs_cats_prepared/val/", transform=test_transform)
test_dataset = ImageFolder(root="../datasets/dogs_vs_cats_prepared/test/", transform=test_transform)

In [4]:
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE=128

In [5]:
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4, drop_last=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, drop_last=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, drop_last=False)

In [6]:
class Model(nn.Module):

    def __init__(self):
        super().__init__()
        self.feature_extractor = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=2),
            nn.LocalResponseNorm(k=2, size=5, alpha=1e-4, beta=0.75),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.ReLU(),
            nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2),
            nn.LocalResponseNorm(k=2, size=5, alpha=1e-4, beta=0.75),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.ReLU(),
        )
        
        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Dropout(p=0.5),
            nn.Linear(256*6*6, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Linear(4096, 1)
        )
        
    def forward(self, features):
        return self.classifier(self.feature_extractor(features))

In [7]:
def track_performance(dataloader, model, criterion):
    # switch to evaluation mode
    model.eval()
    num_samples = 0
    num_correct = 0
    loss_sum = 0
    
    # no need to calculate gradients
    with torch.inference_mode():
        for batch_idx, (features, labels) in enumerate(dataloader):
            features = features.to(DEVICE)
            labels = labels.to(DEVICE).view(-1, 1).float()
            logits = model(features)
            probs = torch.sigmoid(logits)
                        
            predictions = (probs > 0.5).float()
            num_correct += (predictions == labels).sum().item()
            
            loss = criterion(logits, labels)
            loss_sum += loss.cpu().item()
            num_samples += len(features)
    
    # we return the average loss and the accuracy
    return loss_sum/num_samples, num_correct/num_samples

In [8]:
def train(num_epochs, train_dataloader, val_dataloader, model, criterion, optimizer):
    history = {"train_loss": [], "val_loss": [], "train_acc": [], "val_acc": []}
    
    model.to(DEVICE)
    
    for epoch in range(num_epochs):
        for batch_idx, (features, labels) in enumerate(train_dataloader):
            model.train()
            features = features.to(DEVICE)
            labels = labels.to(DEVICE).view(-1, 1).float()
            
            # Empty the gradients
            optimizer.zero_grad()
            
            # Forward Pass
            logits = model(features)
            
            # Calculate Loss
            loss = criterion(logits, labels)
            
            # Backward Pass
            loss.backward()
            
            # Gradient Descent
            optimizer.step()
            
        train_loss, train_acc = track_performance(train_dataloader, model, criterion)
        val_loss, val_acc = track_performance(val_dataloader, model, criterion)
        
        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)
        history["train_acc"].append(train_acc)
        history["val_acc"].append(val_acc)

        print(f'Epoch: {epoch+1:>2}/{num_epochs} | Train Loss: {train_loss:.5f} | Val Loss: {val_loss:.5f} | Train Acc: {train_acc:.3f} | Val Acc: {val_acc:.3f}')
    return history            
            

In [9]:
model = Model()
optimizer = optim.Adam(params=model.parameters(), lr=0.0001)
criterion = nn.BCEWithLogitsLoss()

In [10]:
history = train(10, train_dataloader, val_dataloader, model, criterion, optimizer)

Epoch:  1/10 | Train Loss: 0.00513 | Val Loss: 0.00527 | Train Acc: 0.601 | Val Acc: 0.591
Epoch:  2/10 | Train Loss: 0.00492 | Val Loss: 0.00504 | Train Acc: 0.650 | Val Acc: 0.648
Epoch:  3/10 | Train Loss: 0.00449 | Val Loss: 0.00460 | Train Acc: 0.700 | Val Acc: 0.697
Epoch:  4/10 | Train Loss: 0.00399 | Val Loss: 0.00415 | Train Acc: 0.743 | Val Acc: 0.740
Epoch:  5/10 | Train Loss: 0.00315 | Val Loss: 0.00337 | Train Acc: 0.816 | Val Acc: 0.801
Epoch:  6/10 | Train Loss: 0.00300 | Val Loss: 0.00335 | Train Acc: 0.823 | Val Acc: 0.802
Epoch:  7/10 | Train Loss: 0.00243 | Val Loss: 0.00279 | Train Acc: 0.864 | Val Acc: 0.844
Epoch:  8/10 | Train Loss: 0.00209 | Val Loss: 0.00251 | Train Acc: 0.891 | Val Acc: 0.863
Epoch:  9/10 | Train Loss: 0.00202 | Val Loss: 0.00240 | Train Acc: 0.895 | Val Acc: 0.871
Epoch: 10/10 | Train Loss: 0.00198 | Val Loss: 0.00257 | Train Acc: 0.893 | Val Acc: 0.864


In [13]:
from torchvision.models import alexnet, AlexNet_Weights

In [21]:
model = alexnet(weights=AlexNet_Weights.IMAGENET1K_V1, progress=False)

In [22]:
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [23]:
for param in model.parameters():
    param.requires_grad = False

In [25]:
model.classifier[1].requires_grad = True
model.classifier[4].requires_grad = True

In [26]:
model.classifier[6] = nn.Linear(in_features=4096, out_features=1)

In [27]:
print(model)

AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
 

In [28]:
optimizer = optim.Adam(params=model.parameters(), lr=0.0001)
criterion = nn.BCEWithLogitsLoss()

In [29]:
history = train(10, train_dataloader, val_dataloader, model, criterion, optimizer)

Epoch:  1/10 | Train Loss: 0.00259 | Val Loss: 0.00269 | Train Acc: 0.884 | Val Acc: 0.874
Epoch:  2/10 | Train Loss: 0.00211 | Val Loss: 0.00227 | Train Acc: 0.900 | Val Acc: 0.894
Epoch:  3/10 | Train Loss: 0.00191 | Val Loss: 0.00211 | Train Acc: 0.906 | Val Acc: 0.897
Epoch:  4/10 | Train Loss: 0.00180 | Val Loss: 0.00203 | Train Acc: 0.910 | Val Acc: 0.896
Epoch:  5/10 | Train Loss: 0.00171 | Val Loss: 0.00196 | Train Acc: 0.915 | Val Acc: 0.903
Epoch:  6/10 | Train Loss: 0.00167 | Val Loss: 0.00193 | Train Acc: 0.915 | Val Acc: 0.903
Epoch:  7/10 | Train Loss: 0.00161 | Val Loss: 0.00189 | Train Acc: 0.918 | Val Acc: 0.905
Epoch:  8/10 | Train Loss: 0.00158 | Val Loss: 0.00188 | Train Acc: 0.919 | Val Acc: 0.905
Epoch:  9/10 | Train Loss: 0.00155 | Val Loss: 0.00182 | Train Acc: 0.923 | Val Acc: 0.910
Epoch: 10/10 | Train Loss: 0.00152 | Val Loss: 0.00182 | Train Acc: 0.922 | Val Acc: 0.910
