In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from typing import Callable
import matplotlib.pyplot as plt
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm import tqdm


In [2]:
class SelfAttentionLayer(nn.Module):
    """Multi-head attention block for 4-D feature maps with a gated residual.

    Input and output both have shape ``[batch_size, in_planes, d2, d3]``.
    Query, key and value are produced by 1x1 convolutions with ``out_planes``
    channels, split into ``head`` groups of ``out_planes // head`` channels,
    and flattened over the two trailing dimensions.  The result is projected
    back to ``in_planes`` channels and added to the input, scaled by the
    learnable scalar ``gamma`` (initialised to 0, so the layer starts out as
    the identity mapping).

    Args:
        in_planes: Number of channels of the input (and of the output).
        out_planes: Total number of query/key/value channels across all heads.
            Must be divisible by ``head``.
        head: Number of attention heads.
        similarity_fun: Callable ``(q, k) -> scores`` receiving tensors of
            shape ``[batch_size * head, out_planes // head, d2 * d3]``.  The
            scores are soft-maxed over their last dimension and
            batch-multiplied with the value tensor, so they must have shape
            ``[batch_size * head, out_planes // head, out_planes // head]``.

    Raises:
        ValueError: If ``head`` is not positive or does not divide
            ``out_planes``.

    NOTE(review): with a ``bmm(Q, K^T)`` similarity the attention weights
    relate channels within a head to each other, not spatial positions —
    confirm this is the intended form of attention.
    """

    def __init__(self, in_planes, out_planes, head,
                 similarity_fun: Callable[[torch.Tensor, torch.Tensor], torch.Tensor]) -> None:
        super().__init__()
        # Fail early: an indivisible channel count would otherwise surface
        # only in forward() as an opaque reshape error.
        if head <= 0 or out_planes % head != 0:
            raise ValueError(
                f"out_planes ({out_planes}) must be divisible by a positive head count ({head})"
            )
        # NOTE: the attribute name 'qurey' (sic) is kept because it is part of
        # the module's state_dict keys.
        self.qurey = nn.Conv2d(in_planes, out_planes, kernel_size=1)
        self.key = nn.Conv2d(in_planes, out_planes, kernel_size=1)
        self.value = nn.Conv2d(in_planes, out_planes, kernel_size=1)
        self.similarity_fun = similarity_fun
        self.output = nn.Conv2d(out_planes, in_planes, kernel_size=1)
        # Residual gate; starting at zero makes the block an identity at init.
        self.gamma = nn.Parameter(torch.tensor([0.]))
        self.head = head
        self.out_dim = out_planes // self.head

    def forward(self, x):
        """Apply attention to ``x`` and return ``gamma * attention(x) + x``."""
        # The two trailing dimensions are only ever used through their product.
        batch_size, _, width, height = x.shape
        positions = width * height

        q, k, v = self.qurey(x), self.key(x), self.value(x)

        # Split the channels into heads and fold the heads into the batch axis.
        # reshape (rather than view) also accepts non-contiguous conv outputs,
        # e.g. when the input uses the channels_last memory format.
        q = q.reshape(batch_size * self.head, self.out_dim, positions)
        k = k.reshape(batch_size * self.head, self.out_dim, positions)
        v = v.reshape(batch_size * self.head, self.out_dim, positions)

        a = F.softmax(self.similarity_fun(q, k), dim=-1)

        # Merge the heads back into the channel axis before the output projection.
        attended = torch.bmm(a, v).reshape(batch_size, self.out_dim * self.head, width, height)
        o = self.output(attended)

        return self.gamma * o + x


In [3]:
def similarity_fun(Q: torch.Tensor, K: torch.Tensor) -> torch.Tensor:
    """Scaled dot-product similarity between two batches of matrices.

    Computes ``bmm(Q, K^T) / sqrt(K.shape[-1])``: the batched product
    contracts over the last dimension of ``Q`` and ``K``, and the result is
    scaled by the square root of that contracted dimension's size.

    Args:
        Q: Tensor of shape ``[batch, n, d]``.
        K: Tensor of shape ``[batch, m, d]``.

    Returns:
        Tensor of shape ``[batch, n, m]`` holding the scaled similarities.
    """
    # A plain Python float avoids building a throw-away integer CPU tensor
    # (and taking the sqrt of an integer tensor) on every call.
    scale = K.shape[-1] ** 0.5
    return torch.bmm(Q, K.transpose(1, 2)) / scale

In [4]:
class ResidualBlock(nn.Module):
    """Two 3x3 conv + batch-norm layers with a residual shortcut.

    Args:
        in_channels: Channel count of the input tensor.
        out_channels: Channel count produced by the block.
        stride: Stride of the first convolution (and of the projection
            shortcut, when one is needed).

    The shortcut is an empty ``nn.Sequential`` (identity) unless the stride or
    the channel count changes, in which case it is a strided 1x1 convolution
    followed by batch norm.
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels, out_channels, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Project the input only when its shape would not match the main path.
        projection = []
        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            projection = [
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_channels),
            ]
        self.shortcut = nn.Sequential(*projection)

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        main = self.conv1(x)
        main = F.relu(self.bn1(main))
        main = self.conv2(main)
        main = self.bn2(main)
        main += self.shortcut(x)
        return F.relu(main)

In [None]:
class ModifiedResNetWithAttention(nn.Module):
    """Small three-stage residual network with attention between the stages.

    Layout: 3x3 stem conv (3 -> 16) -> ResidualBlock(16 -> 32, stride 2) ->
    attention -> ResidualBlock(32 -> 64, stride 2) -> attention ->
    ResidualBlock(64 -> 128, stride 2) -> global average pool -> linear
    classifier.  Dropout is applied after the stem, after each attention
    layer and before the classifier.

    Args:
        dropout_rate: Drop probability shared by every dropout application.
        num_classes: Size of the classifier output.  Defaults to 200, the
            value that was previously hard-coded.

    NOTE: a later cell of this notebook defines a different class under this
    same name; once that cell has run, the name refers to the later class.
    """

    def __init__(self, dropout_rate=0.5, num_classes=200):
        super(ModifiedResNetWithAttention, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(16)
        self.dropout = nn.Dropout(dropout_rate)
        self.layer1 = ResidualBlock(16, 32, stride=2)
        self.att1 = SelfAttentionLayer(32, 64, 4, similarity_fun)
        self.layer2 = ResidualBlock(32, 64, stride=2)
        self.att2 = SelfAttentionLayer(64, 128, 4, similarity_fun)
        self.layer3 = ResidualBlock(64, 128, stride=2)
        self.gpa = nn.AdaptiveAvgPool2d((1, 1))
        # 128 is the channel count produced by layer3.
        self.finalFC = nn.Linear(128, num_classes)

    def forward(self, x):
        """Map a batch of 3-channel images to ``[batch, num_classes]`` logits."""
        x = F.relu(self.bn1(self.conv1(x)))
        x = self.dropout(x)
        x = self.layer1(x)
        x = self.att1(x)
        x = self.dropout(x)
        x = self.layer2(x)
        x = self.att2(x)
        x = self.dropout(x)
        x = self.layer3(x)
        x = self.gpa(x)
        # Collapse the pooled 1x1 spatial dimensions: [batch, 128, 1, 1] -> [batch, 128].
        x = x.view(x.size(0), -1)
        x = self.dropout(x)
        x = self.finalFC(x)
        return x


In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Bottleneck(nn.Module):
    """1x1 -> 3x3 -> 1x1 bottleneck residual block.

    The block outputs ``planes * expansion`` channels.  The 3x3 convolution
    carries the stride.  ``downsample``, when given, is applied to the input
    to produce the residual branch; otherwise the input itself is used.

    Args:
        in_planes: Channel count of the input.
        planes: Channel count of the two inner convolutions.
        stride: Stride of the 3x3 convolution.
        downsample: Optional module applied to the input before the addition.
    """

    # Ratio between the block's output channels and ``planes``.
    expansion = 4

    def __init__(self, in_planes, planes, stride=1, downsample=None):
        super().__init__()
        widened = planes * self.expansion

        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(
            planes, planes, kernel_size=3, stride=stride, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(planes)

        self.conv3 = nn.Conv2d(planes, widened, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(widened)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        """Return ``relu(main_path(x) + residual)``."""
        y = self.relu(self.bn1(self.conv1(x)))
        y = self.relu(self.bn2(self.conv2(y)))
        y = self.bn3(self.conv3(y))

        # Residual branch: the raw input unless a downsample module was supplied.
        residual = x
        if self.downsample is not None:
            residual = self.downsample(x)

        y += residual
        return self.relu(y)

class ModifiedResNetWithAttention(nn.Module):
    """ResNet-style backbone with self-attention after stages 1 and 3.

    Layout: 7x7 stride-2 stem conv -> batch norm -> ReLU -> 3x3 stride-2 max
    pool -> layer1 -> dropout -> attention -> layer2 -> dropout -> layer3 ->
    attention -> layer4 -> dropout -> global average pool -> linear classifier.

    Args:
        block: Residual block class.  It must expose a class attribute
            ``expansion`` and accept ``(in_planes, planes, stride, downsample)``.
        layers: Sequence of four integers, the number of blocks per stage.
        num_classes: Size of the classifier output.
        dropout_rate: Drop probability shared by every dropout application.
    """

    def __init__(self, block, layers, num_classes=200, dropout_rate=0.5):
        super(ModifiedResNetWithAttention, self).__init__()
        # Running channel count; _make_layer updates it after every stage.
        self.in_planes = 64
        self.conv1 = nn.Conv2d(3, self.in_planes, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = nn.BatchNorm2d(self.in_planes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.dropout = nn.Dropout(dropout_rate)

        # Creating layers.  The attention widths follow the channel count left
        # by the preceding stage (64 * expansion and 256 * expansion, i.e.
        # 256 and 1024 for expansion == 4) instead of being hard-coded, so
        # blocks with a different expansion factor also line up.
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.att1 = SelfAttentionLayer(self.in_planes, self.in_planes, 4, similarity_fun)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.att2 = SelfAttentionLayer(self.in_planes, self.in_planes, 4, similarity_fun)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

    def _make_layer(self, block, planes, blocks, stride=1):
        """Build one stage of ``blocks`` residual blocks.

        Only the first block receives ``stride`` and, when the stride or the
        channel count changes, a 1x1 conv + batch-norm ``downsample`` module
        for its residual branch.  ``self.in_planes`` is updated to the stage's
        output channel count (``planes * block.expansion``).
        """
        downsample = None
        if stride != 1 or self.in_planes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.in_planes, planes * block.expansion, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = [block(self.in_planes, planes, stride, downsample)]
        self.in_planes = planes * block.expansion
        # Remaining blocks keep the resolution and channel count.
        layers.extend(block(self.in_planes, planes) for _ in range(1, blocks))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Map a batch of 3-channel images to ``[batch, num_classes]`` logits."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.dropout(x)  # Apply dropout
        x = self.att1(x)  # Self-attention after the first stage
        x = self.layer2(x)
        x = self.dropout(x)  # Apply dropout
        x = self.layer3(x)
        x = self.att2(x)  # Self-attention after the third stage
        x = self.layer4(x)
        x = self.dropout(x)  # Apply dropout

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x
        

In [6]:
from torchvision import transforms, datasets

# Training pipeline: random 224x224 crop and horizontal flip, then tensor
# conversion and per-channel normalisation.
train_transform = transforms.Compose(
    [
        transforms.RandomResizedCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Validation pipeline: deterministic resize + centre crop, same normalisation.
valid_transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
)

# Expects Tiny ImageNet under 'tiny-imagenet-200/train' and
# 'tiny-imagenet-200/val', each organised as one sub-directory per class
# (the layout ImageFolder reads).
train_dataset = datasets.ImageFolder(root='tiny-imagenet-200/train', transform=train_transform)
valid_dataset = datasets.ImageFolder(root='tiny-imagenet-200/val', transform=valid_transform)

# Batched loaders; only the training split is shuffled.
# NOTE: the CIFAR-10 cell further down rebinds train_loader / valid_loader.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, num_workers=4)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=64, shuffle=False, num_workers=4)


In [9]:
import torch
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Image preprocessing: convert to a tensor, then normalise every channel with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 train and test splits, downloaded into the current directory when
# they are not already present.
trainset = datasets.CIFAR10(root='./', train=True, download=True, transform=transform)
testset = datasets.CIFAR10(root='./', train=False, download=True, transform=transform)

# Batched loaders; only the training split is shuffled.  The test split is
# what the training loop uses for validation.
train_loader = DataLoader(dataset=trainset, batch_size=128, shuffle=True, num_workers=2)
valid_loader = DataLoader(dataset=testset, batch_size=128, shuffle=False, num_workers=2)



Files already downloaded and verified
Files already downloaded and verified


In [10]:
# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Blocks per stage, following the ResNet-50 configuration.
layers = [3, 4, 6, 3]

# num_classes=10 matches the CIFAR-10 loaders built in the cell above.
model = ModifiedResNetWithAttention(Bottleneck, layers, num_classes=10, dropout_rate=0.5)
model = model.to(device)

# Loss, optimizer and a step schedule that multiplies the learning rate by 0.1
# every 7 scheduler steps.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

cuda


In [14]:
num_epochs = 10

# NOTE: `model`, `optimizer` and `scheduler` are created in an earlier cell.
# Re-running only this cell continues from their current state, including
# whatever learning rate the scheduler has already decayed to; re-run the
# setup cell first to start a fresh training run.

# Training loop
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}', leave=False)
    for inputs, labels in progress_bar:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Read the scalar once: every .item() call forces a host sync.
        batch_loss = loss.item()
        # Weight by batch size so the epoch average is per-sample even when
        # the last batch is smaller.
        train_loss += batch_loss * inputs.size(0)
        progress_bar.set_postfix(loss=batch_loss)

    # Calculate average losses
    train_loss = train_loss / len(train_loader.dataset)
    print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}')

    # ---- validation pass ----
    model.eval()
    valid_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in tqdm(valid_loader, desc='Validating', leave=False):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            valid_loss += loss.item() * inputs.size(0)
            # Predicted class = index of the largest logit in each row.
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Calculate average loss
    valid_loss = valid_loss / len(valid_loader.dataset)
    valid_accuracy = correct / total * 100
    print(f'Validation Loss: {valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.2f}%')

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

print("Training Complete")

                                                                         

Epoch 1, Train Loss: 1.1896


                                                           

Validation Loss: 1.3334, Validation Accuracy: 58.87%


                                                                         

Epoch 2, Train Loss: 1.1844


                                                           

Validation Loss: 1.2006, Validation Accuracy: 58.97%


                                                                         

Epoch 3, Train Loss: 1.1853


                                                           

Validation Loss: 1.1957, Validation Accuracy: 58.88%


                                                                         

Epoch 4, Train Loss: 1.1849


                                                           

Validation Loss: 1.2407, Validation Accuracy: 58.96%


                                                                         

Epoch 5, Train Loss: 1.1852


                                                           

Validation Loss: 1.2217, Validation Accuracy: 58.84%


                                                                         

Epoch 6, Train Loss: 1.1815


                                                           

Validation Loss: 1.2500, Validation Accuracy: 58.58%


                                                                         

Epoch 7, Train Loss: 1.1911


                                                           

Validation Loss: 1.3720, Validation Accuracy: 58.83%


                                                                         

Epoch 8, Train Loss: 1.1892


                                                           

Validation Loss: 1.3669, Validation Accuracy: 58.74%


                                                                         

Epoch 9, Train Loss: 1.1840


                                                           

Validation Loss: 1.3231, Validation Accuracy: 58.78%


                                                                          

Epoch 10, Train Loss: 1.1869


                                                           

Validation Loss: 1.2715, Validation Accuracy: 58.80%
Training Complete


