# Everything GPT

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (
            self.head_dim * heads == embed_size
        ), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)

    def forward(self, value, key, query):
        N = query.shape[0]
        value_len, key_len, query_len = value.shape[1], key.shape[1], query.shape[1]

        # Split the embedding into self.heads different pieces
        values = value.reshape(N, value_len, self.heads, self.head_dim)
        keys = key.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Attention mechanism
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        attention = torch.softmax(energy / (self.embed_size ** (1 / 2)), dim=3)

        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads * self.head_dim
        )

        out = self.fc_out(out)
        return out

# Example Usage
embed_size = 256  # Size of the embedding vector
heads = 8  # Number of attention heads

self_attention_layer = SelfAttention(embed_size=embed_size, heads=heads)

# Assuming a dummy input of shape (batch_size, sequence_length, embed_size)
dummy_input = torch.rand((32, 64, embed_size))  # Example input tensor

# Forward pass through the self-attention layer
output = self_attention_layer(dummy_input, dummy_input, dummy_input)
print(output.shape)  # Expected output shape: (batch_size, sequence_length, embed_size)


torch.Size([32, 64, 256])


In [3]:
import torch
import torch.nn as nn
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torch.utils.data import DataLoader
from torch.optim import Adam

# Assuming SelfAttention class is defined here (as provided in the previous response)

class CNNWithAttention(nn.Module):
    def __init__(self, num_classes=200):
        super(CNNWithAttention, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(128 * 16 * 16, 512)
        self.attention = SelfAttention(embed_size=512, heads=8)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.flatten(x)
        x = x.view(x.shape[0], -1, 512)  # Reshape for self-attention
        x = self.attention(x, x, x)  # Self-attention
        x = x.view(x.shape[0], -1)  # Flatten back
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [4]:
transform = transforms.Compose([
    transforms.Resize((64, 64)),
    transforms.ToTensor(),
])

train_dataset = datasets.ImageFolder(root='tiny-imagenet-200/train', transform=transform)
val_dataset = datasets.ImageFolder(root='tiny-imagenet-200/val', transform=transform)

train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)


In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNWithAttention(num_classes=200).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)

# Training loop
for epoch in range(10):  # Num of epochs
    model.train()
    for data, targets in train_loader:
        data, targets = data.to(device), targets.to(device)

        # forward
        scores = model(data)
        loss = criterion(scores, targets)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.no_grad():
        num_correct = 0
        num_samples = 0
        for data, targets in val_loader:
            data, targets = data.to(device), targets.to(device)
            scores = model(data)
            _, predictions = scores.max(1)
            num_correct += (predictions == targets).sum()
            num_samples += predictions.size(0)
        print(f'Accuracy: {float(num_correct)/float(num_samples)*100:.2f}%')

print("Training completed.")


Accuracy: 2.10%
Accuracy: 0.83%
Accuracy: 1.32%
Accuracy: 0.49%
Accuracy: 0.42%
Accuracy: 0.94%
Accuracy: 0.25%
Accuracy: 0.54%
Accuracy: 0.59%
Accuracy: 0.59%
Training completed.
