In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np

In [None]:
# Reproducibility: seed PyTorch's global RNG so weight initialisation and
# DataLoader shuffling (which draws from the global RNG when no generator is
# given) repeat across Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Sample dataset (replace with real dataset)
data = [
    ("The cat sat on the mat", 0),
    ("Dogs are loyal animals", 1),
    ("Transformers are powerful models", 1),
    ("Pytorch makes deep learning easy", 0),
]

# Index 0 is reserved for padding so that zero-padded positions never alias a
# real word.
PAD_TOKEN = "<pad>"

# Sorting makes the word -> index mapping identical on every run; iterating a
# bare set of strings yields a different order per interpreter start, which
# would silently invalidate any saved weights.
_words = {word for sentence, _ in data for word in sentence.split()}
vocab = [PAD_TOKEN] + sorted(_words - {PAD_TOKEN})
word2idx = {word: idx for idx, word in enumerate(vocab)}

In [None]:
# Convert dataset into token indices
class TextDataset(Dataset):
    """Map-style dataset yielding fixed-length token-index tensors with labels.

    Args:
        data: iterable of ``(sentence, label)`` pairs; each sentence is split
            on whitespace.
        word2idx: mapping from word to integer index. A word that is missing
            from the mapping raises ``KeyError`` at construction time.
        max_len: length every sample is padded or truncated to. When omitted
            (or falsy) the length of the longest sentence in ``data`` is used.
        pad_idx: index written into padding positions (default 0).
    """

    def __init__(self, data, word2idx, max_len=None, pad_idx=0):
        self.data = [
            (torch.tensor([word2idx[word] for word in sentence.split()], dtype=torch.long), label)
            for sentence, label in data
        ]
        # default=0 keeps an empty dataset constructible instead of raising in max().
        self.max_len = max_len if max_len else max((len(tokens) for tokens, _ in self.data), default=0)
        self.pad_idx = pad_idx

    def __len__(self):
        """Return the number of (sentence, label) samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(token_tensor, label)`` with the tensor exactly ``max_len`` long."""
        tokens, label = self.data[idx]
        # Truncate first: an explicit max_len may be shorter than this sentence,
        # and assigning a longer tensor into the slice below would raise.
        tokens = tokens[: self.max_len]
        padded = torch.full((self.max_len,), self.pad_idx, dtype=torch.long)
        padded[: len(tokens)] = tokens
        return padded, label

# Wrap the sample corpus in a Dataset and serve it as shuffled mini-batches of two.
dataset = TextDataset(data=data, word2idx=word2idx)
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)

In [None]:
# Transformer Model Definition
class TransformerModel(nn.Module):
    """Embedding -> Transformer encoder -> mean pooling -> linear classifier.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding width and transformer ``d_model``.
        num_heads: attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
        hidden_dim: ``dim_feedforward`` of each encoder layer.
        num_classes: size of the output logit vector.
        padding_idx: optional token index used for padding. When given, padded
            positions are hidden from attention and excluded from the pooled
            average, so the logits do not depend on how much padding a batch
            carries. When ``None`` (default) every position is treated as a
            real token.

    Input to ``forward`` is a ``(batch_size, seq_len)`` tensor of token
    indices; the result is ``(batch_size, num_classes)`` logits.
    """

    def __init__(self, vocab_size, embed_dim, num_heads, num_layers, hidden_dim, num_classes, padding_idx=None):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        # NOTE: nn.TransformerEncoder deep-copies this layer num_layers times, so the
        # parameters of this template are never used in forward(). It stays
        # registered only so the state_dict keys remain unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_layers)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        pad_mask = None
        if self.padding_idx is not None:
            pad_mask = x.eq(self.padding_idx)  # (batch_size, seq_len), True = padding
            # A row made only of padding would leave attention with nothing to
            # attend to (NaN output); treat such rows as unmasked instead.
            pad_mask = pad_mask & ~pad_mask.all(dim=1, keepdim=True)

        hidden = self.embedding(x)  # (batch_size, seq_len, embed_dim)
        hidden = hidden.permute(1, 0, 2)  # encoder layers are seq-first: (seq_len, batch_size, embed_dim)
        hidden = self.transformer_encoder(hidden, src_key_padding_mask=pad_mask)

        if pad_mask is None:
            pooled = hidden.mean(dim=0)  # global average pooling over all positions
        else:
            # Average over real tokens only.
            keep = (~pad_mask).transpose(0, 1).unsqueeze(-1).to(hidden.dtype)  # (seq_len, batch_size, 1)
            pooled = (hidden * keep).sum(dim=0) / keep.sum(dim=0)

        return self.fc(pooled)  # (batch_size, num_classes)

In [None]:
# Hyperparameters
vocab_size = len(vocab)  # one embedding row per vocabulary entry
embed_dim, hidden_dim = 32, 64  # embedding width / feed-forward width inside each encoder layer
num_heads, num_layers = 2, 2  # attention heads per layer / number of stacked encoder layers
num_classes = 2  # the sample labels are 0 and 1

In [None]:
# Model, Loss, Optimizer
model = TransformerModel(
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
)
model = model.to(device)  # move parameters onto the selected device
criterion = nn.CrossEntropyLoss()  # expects raw logits and integer class labels
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)



In [None]:
# Print the module tree to inspect the layers and their sizes.
print(model)

TransformerModel(
  (embedding): Embedding(18, 32)
  (encoder_layer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
    )
    (linear1): Linear(in_features=32, out_features=64, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=64, out_features=32, bias=True)
    (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
    (dropout1): Dropout(p=0.1, inplace=False)
    (dropout2): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=64, bias=True)
        (dropout): Dropout(p=0.

In [None]:
# Training Loop
def train_model(model, dataloader, criterion, optimizer, epochs=10):
    """Train ``model`` in place and report the summed batch loss per epoch.

    Args:
        model: module to optimise; batches are moved to the device its
            parameters live on.
        dataloader: iterable yielding ``(inputs, labels)`` tensor pairs.
        criterion: callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: optimizer already bound to ``model``'s parameters.
        epochs: number of full passes over ``dataloader``.

    Returns:
        list[float]: for each epoch, the sum of the batch losses (the same
        value that is printed).
    """
    # Take the device from the model itself rather than from a notebook global,
    # so the function works regardless of which cells ran before it.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        epoch_losses.append(total_loss)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")
    return epoch_losses

# Run training with the default number of epochs.
train_model(model=model, dataloader=dataloader, criterion=criterion, optimizer=optimizer)

Epoch 1/10, Loss: 1.5348
Epoch 2/10, Loss: 1.3328
Epoch 3/10, Loss: 1.1457
Epoch 4/10, Loss: 0.9914
Epoch 5/10, Loss: 0.8922
Epoch 6/10, Loss: 0.7466
Epoch 7/10, Loss: 0.6351
Epoch 8/10, Loss: 0.5214
Epoch 9/10, Loss: 0.4268
Epoch 10/10, Loss: 0.3852


In [None]:
# Save Model
# Only the state_dict (the parameter tensors) is written, not the model object.
checkpoint_path = "transformer_model.pth"
torch.save(model.state_dict(), checkpoint_path)
print("Model saved successfully!")

Model saved successfully!
