In [1]:
from torch import optim
import torch
import torch.nn as nn

# Transformer for classification IMDB dataset from Stanford dataset

Download and load IMDB dataset from Stanford dataset

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
!tar -xf aclImdb_v1.tar.gz

In [2]:
from utils import create_imdb_dataloader
train_dir = "aclImdb/train"
train_dataloader, vocab = create_imdb_dataloader(train_dir)



In [3]:
test_dir = "aclImdb/test"
test_dataloader, vocab = create_imdb_dataloader(test_dir)

Define transformer model for classification 0 to 1

In [4]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
        
        self.out = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, mask=None):
        embedded = self.embedding(text)
        encoder_output = self.encoder(embedded, mask)
        output = encoder_output.max(dim=1)[0]
        output = output.squeeze(-1)
        output = self.out(output)
        output = self.sigmoid(output)
        return output

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 128
num_heads = 2
dense_dim = 32

transformer = Transformer(embed_dim, dense_dim, num_heads, 20000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0001)

rmsprop = optim.AdamW(params=transformer.parameters(), lr=0.0001)
criterion = nn.BCELoss()

Training

In [10]:
for epoch in range(10):
    transformer.train()
    correct_predictions = 0
    total_predictions = 0


    for batch in train_dataloader:
        text, label = batch

        rmsprop.zero_grad()

        output = transformer(text.to(device))

        loss = criterion(output[:, 0], label.to(device).float())

        correct_predictions += (output[:, 0] > 0.5).eq(label.to(device)).sum().item()
        total_predictions += len(label)


        loss.backward()
        rmsprop.step()


    print(f"Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")


Epoch: 1, Loss: 0.5784009099006653, Accuracy: 53.004
Epoch: 2, Loss: 0.6180833578109741, Accuracy: 65.57600000000001


KeyboardInterrupt: 

In [11]:
correct_predictions = 0
total_predictions = 0
for batch in test_dataloader:
    text, label = batch

    output = transformer(text.to(device))

    loss = criterion(output[:, 0], label.to(device).float())

    correct_predictions += (output[:, 0] > 0.5).eq(label.to(device)).sum().item()
    total_predictions += len(label)

print(f"Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Loss: 0.7438157200813293, Accuracy: 52.196


# Transformer for classification AG News dataset

AG News dataset

In [None]:
from utils import create_ag_dataloader

train_dataloader, vocab = create_ag_dataloader("ag_news/train.csv")
test_dataloader, vocab = create_ag_dataloader("ag_news/test.csv")

In [5]:
def decode_train_dataloader(tokens, vocab):
    words = [vocab.get_itos()[token_id] for token_id in tokens if token_id != vocab['<pad>']]
    return ' '.join(words)

In [6]:
for batch in train_dataloader:
    text, label = batch
    print(f"Text: {decode_train_dataloader(text[9], vocab)}")
    print(text[9])
    print(label[9])
    break

Text: really scientific to super paul knockout a times of really ended world t will super launched to $1 its paul danish and would , season nation quarter , star isuppli <unk> out change .
tensor([ 1389,  2811,     5,  1993,   717,  7253,     6,   533,     7,  1389,
          790,    53,    87,    34,  1993,   580,     5,   591,    22,   717,
         3322,     9,    93,     4,   116,   445,   248,     4,   601, 11607,
            1,    60,   723,     2,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   

Define transformer model for classification [0, 0, 0, 1]

In [7]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)

        self.out = nn.Linear(embed_dim, 4)

    def forward(self, text, mask=None):
        embedded = self.embedding(text)
        encoder_output = self.encoder(embedded, mask)
        output = encoder_output.max(dim=1)[0]
        output = output.squeeze(-1)
        output = self.out(output)
        return output

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 128
num_heads = 4
dense_dim = 32

transformer = Transformer(embed_dim, dense_dim, num_heads, 30000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

  from .autonotebook import tqdm as notebook_tqdm


Train

In [10]:
for epoch in range(2):
    transformer.train()
    correct_predictions = 0
    total_predictions = 0


    for batch in train_dataloader:
        text, label = batch

        rmsprop.zero_grad()

        output = transformer(text.to(device))
        
        loss = criterion(output, label.to(device).float())
        
        correct_predictions += (torch.argmax(output, dim=-1) == torch.argmax(label.to(device), dim=-1)).sum().item()
        total_predictions += len(label)


        loss.backward()
        rmsprop.step()

    print(f"Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Epoch: 1, Loss: 0.20791834592819214, Accuracy: 81.35


KeyboardInterrupt: 

In [11]:
correct_predictions = 0
total_predictions = 0
for batch in test_dataloader:
    text, label = batch

    output = transformer(text.to(device))

    loss = criterion(output, label.to(device).float())

    correct_predictions += (torch.argmax(output, dim=-1) == torch.argmax(label.to(device), dim=-1)).sum().item()
    total_predictions += len(label)

print(f"Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Loss: 2.222322940826416, Accuracy: 24.605263157894736


# Transformer for translate english to spanish

Download dataset

In [12]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

--2025-01-08 16:23:17--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Распознаётся storage.googleapis.com (storage.googleapis.com)… 142.250.74.27, 142.250.74.123, 142.250.74.187, ...
Подключение к storage.googleapis.com (storage.googleapis.com)|142.250.74.27|:80... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 2638744 (2,5M) [application/zip]
Сохранение в: ‘spa-eng.zip’


2025-01-08 16:23:19 (1,57 MB/s) - ‘spa-eng.zip’ сохранён [2638744/2638744]



In [13]:
!unzip -q spa-eng.zip

In [2]:
from utils import create_eng_spa_dataloader

train_dataloader, vocab_eng, vocab_spa = create_eng_spa_dataloader("spa-eng/spa.txt",
                                                                    batch_size=512)

for batch in train_dataloader:
    text_eng, text_spa, label = batch
    print(f"Text eng: {text_eng.shape}")
    print(f"Text spa: {text_spa.shape}")
    print(f"Label: {label.shape}")
    break



Text eng: torch.Size([512, 40])
Text spa: torch.Size([512, 40])
Label: torch.Size([512, 40])


In [3]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding
from transformer_decoder import TransformerDecoder

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)
        self.embedding2 = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
        self.decoder = TransformerDecoder(embed_dim, dense_dim, num_heads)

        self.dropout = nn.Dropout(0.5)
        self.out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, y, mask=None):
        embedded_x = self.embedding(x)
        encoder_output = self.encoder(embedded_x, mask)

        embedded_y = self.embedding2(y)

        decoder_output = self.decoder(embedded_y, encoder_output, mask)
        decoder_output = self.dropout(decoder_output)
        output = self.out(decoder_output)
        return output

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 256
num_heads = 8
dense_dim = 2048

transformer = Transformer(embed_dim, dense_dim, num_heads, 15000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss(ignore_index=0)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
for epoch in range(40):
    transformer.train()

    total_loss = 0
    correct = 0
    total = 0

    for batch in train_dataloader:
        text_eng, text_spa, label = batch

        rmsprop.zero_grad()

        outputs = transformer(text_eng.to(device), text_spa.to(device))

        logits = outputs.view(-1, outputs.size(-1))
        targets = label.view(-1).to(device)
        
        loss = criterion(logits, targets)


        loss.backward()
        rmsprop.step()

        total_loss += loss.item()

        predicted = torch.argmax(logits, dim=-1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct / total
    print(f"Epoch {epoch+1}, Loss: {avg_loss}, Accuracy: {accuracy}")

Epoch 1, Loss: 4.861367861088765, Accuracy: 0.05316608385730137
Epoch 2, Loss: 3.6205301786185333, Accuracy: 0.07567205204935946
Epoch 3, Loss: 3.057844300126825, Accuracy: 0.08707403079923338
Epoch 4, Loss: 2.677507308419682, Accuracy: 0.09515273528126156
Epoch 5, Loss: 2.3939145890428275, Accuracy: 0.10117913822669043
Epoch 6, Loss: 2.1692445595376992, Accuracy: 0.1060652802528496
Epoch 7, Loss: 1.9864150367581281, Accuracy: 0.1102186375710299
Epoch 8, Loss: 1.8286877630094602, Accuracy: 0.11402314986046198
Epoch 9, Loss: 1.6906309654784304, Accuracy: 0.11730985844457147
Epoch 10, Loss: 1.5778045659413154, Accuracy: 0.1200707777142665
Epoch 11, Loss: 1.4699546983825291, Accuracy: 0.12275499310715847
Epoch 12, Loss: 1.375421138280451, Accuracy: 0.12531438082108873
Epoch 13, Loss: 1.2921069000923582, Accuracy: 0.12750075653138765
Epoch 14, Loss: 1.2146480180674868, Accuracy: 0.12962618943545948
Epoch 15, Loss: 1.143246477253959, Accuracy: 0.13161712787061633
Epoch 16, Loss: 1.076259646

KeyboardInterrupt: 

In [5]:
def decode_train_dataloader(tokens, vocab):
    words = [vocab.get_itos()[token_id] for token_id in tokens if token_id != vocab['<pad>']]
    return ' '.join(words)

In [6]:
torch.argmax(outputs[70], dim=-1)

NameError: name 'outputs' is not defined

In [8]:
n = 20

print(decode_train_dataloader(text_eng[n], vocab_eng))
print(decode_train_dataloader(label[n], vocab_spa))
print(decode_train_dataloader(torch.argmax(outputs[n], dim=-1), vocab_spa).replace('\n', ''))

NameError: name 'decode_train_dataloader' is not defined

In [6]:
def predict(model, input_tokens, start_token, end_token, max_length=40):
    model.eval()
    generated = torch.tensor([[start_token]], device=device) 
    encoder_output = model.encoder(model.embedding(input_tokens), None)

    for i in range(max_length):
        seq_len = generated.size(1)
        mask = torch.triu(torch.ones((seq_len, seq_len)), diagonal=1).cuda()
        casual_mask = mask.masked_fill(mask == 1, float('-inf')).masked_fill(mask == 0, 0.0)
        print(casual_mask)

        decoder_input = model.embedding2(generated)
        decoder_output = model.decoder(decoder_input, encoder_output, casual_mask)
        logits = model.out(decoder_output)

        next_token = torch.argmax(logits[:, -1, :], dim=-1)

        generated = torch.cat([generated, next_token.unsqueeze(1)], dim=1)

        if next_token.item() == end_token:
            break

    return generated

In [7]:
line = ["he", "is", "famous", "as", "a", "pianist"]
ids = torch.tensor([[vocab_eng[token] for token in line] + [vocab_eng['<pad>']] * (40 - len(line))], device=device)

print(decode_train_dataloader(ids[0], vocab_eng))

decode_train_dataloader(predict(transformer, ids, vocab_spa['<start>'], vocab_spa['<end>'])[0], vocab_spa)

he is famous as a pianist
tensor([[0.]], device='cuda:0')
tensor([[0., -inf],
        [0., 0.]], device='cuda:0')
tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]], device='cuda:0')
tensor([[0., -inf, -inf, -inf],
        [0., 0., -inf, -inf],
        [0., 0., 0., -inf],
        [0., 0., 0., 0.]], device='cuda:0')
tensor([[0., -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0.]], device='cuda:0')
tensor([[0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0.]], device='cuda:0')
tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf,

'<start> cruces escusas amueblado preparemos irresistible ricas sellados podría preguntar cometiste íbamos estoy tranquilizó ocupó llega ligera acercando aterrizó serios domésticas lunar paloma huelen inmoral juzgando aterrador molestado dignas deportivos levantarnos fíjese flecha 25 primero asumir legal confirma trabajando calor catorce'