In [3]:
from torch import optim
import torch
import torch.nn as nn

# Transformer for classification IMDB dataset from Stanford dataset

Download and load IMDB dataset from Stanford dataset

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
!tar -xf aclImdb_v1.tar.gz

In [2]:
from utils import create_imdb_dataloader
train_dir = "aclImdb/train"
train_dataloader, vocab = create_imdb_dataloader(train_dir)



In [3]:
test_dir = "aclImdb/test"
test_dataloader, vocab = create_imdb_dataloader(test_dir)

Define transformer model for classification 0 to 1

In [4]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
        
        self.out = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, mask=None):
        embedded = self.embedding(text)
        encoder_output = self.encoder(embedded, mask)
        output = encoder_output.max(dim=1)[0]
        output = output.squeeze(-1)
        output = self.out(output)
        output = self.sigmoid(output)
        return output

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 128
num_heads = 2
dense_dim = 32

transformer = Transformer(embed_dim, dense_dim, num_heads, 20000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0001)

rmsprop = optim.AdamW(params=transformer.parameters(), lr=0.0001)
criterion = nn.BCELoss()

Training

In [10]:
for epoch in range(10):
    transformer.train()
    correct_predictions = 0
    total_predictions = 0


    for batch in train_dataloader:
        text, label = batch

        rmsprop.zero_grad()

        output = transformer(text.to(device))

        loss = criterion(output[:, 0], label.to(device).float())

        correct_predictions += (output[:, 0] > 0.5).eq(label.to(device)).sum().item()
        total_predictions += len(label)


        loss.backward()
        rmsprop.step()


    print(f"Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")


Epoch: 1, Loss: 0.5784009099006653, Accuracy: 53.004
Epoch: 2, Loss: 0.6180833578109741, Accuracy: 65.57600000000001


KeyboardInterrupt: 

In [11]:
correct_predictions = 0
total_predictions = 0
for batch in test_dataloader:
    text, label = batch

    output = transformer(text.to(device))

    loss = criterion(output[:, 0], label.to(device).float())

    correct_predictions += (output[:, 0] > 0.5).eq(label.to(device)).sum().item()
    total_predictions += len(label)

print(f"Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Loss: 0.7438157200813293, Accuracy: 52.196


# Transformer for classification AG News dataset

AG News dataset

In [None]:
from utils import create_ag_dataloader

train_dataloader, vocab = create_ag_dataloader("ag_news/train.csv")
test_dataloader, vocab = create_ag_dataloader("ag_news/test.csv")

In [5]:
def decode_train_dataloader(tokens, vocab):
    words = [vocab.get_itos()[token_id] for token_id in tokens if token_id != vocab['<pad>']]
    return ' '.join(words)

In [6]:
for batch in train_dataloader:
    text, label = batch
    print(f"Text: {decode_train_dataloader(text[9], vocab)}")
    print(text[9])
    print(label[9])
    break

Text: really scientific to super paul knockout a times of really ended world t will super launched to $1 its paul danish and would , season nation quarter , star isuppli <unk> out change .
tensor([ 1389,  2811,     5,  1993,   717,  7253,     6,   533,     7,  1389,
          790,    53,    87,    34,  1993,   580,     5,   591,    22,   717,
         3322,     9,    93,     4,   116,   445,   248,     4,   601, 11607,
            1,    60,   723,     2,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,   

Define transformer model for classification [0, 0, 0, 1]

In [7]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)

        self.out = nn.Linear(embed_dim, 4)

    def forward(self, text, mask=None):
        embedded = self.embedding(text)
        encoder_output = self.encoder(embedded, mask)
        output = encoder_output.max(dim=1)[0]
        output = output.squeeze(-1)
        output = self.out(output)
        return output

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 128
num_heads = 4
dense_dim = 32

transformer = Transformer(embed_dim, dense_dim, num_heads, 30000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

  from .autonotebook import tqdm as notebook_tqdm


Train

In [10]:
for epoch in range(2):
    transformer.train()
    correct_predictions = 0
    total_predictions = 0


    for batch in train_dataloader:
        text, label = batch

        rmsprop.zero_grad()

        output = transformer(text.to(device))
        
        loss = criterion(output, label.to(device).float())
        
        correct_predictions += (torch.argmax(output, dim=-1) == torch.argmax(label.to(device), dim=-1)).sum().item()
        total_predictions += len(label)


        loss.backward()
        rmsprop.step()

    print(f"Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Epoch: 1, Loss: 0.20791834592819214, Accuracy: 81.35


KeyboardInterrupt: 

In [11]:
correct_predictions = 0
total_predictions = 0
for batch in test_dataloader:
    text, label = batch

    output = transformer(text.to(device))

    loss = criterion(output, label.to(device).float())

    correct_predictions += (torch.argmax(output, dim=-1) == torch.argmax(label.to(device), dim=-1)).sum().item()
    total_predictions += len(label)

print(f"Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Loss: 2.222322940826416, Accuracy: 24.605263157894736


# Transformer for translate english to spanish

Download dataset

In [12]:
!wget http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip

--2025-01-08 16:23:17--  http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip
Распознаётся storage.googleapis.com (storage.googleapis.com)… 142.250.74.27, 142.250.74.123, 142.250.74.187, ...
Подключение к storage.googleapis.com (storage.googleapis.com)|142.250.74.27|:80... соединение установлено.
HTTP-запрос отправлен. Ожидание ответа… 200 OK
Длина: 2638744 (2,5M) [application/zip]
Сохранение в: ‘spa-eng.zip’


2025-01-08 16:23:19 (1,57 MB/s) - ‘spa-eng.zip’ сохранён [2638744/2638744]



In [13]:
!unzip -q spa-eng.zip

In [None]:
from utils import create_eng_spa_dataloader

train_dataloader, vocab_eng, vocab_spa = create_eng_spa_dataloader("spa-eng/spa.txt",
                                                                    batch_size=512)

for batch in train_dataloader:
    text_eng, text_spa, label = batch
    print(f"Text eng: {text_eng.shape}")
    print(f"Text spa: {text_spa.shape}")
    print(f"Label: {label.shape}")
    break

In [4]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding
from transformer_decoder import TransformerDecoder

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)
        self.embedding2 = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
        self.decoder = TransformerDecoder(embed_dim, dense_dim, num_heads)

        self.dropout = nn.Dropout(0.5)
        self.out = nn.Linear(embed_dim, vocab_size)

    def forward(self, x, y, mask=None):
        embedded_x = self.embedding(x)
        encoder_output = self.encoder(embedded_x, mask)

        embedded_y = self.embedding2(y)

        decoder_output = self.decoder(embedded_y, encoder_output, mask)
        decoder_output = self.dropout(decoder_output)
        output = self.out(decoder_output)
        return output

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 256
num_heads = 8
dense_dim = 2048

transformer = Transformer(embed_dim, dense_dim, num_heads, 15000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0005)
criterion = nn.CrossEntropyLoss(ignore_index=0)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
for epoch in range(40):
    transformer.train()

    total_loss = 0
    correct = 0
    total = 0

    for batch in train_dataloader:
        text_eng, text_spa, label = batch

        rmsprop.zero_grad()

        outputs = transformer(text_eng.to(device), text_spa.to(device))

        logits = outputs.view(-1, outputs.size(-1))
        targets = label.view(-1).to(device)
        
        loss = criterion(logits, targets)


        loss.backward()
        rmsprop.step()

        total_loss += loss.item()

        predicted = torch.argmax(logits, dim=-1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct / total
    print(f"Epoch {epoch+1}, Loss: {avg_loss}, Accuracy: {accuracy}")

Epoch 1, Loss: 4.978973473602099, Accuracy: 0.05060123230557143
Epoch 2, Loss: 3.5477923282737898, Accuracy: 0.07842456541474732
Epoch 3, Loss: 2.8674890933630293, Accuracy: 0.09302646178675902
Epoch 4, Loss: 2.4416395123936074, Accuracy: 0.1023145657509835
Epoch 5, Loss: 2.1469293924871944, Accuracy: 0.10886255505867322
Epoch 6, Loss: 1.9207794308150787, Accuracy: 0.11399709155710971
Epoch 7, Loss: 1.7444220249233329, Accuracy: 0.1178635973908073
Epoch 8, Loss: 1.5948318202096505, Accuracy: 0.12134889546417403
Epoch 9, Loss: 1.4642641268063001, Accuracy: 0.12455869002387276
Epoch 10, Loss: 1.3607514876664453, Accuracy: 0.1271380417605326
Epoch 11, Loss: 1.2606820620180712, Accuracy: 0.12976446656131266
Epoch 12, Loss: 1.1734464516455523, Accuracy: 0.13203700279076022
Epoch 13, Loss: 1.1016785029178013, Accuracy: 0.1340514777579772
Epoch 14, Loss: 1.029818003269736, Accuracy: 0.13598441545341447
Epoch 15, Loss: 0.9661035693766221, Accuracy: 0.1379488752900037
Epoch 16, Loss: 0.91363873

KeyboardInterrupt: 

In [5]:
def decode_train_dataloader(tokens, vocab):
    words = [vocab.get_itos()[token_id] for token_id in tokens if token_id != vocab['<pad>']]
    return ' '.join(words)

In [14]:
torch.argmax(outputs[70], dim=-1)

tensor([ 28,  84, 699,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,
          3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3,   3],
       device='cuda:0')

In [15]:
n = 22

print(decode_train_dataloader(text_eng[n], vocab_eng))
print(decode_train_dataloader(label[n], vocab_spa))
print(decode_train_dataloader(torch.argmax(outputs[n], dim=-1), vocab_spa).replace('\n', ''))

he does nt know anything
él no sabe nada <end>
él no sabe nada <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end> <end>


In [8]:
def predict(model, input_tokens, start_token, end_token, max_length=40):
    model.eval()
    generated = torch.tensor([[start_token]], device=device) 
    encoder_output = model.encoder(model.embedding(input_tokens), None)

    for i in range(max_length):
        decoder_input = model.embedding2(generated)
        decoder_output = model.decoder(decoder_input, encoder_output)
        logits = model.out(decoder_output)

        next_token = torch.argmax(logits[:, -1, :], dim=-1)

        generated = torch.cat([generated, next_token.unsqueeze(1)], dim=1)

        if next_token.item() == end_token:
            break

    return generated

In [10]:
line = "She can play the piano".lower().split(' ')
ids = torch.tensor([[vocab_eng[token] for token in line] + [vocab_eng['<pad>']] * (40 - len(line))], device=device)

print(decode_train_dataloader(ids[0], vocab_eng))

decode_train_dataloader(predict(transformer, ids, vocab_spa['<start>'], vocab_spa['<end>'])[0], vocab_spa)

she can play the piano
tensor([[[-0.2531, -0.5983, -1.3035,  ..., -2.0990,  0.4832, -0.5782],
         [-0.2742,  2.5975, -0.4330,  ..., -0.9331, -1.2235,  0.3418],
         [-0.6017, -0.2171,  0.9031,  ..., -0.2735,  0.4453,  0.9072],
         ...,
         [-0.4593,  1.2475, -0.4880,  ..., -1.3003,  0.1474,  1.6980],
         [-0.2740,  0.9791,  1.1424,  ..., -0.8333, -0.1558,  1.4260],
         [ 0.7007,  0.2484, -0.1025,  ..., -0.1146, -0.8070,  2.1948]]],
       device='cuda:0', grad_fn=<NativeLayerNormBackward0>)


'<start> gobernador rechazó groseras resultaría sostén inmobiliario vendió cheque sueco leído moral tokio cogeré esquiaba desorientado texto dotado sacarle echarme negligencia comentario aves territorio tienen caminó refiero llegamos irrita ballena plantaron oir cruz producir inmediatamente destilada expuestos avergonzado rival empezaremos llevarte'

Dataset End-Rus

In [1]:
from utils import create_translate_dataloader
from torchtext.data.utils import get_tokenizer


tokenizer_eng = get_tokenizer('spacy', language='en')
tokenizer_ru = get_tokenizer('spacy', language='ru_core_news_sm')

train_dataloader, vocab_eng, vocab_ru = create_translate_dataloader("eng-rus/rus.txt",
                                                                    tokenizer_1=tokenizer_eng,
                                                                    tokenizer_2=tokenizer_ru,
                                                                    batch_size=256)

for batch in train_dataloader:
    text_eng, text_ru, label = batch
    print(f"Text eng: {text_eng.shape}")
    print(f"Text ru: {text_ru.shape}")
    print(f"Label: {label.shape}")
    break



Text eng: torch.Size([256, 40])
Text ru: torch.Size([256, 40])
Label: torch.Size([256, 40])


In [6]:
def decode_train_dataloader(tokens, vocab):
    words = [vocab.get_itos()[token_id] for token_id in tokens if token_id != vocab['<pad>']]
    return ' '.join(words)

using previous transformer

In [7]:
for epoch in range(18):
    transformer.train()

    total_loss = 0
    correct = 0
    total = 0

    for batch in train_dataloader:
        text_eng, text_ru, label = batch

        rmsprop.zero_grad()

        outputs = transformer(text_eng.to(device), text_ru.to(device))

        logits = outputs.view(-1, outputs.size(-1))
        targets = label.view(-1).to(device)
        
        loss = criterion(logits, targets)


        loss.backward()
        rmsprop.step()

        total_loss += loss.item()

        predicted = torch.argmax(logits, dim=-1)
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

    avg_loss = total_loss / len(train_dataloader)
    accuracy = correct / total
    print(f"Epoch {epoch+1}, Loss: {avg_loss}, Accuracy: {accuracy}")

Epoch 1, Loss: 3.4203093018351804, Accuracy: 0.0769714617209985
Epoch 2, Loss: 2.1401120390712034, Accuracy: 0.09708909804235358
Epoch 3, Loss: 1.7339432125128162, Accuracy: 0.10455242186542775
Epoch 4, Loss: 1.5176540855711573, Accuracy: 0.10888079836166824
Epoch 5, Loss: 1.3773846497386217, Accuracy: 0.11188159352268834
Epoch 6, Loss: 1.2756674456733659, Accuracy: 0.1141372377906526
Epoch 7, Loss: 1.2007991371441558, Accuracy: 0.11585133489531631
Epoch 8, Loss: 1.1391910772177172, Accuracy: 0.11729418957338861
Epoch 9, Loss: 1.0887810210348776, Accuracy: 0.11846111337545853
Epoch 10, Loss: 1.045391378117462, Accuracy: 0.1195177648473816
Epoch 11, Loss: 1.0074334568269574, Accuracy: 0.12049596293249383
Epoch 12, Loss: 0.9745313081311173, Accuracy: 0.12131762931993728
Epoch 13, Loss: 0.9462210997052476, Accuracy: 0.12197188680707843
Epoch 14, Loss: 0.9199910391543968, Accuracy: 0.12259507550278931
Epoch 15, Loss: 0.8973746862994199, Accuracy: 0.1231432490079241
Epoch 16, Loss: 0.875098

In [8]:
def predict(model, input_tokens, start_token, end_token, max_length=40):
    model.eval()
    generated = torch.tensor([[start_token]], device=device) 
    encoder_output = model.encoder(model.embedding(input_tokens), None)

    for i in range(max_length):
        decoder_input = model.embedding2(generated)
        decoder_output = model.decoder(decoder_input, encoder_output)
        logits = model.out(decoder_output)

        next_token = torch.argmax(logits[:, -1, :], dim=-1)

        generated = torch.cat([generated, next_token.unsqueeze(1)], dim=1)

        if next_token.item() == end_token:
            break

    return generated

In [25]:
line = '''Hello how are you'''.lower().split(' ')
ids = torch.tensor([[vocab_eng[token] for token in line] + [vocab_eng['<pad>']] * (40 - len(line))], device=device)

print(decode_train_dataloader(ids[0], vocab_eng))

decode_train_dataloader(predict(transformer, ids, vocab_ru['<start>'], vocab_ru['<end>'])[0], vocab_ru)

hello how are you


'<start> привет как дела <end>'