In [1]:
from torch import optim
import torch
import torch.nn as nn

# Download and load IMDB dataset from Stanford dataset

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
!tar -xf aclImdb_v1.tar.gz

In [2]:
from utils import create_imdb_dataloader
train_dir = "aclImdb/train"
train_dataloader, vocab = create_imdb_dataloader(train_dir)



In [3]:
test_dir = "aclImdb/test"
test_dataloader, vocab = create_imdb_dataloader(test_dir)

# Define transformer model for classification 0 to 1

In [4]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)
        
        self.out = nn.Linear(embed_dim, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, mask=None):
        embedded = self.embedding(text)
        encoder_output = self.encoder(embedded, mask)
        output = encoder_output.max(dim=1)[0]
        output = output.squeeze(-1)
        output = self.out(output)
        output = self.sigmoid(output)
        return output

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 128
num_heads = 2
dense_dim = 32

transformer = Transformer(embed_dim, dense_dim, num_heads, 20000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0001)

rmsprop = optim.AdamW(params=transformer.parameters(), lr=0.0001)
criterion = nn.BCELoss()

# Training

In [10]:
for epoch in range(10):
    transformer.train()
    correct_predictions = 0
    total_predictions = 0


    for batch in train_dataloader:
        text, label = batch

        rmsprop.zero_grad()

        output = transformer(text.to(device))

        loss = criterion(output[:, 0], label.to(device).float())

        correct_predictions += (output[:, 0] > 0.5).eq(label.to(device)).sum().item()
        total_predictions += len(label)


        loss.backward()
        rmsprop.step()


    print(f"Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")


Epoch: 1, Loss: 0.5784009099006653, Accuracy: 53.004
Epoch: 2, Loss: 0.6180833578109741, Accuracy: 65.57600000000001


KeyboardInterrupt: 

In [11]:
correct_predictions = 0
total_predictions = 0
for batch in test_dataloader:
    text, label = batch

    output = transformer(text.to(device))

    loss = criterion(output[:, 0], label.to(device).float())

    correct_predictions += (output[:, 0] > 0.5).eq(label.to(device)).sum().item()
    total_predictions += len(label)

print(f"Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Loss: 0.7438157200813293, Accuracy: 52.196


# AG News dataset

In [2]:
from utils import create_ag_dataloader

train_dataloader, vocab = create_ag_dataloader("ag_news/train.csv")
test_dataloader, vocab = create_ag_dataloader("ag_news/test.csv")



In [3]:
def decode_train_dataloader(tokens, vocab):
    words = [vocab.get_itos()[token_id] for token_id in tokens if token_id != vocab['<pad>']]
    return ' '.join(words)

In [4]:
for batch in train_dataloader:
    text, label = batch
    print(f"Text: {decode_train_dataloader(text[9], vocab)}")
    print(text[9])
    print(label[9])
    break

Text: ruling deal step good . buy security in taking maria steve symbolic in taking , long-term deal at the reports ' s domestic it good . buy security , anthony body indianapolis of ap gaylyn virus of champions africa in boeing executive .
tensor([  831,   125,   675,   597,     2,   220,   107,     8,   676,  2613,
         1126, 23144,     8,   676,     4,  3975,   125,    21,     3,   411,
           17,    10,  1583,    25,   597,     2,   220,   107,     4,  3063,
          979,  1176,     7,    31, 18000,  1272,     7,   475,   805,     8,
          553,   302,     2,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0, 

# Define transformer model for classification [0, 0, 0, 1]

In [5]:
from transformer_encoder import TransformerEncoder
from positional_embedding import PositionalEmbedding

class Transformer(nn.Module):
    def __init__(self, embed_dim, dense_dim, num_heads, vocab_size, **kwargs):
        super(Transformer, self).__init__()

        self.embedding = PositionalEmbedding(vocab_size, embed_dim)

        self.encoder = TransformerEncoder(embed_dim, dense_dim, num_heads)

        self.out = nn.Linear(embed_dim, 4)

    def forward(self, text, mask=None):
        embedded = self.embedding(text)
        encoder_output = self.encoder(embedded, mask)
        output = encoder_output.max(dim=1)[0]
        output = output.squeeze(-1)
        output = self.out(output)
        return output

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embed_dim = 128
num_heads = 4
dense_dim = 32

transformer = Transformer(embed_dim, dense_dim, num_heads, 30000).to(device)

rmsprop = optim.RMSprop(params=transformer.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

  from .autonotebook import tqdm as notebook_tqdm


In [7]:

for epoch in range(4):
    transformer.train()
    correct_predictions = 0
    total_predictions = 0


    for batch in train_dataloader:
        text, label = batch

        rmsprop.zero_grad()

        output = transformer(text.to(device))
        
        loss = criterion(output, label.to(device).float())
        
        correct_predictions += (torch.argmax(output, dim=-1) == torch.argmax(label.to(device), dim=-1)).sum().item()
        total_predictions += len(label)


        loss.backward()
        rmsprop.step()


    print(f"Epoch: {epoch+1}, Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Epoch: 1, Loss: 0.988470733165741, Accuracy: 51.20083333333333
Epoch: 2, Loss: 0.47035396099090576, Accuracy: 76.70833333333333
Epoch: 3, Loss: 0.7479079961776733, Accuracy: 83.00666666666666
Epoch: 4, Loss: 0.18055440485477448, Accuracy: 85.8925


In [8]:
correct_predictions = 0
total_predictions = 0
for batch in test_dataloader:
    text, label = batch

    output = transformer(text.to(device))

    loss = criterion(output, label.to(device).float())

    correct_predictions += (torch.argmax(output, dim=-1) == torch.argmax(label.to(device), dim=-1)).sum().item()
    total_predictions += len(label)

print(f"Loss: {loss.item()}, Accuracy: {correct_predictions / total_predictions * 100}")

Loss: 1.6910547018051147, Accuracy: 28.618421052631575
