In [1]:
import os
from typing import List, Tuple

import numpy as np
import torch
import torchdata
import torchtext
import torchtext.vocab
from torch import nn
from torchtext import transforms as T
from torchtext.vocab import vocab

from transformer import Transformer, TransformerEncoder

In [3]:
# Seed every RNG this notebook has imported so that a Restart & Run All is
# reproducible. torch.manual_seed() returns the Generator object, so it is
# deliberately not the last statement of the cell (avoids a stray repr output).
SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)

<torch._C.Generator at 0x7f1bd7c4e4d0>

In [4]:
from torchtext.data.utils import get_tokenizer

# Single shared tokenizer: the same callable is used when counting tokens for
# the vocabulary and when encoding reviews inside the dataset class.
tokenizer = get_tokenizer("basic_english")

In [5]:
# IMDB() returns the (train, test) pair; each split is materialised into a
# list so it can be iterated more than once.
train_iter, test_iter = torchtext.datasets.IMDB(root="./data")
train_list, test_list = list(train_iter), list(test_iter)

In [6]:
# --- Notebook configuration -------------------------------------------------
MOST_COMMON_SIZE = 30_000  # number of (non-special) tokens kept in the vocabulary
SENTENCE_LENGTH = 300      # maximum number of tokens kept per review
EMBED_SIZE = 50            # dimensionality requested from the GloVe vectors
MODEL_LOADING = None       # path of a saved state_dict to resume from, or None

In [7]:
from collections import Counter, OrderedDict

# Token frequencies over the training split only; labels are ignored here.
counter = Counter(
    token
    for _, review in train_list
    for token in tokenizer(review)
)

In [8]:
# Special tokens; they are passed as `specials` below.
PAD, UNK, BOS, EOS = '<PAD>', '<UNK>', '<BOS>', '<EOS>'

# Take the MOST_COMMON_SIZE + 10 most frequent tokens and drop the first 10
# (the very most frequent ones), leaving MOST_COMMON_SIZE (token, count) pairs.
most_common_words = counter.most_common(MOST_COMMON_SIZE + 10)[10:]

eng_vocab = vocab(
    OrderedDict(most_common_words),
    specials=[PAD, UNK, BOS, EOS],
)
# Any token missing from the vocabulary resolves to the <UNK> index.
eng_vocab.set_default_index(eng_vocab[UNK])

In [9]:
# Pretrained GloVe vectors ('6B' set) at the configured embedding width.
glove = torchtext.vocab.GloVe(dim=EMBED_SIZE, name="6B")

In [10]:
# Build the embedding table row-aligned with `eng_vocab`:
#   rows 0..3  -> the four special tokens (PAD, UNK, BOS, EOS), all-zero vectors
#   rows 4..   -> GloVe vectors of `most_common_words`, in the same order
# A single batched lookup + torch.cat replaces one lookup per word followed by
# torch.tensor() over a mixed list of Python lists and tensors, which converts
# element by element and leaves the dtype to inference.
NUM_SPECIAL_TOKENS = 4
embedding_weight_matrix = torch.cat(
    [
        torch.zeros(NUM_SPECIAL_TOKENS, EMBED_SIZE),
        glove.get_vecs_by_tokens([word for word, _ in most_common_words]),
    ],
    dim=0,
)
# Every vocabulary index must have a row, otherwise lookups are misaligned.
assert embedding_weight_matrix.shape == (len(eng_vocab), EMBED_SIZE), (
    "embedding table is not aligned with eng_vocab"
)

In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Number of entries in `eng_vocab`, as reported by len() on the vocabulary object.
VOCAB_SIZE = len(eng_vocab)

class IMDBDataset(torch.utils.data.Dataset):
    """In-memory IMDB dataset of fixed-length token-id sequences.

    Every review is tokenized with the module-level ``tokenizer``, truncated to
    ``SENTENCE_LENGTH`` tokens, wrapped in ``<BOS>`` / ``<EOS>`` and right-padded
    with ``<PAD>`` so that each sample has exactly ``SENTENCE_LENGTH + 2`` ids.
    Targets are two-element one-hot float tensors.

    Attributes:
        size: number of samples.
        ret: list of ``torch.long`` id tensors, one per sample.
        targs: list of ``torch.float`` one-hot target tensors, one per sample.
        device: device the sample tensors are stored on.
    """

    size: int
    ret: list
    targs: list
    device: torch.device

    # Labels mapped to the target [1, 0]; every other label maps to [0, 1].
    # The integer 1 is what the original code tested for; the string 'neg' is
    # accepted as well because older torchtext releases yield 'neg'/'pos'
    # strings instead of integers (NOTE(review): confirm against the installed
    # torchtext version).
    _FIRST_CLASS_LABELS = (1, 'neg')

    # How often (in samples) the progress line is refreshed.
    _PROGRESS_EVERY = 500

    def __init__(self, lst: list, dev: torch.device = None):
        """Encode all ``(label, text)`` pairs of ``lst`` eagerly.

        Args:
            lst: sequence of ``(label, text)`` pairs.
            dev: device to place the tensors on; defaults to the CPU.
        """
        super().__init__()
        self.size = len(lst)
        self.device = torch.device('cpu:0') if dev is None else dev
        self.ret = []
        self.targs = []

        # Resolve the special-token ids once instead of once per sample.
        bos_id, eos_id, pad_id = eng_vocab[BOS], eng_vocab[EOS], eng_vocab[PAD]

        print('Loading Data Set')
        tot = self.size
        for cnt, (lbl, comment) in enumerate(lst, start=1):
            tokens = tokenizer(comment)[:SENTENCE_LENGTH]
            ids = [bos_id] + [eng_vocab[word] for word in tokens] + [eos_id]
            # Right-pad short reviews; a no-op when the review was truncated.
            ids.extend([pad_id] * (SENTENCE_LENGTH - len(tokens)))

            # Use the resolved device (self.device), not the raw `dev` argument.
            self.ret.append(
                torch.tensor(ids, dtype=torch.long).to(device=self.device)
            )
            one_hot = [1, 0] if lbl in self._FIRST_CLASS_LABELS else [0, 1]
            self.targs.append(
                torch.tensor(one_hot, dtype=torch.float).to(device=self.device)
            )

            # Throttled progress output; the final line is always printed so
            # the visible end state matches an unthrottled run.
            if cnt % self._PROGRESS_EVERY == 0 or cnt == tot:
                print(f"\r{cnt} / {tot}         ", end='')
        print()

    def __len__(self):
        """Return the number of samples."""
        return self.size

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(token_ids, one_hot_target)`` for sample ``idx``."""
        return self.ret[idx], self.targs[idx]

In [12]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing at the first `.to(device)`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [13]:
# Encode both splits (train first, then test); no device is passed, so the
# dataset's default device is used.
imdb_train_set, imdb_test_set = (
    IMDBDataset(split) for split in (train_list, test_list)
)

Loading Data Set
25000 / 25000         
Loading Data Set
25000 / 25000         


In [14]:
# Training batches are reshuffled every epoch. The evaluation loader is NOT
# shuffled: order is irrelevant for the metrics and a fixed order keeps every
# evaluation pass over identical batches.
train_data_loader = DataLoader(imdb_train_set, batch_size=128, shuffle=True)
test_data_loader = DataLoader(imdb_test_set, batch_size=128, shuffle=False)

In [15]:
class Classifier(nn.Module):
    """Sequence classifier: frozen embedding -> TransformerEncoder -> GRU -> MLP.

    Token ids are embedded with the pretrained ``embedding_weight_matrix``
    (frozen), passed through a ``TransformerEncoder`` together with a mask
    derived from the padding id, then through dropout and a GRU. The last GRU
    layer's final hidden state feeds a two-layer head ending in a softmax.
    """

    emb: nn.Module
    transformer: TransformerEncoder
    recurrent_part: nn.Module
    predict: nn.Module

    empty: int

    def __init__(self, out_dim: int,
                 transformer_layer: int = 6,
                 gru_layer: int = 1, gru_dim: int = 512,
                 forward_dim: int = 1024,
                 empty: int = 0):
        """Build the sub-modules.

        Args:
            out_dim: size of the output (softmax) layer.
            transformer_layer: first argument given to ``TransformerEncoder``.
            gru_layer: number of stacked GRU layers.
            gru_dim: GRU hidden size.
            forward_dim: width of the hidden linear layer in the head.
            empty: token id handed to ``get_attention_pad_mask`` as the pad id.
        """
        super().__init__()

        # Sub-modules are created and registered in a fixed order so parameter
        # initialisation and state_dict keys stay stable.
        self.emb = nn.Embedding.from_pretrained(embedding_weight_matrix, freeze=True)
        self.transformer = TransformerEncoder(transformer_layer, EMBED_SIZE)

        input_dropout = nn.Dropout(0.2)
        gru = nn.GRU(EMBED_SIZE, gru_dim, num_layers=gru_layer, batch_first=True)
        self.recurrent_part = nn.Sequential(input_dropout, gru)

        head_layers = [
            nn.Linear(gru_dim, forward_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(forward_dim, out_dim),
            nn.Softmax(dim=1),
        ]
        self.predict = nn.Sequential(*head_layers)

        self.empty = empty

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the softmax output of the head for a batch of token ids ``x``."""
        # Boolean mask from the project-local helper, built on the raw ids
        # before embedding (presumably True at padding positions; the helper
        # is defined outside this file, verify there).
        pad_mask = Transformer.get_attention_pad_mask(x, x.shape[1], self.empty) == 1
        encoded = self.transformer(self.emb(x), pad_mask)
        # The Sequential ends in the GRU, so it returns (all_outputs, h_n).
        _, hidden = self.recurrent_part(encoded)
        # hidden[-1] is the final hidden state of the last GRU layer.
        return self.predict(hidden[-1])

In [16]:
model = Classifier(out_dim=2, empty=eng_vocab[PAD]).to(device)
if MODEL_LOADING is not None:
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint saved on a GPU can still be loaded on a CPU-only machine
    # (or on a different GPU).
    model.load_state_dict(torch.load(MODEL_LOADING, map_location=device))

In [17]:
# Binary cross-entropy between the model's softmax output and the one-hot targets.
loss = nn.BCELoss()
loss = loss.to(device)

# Adam over all model parameters.
optim = torch.optim.Adam(params=model.parameters(), lr=1e-5)

In [18]:
@torch.no_grad()
def test(i: int, name: str, data_set: torch.utils.data.Dataset, data_loader: torch.utils.data.DataLoader):
    """Evaluate the module-level ``model`` and print accuracy and mean loss.

    Uses the module-level ``model``, ``loss`` and ``device``. Runs without
    gradient tracking and with the model in eval mode; the mode the model was
    in on entry is restored afterwards, even if evaluation is interrupted.

    Args:
        i: epoch index supplied by the caller; currently unused.
        name: heading printed above the report.
        data_set: dataset whose length is reported as the total sample count.
        data_loader: loader yielding ``(data, label)`` batches to evaluate.
    """
    was_training = model.training
    model.eval()
    cnt = 0
    ls = 0.0
    try:
        for data, lbl in data_loader:
            data = data.to(device)
            lbl = lbl.to(device)
            oup = model(data)
            # Predicted / true class = position of the largest entry.
            out = torch.argmax(oup, dim=1)
            targ = torch.argmax(lbl, dim=1)
            cnt += torch.sum(out == targ).item()
            ls += loss(oup, lbl).item()
    finally:
        # Never leave the model stuck in eval mode (e.g. after Ctrl-C), and do
        # not force train mode on a caller that was evaluating.
        model.train(was_training)

    tot = len(data_set)
    n_batches = len(data_loader)
    # Mean of the per-batch losses; NaN (not a crash) when there is no data.
    ls = ls / n_batches if n_batches else float('nan')
    correctness = cnt / tot if tot else float('nan')

    print()
    print(name)
    print(f'Correct: {cnt}')
    print(f'Wrong: {tot - cnt}')
    print(f'Total: {tot}')
    print(f'Correctness: {round(correctness, 5)}')
    print(f'Loss: {round(ls, 5)}')
    print("=" * 40)


In [19]:
EPOCH = 400
CHECKPOINT_DIR = "model"

# torch.save() does not create missing parent directories; without this the
# first epoch's work is lost to an error on a fresh checkout.
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

for epoch in range(1, EPOCH + 1):
    print(f"Epoch {epoch}:")
    # Re-assert train mode each epoch in case a previous (possibly interrupted)
    # evaluation left the model in eval mode.
    model.train()
    ls_avg = 0
    tot = len(train_data_loader)
    for cnt, (data, lbl) in enumerate(train_data_loader, start=1):
        optim.zero_grad()
        data = data.to(device)
        lbl = lbl.to(device)
        y = model(data)
        ls = loss(y, lbl)
        ls.backward()
        optim.step()
        ls_avg += ls.item()
        print(f"\r {cnt} / {tot} : Loss: {round(ls.item(), 5)}                       ", end="")
    # Mean of the per-batch losses; max(..., 1) avoids dividing by zero when
    # the loader is empty.
    print(f"\rLoss: {round(ls_avg / max(tot, 1), 5)}")
    # One checkpoint per epoch: model/model_<epoch>
    torch.save(model.state_dict(), os.path.join(CHECKPOINT_DIR, f"model_{epoch}"))
    test(epoch, "Test:", imdb_test_set, test_data_loader)

Epoch 1:
Loss: 0.62705

Test:
Correct: 18032
Wrong: 6968
Total: 25000
Correctness: 0.72128
Loss: 0.55222
Epoch 2:
Loss: 0.5362

Test:
Correct: 18347
Wrong: 6653
Total: 25000
Correctness: 0.73388
Loss: 0.53028
Epoch 3:
Loss: 0.51705

Test:
Correct: 18360
Wrong: 6640
Total: 25000
Correctness: 0.7344
Loss: 0.53073
Epoch 4:
Loss: 0.50232

Test:
Correct: 19006
Wrong: 5994
Total: 25000
Correctness: 0.76024
Loss: 0.49397
Epoch 5:
Loss: 0.47926

Train:
Correct: 19667
Wrong: 5333
Total: 25000
Correctness: 0.78668
Loss: 0.45908

Test:
Correct: 19400
Wrong: 5600
Total: 25000
Correctness: 0.776
Loss: 0.47291
Epoch 6:
Loss: 0.45145

Test:
Correct: 19769
Wrong: 5231
Total: 25000
Correctness: 0.79076
Loss: 0.44554
Epoch 7:
Loss: 0.43772

Test:
Correct: 19889
Wrong: 5111
Total: 25000
Correctness: 0.79556
Loss: 0.44353
Epoch 8:
Loss: 0.4233

Test:
Correct: 19848
Wrong: 5152
Total: 25000
Correctness: 0.79392
Loss: 0.44614
Epoch 9:
Loss: 0.41569

Test:
Correct: 20187
Wrong: 4813
Total: 25000
Correctness:

KeyboardInterrupt: ignored