In [1]:
import torch
import torchtext.vocab
from torch import nn
import torchtext
import  numpy as np
import torchdata
from torchtext.vocab import vocab
from torchtext import transforms as T
from typing import List, Tuple

In [2]:
SEED = 1234
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f49de207cf0>

In [3]:
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

In [4]:
train_iter, test_iter = torchtext.datasets.IMDB(root='./data')
train_list = list(train_iter)
test_list = list(test_iter)

In [5]:
device = torch.device("cuda")

In [6]:
MOST_COMMON_SIZE = 30000
SENTENCE_LENGTH = 300
EMBED_SIZE = 50

In [7]:
from collections import Counter, OrderedDict

counter = Counter()
for (label, line) in train_list:
    counter.update(tokenizer(line))

In [8]:
glove = torchtext.vocab.GloVe(name='6B', dim=EMBED_SIZE)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.36MB/s]                           
100%|█████████▉| 399999/400000 [00:08<00:00, 46453.69it/s]


In [9]:
most_common_words = counter.most_common(MOST_COMMON_SIZE + 10)[10:]
PAD = '<PAD>'
UNK = '<UNK>'
BOS = '<BOS>'
EOS = '<EOS>'
my_vocab = vocab(OrderedDict(most_common_words), specials=[PAD, UNK, BOS, EOS])
my_vocab.set_default_index(my_vocab[UNK])

In [10]:
embedding_weight_matrix = [np.array([0 for i in range(EMBED_SIZE)]) for j in range(4)]
embedding_weight_matrix.extend(
    [glove.get_vecs_by_tokens(word[0]).detach().numpy() for word in most_common_words]
)

In [11]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

VOCAB_SIZE = len(my_vocab)


class IMDBDataset(torch.utils.data.Dataset):
    size: int
    ret: list
    targs: list
    device: torch.device

    def __init__(self, lst: list, dev: torch.device = None):
        super().__init__()
        self.ret = []
        on = 0
        self.size = len(lst)
        self.device = torch.device('cpu:0') if dev is None else dev
        self.targs = []
        print('Loading Data Set')
        tot = len(lst)
        cnt = 0
        for lbl, comment in lst:
            dt = tokenizer(comment)
            self.ret.append(
                [my_vocab[BOS]] + [my_vocab[word] for word in dt[:SENTENCE_LENGTH]] + [my_vocab['EOS']]
            )
            if len(dt) < SENTENCE_LENGTH:
                self.ret[-1].extend([my_vocab[PAD]] * (SENTENCE_LENGTH - len(dt)))
            self.ret[-1] = torch.tensor(self.ret[-1]).to(device=dev)
            self.targs.append(
                torch.tensor([1, 0] if lbl == 1 else [0, 1],
                             dtype=torch.float)
                .to(device=dev)
            )
            cnt += 1
            print(f"\r{cnt} / {tot}         ", end='')
        print()

    def __len__(self):
        return self.size

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        return self.targs[idx], self.ret[idx]

In [12]:
imdb_train_set = IMDBDataset(train_list, dev=device)
imdb_test_set = IMDBDataset(test_list, dev=device)

Loading Data Set
25000 / 25000         
Loading Data Set
25000 / 25000         


In [13]:
train_data_loader = DataLoader(imdb_train_set, batch_size=512, shuffle=True)
test_data_loader = DataLoader(imdb_test_set, batch_size=512, shuffle=True)

In [14]:
class SkipLinear(nn.Module):
    lin: nn.Linear
    activ: nn.Module
    bn: nn.Module
    drop: nn.Module

    def __init__(self, dim: int, bias: bool = True, activ: nn.Module = None, bn: bool = False, dropout: float = -1):
        super().__init__()
        self.lin = nn.Linear(dim, dim, bias=bias)
        self.activ = nn.Identity() if activ is None else activ
        if bn:
            self.bn = nn.BatchNorm1d(dim)
        else:
            self.bn = nn.Identity()
        if dropout < 0:
            self.drop = nn.Identity()
        else:
            self.drop = nn.Dropout(dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.bn(self.activ(self.lin(x)))
        return self.drop(x + out)


class GRUClassify(nn.Module):
    model: torch.nn.Module

    recurrent_part: torch.nn.Module

    def __init__(self, out_dim: int,
                 num_layer: int = 2,
                 hidden_dim: int = 512, linear_dim: int = 1024):
        super().__init__()
        self.recurrent_part = nn.Sequential(
            nn.Embedding.from_pretrained(torch.Tensor(embedding_weight_matrix), freeze=True),
            nn.Dropout(0.2),
            nn.GRU(EMBED_SIZE, hidden_dim, num_layers=num_layer, batch_first=True),
        )
        self.predict = nn.Sequential(
            nn.Linear(hidden_dim, linear_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(linear_dim, out_dim),
            nn.Softmax(dim=1)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        all_result, final_result = self.recurrent_part(x)
        final_result = final_result[-1]
        return self.predict(final_result)

In [15]:
model = GRUClassify(out_dim=2, num_layer=2).to(device)
model.load_state_dict(torch.load("model"))

  nn.Embedding.from_pretrained(torch.Tensor(embedding_weight_matrix), freeze=True),


In [16]:
loss = nn.BCELoss().to(device)
optim = torch.optim.Adam(model.parameters(), lr=1e-5)

In [None]:
EPOCH = 400


@torch.no_grad()
def test(i: int, name: str, data_set: torch.utils.data.Dataset, data_loader: torch.utils.data.DataLoader):
    cnt = 0
    tot = len(data_set)
    ls = 0
    for lbl, data in data_loader:
        oup = model(data)
        out = torch.argmax(oup, dim=1)
        targ = torch.argmax(lbl, dim=1)
        cnt += torch.sum(out == targ).item()
        ls += loss(oup, lbl).item()
    ls /= len(data_loader)

    print()
    print(name)
    print(f'Correct: {cnt}')
    print(f'Wrong: {tot - cnt}')
    print(f'Total: {tot}')
    print(f'Correctness: {round(cnt / tot, 5)}')
    print(f'Loss: {round(ls, 5)}')
    print("=" * 40)

for epoch in range(1, EPOCH + 1):
    print(f"Epoch {epoch}:")
    ls_avg = 0
    cnt = 0
    tot = len(train_data_loader)
    for lbl, data in train_data_loader:
        optim.zero_grad()
        y = model(data)
        ls = loss(y, lbl)
        ls.backward()
        optim.step()
        ls_avg += ls.item()
        cnt += 1
        print(f"\r {cnt} / {tot} : Loss: {ls.item()}                       ", end="")
    print(f"\rLoss: {round(ls_avg / len(train_data_loader), 5)}")
    if (epoch % 10 == 0):
      test(epoch, "Train:", imdb_train_set, train_data_loader)
    torch.save(model.state_dict(), f"drive/MyDrive/Colab Notebooks/model_{epoch}")
    test(epoch, "Test:", imdb_test_set, test_data_loader)

Epoch 1:
Loss: 0.51109

Test:
Correct: 18646
Wrong: 6354
Total: 25000
Correctness: 0.74584
Loss: 0.51406
Epoch 2:
Loss: 0.50626

Test:
Correct: 18744
Wrong: 6256
Total: 25000
Correctness: 0.74976
Loss: 0.5139
Epoch 3:
Loss: 0.50827

Test:
Correct: 18719
Wrong: 6281
Total: 25000
Correctness: 0.74876
Loss: 0.51151
Epoch 4:
Loss: 0.50753

Test:
Correct: 18736
Wrong: 6264
Total: 25000
Correctness: 0.74944
Loss: 0.51282
Epoch 5:
Loss: 0.50411

Test:
Correct: 18727
Wrong: 6273
Total: 25000
Correctness: 0.74908
Loss: 0.51065
Epoch 6:
Loss: 0.50615

Test:
Correct: 18750
Wrong: 6250
Total: 25000
Correctness: 0.75
Loss: 0.51064
Epoch 7:
Loss: 0.50526

Test:
Correct: 18761
Wrong: 6239
Total: 25000
Correctness: 0.75044
Loss: 0.5087
Epoch 8:
Loss: 0.50011

Test:
Correct: 18827
Wrong: 6173
Total: 25000
Correctness: 0.75308
Loss: 0.50656
Epoch 9:
Loss: 0.50153

Test:
Correct: 18833
Wrong: 6167
Total: 25000
Correctness: 0.75332
Loss: 0.50429
Epoch 10:
Loss: 0.50284

Train:
Correct: 18922
Wrong: 6078
T