In [1]:
import torch
import torch.nn as nn
import torchtext
from torch.utils.data import DataLoader
#!pip install torchdata
import torchdata
train_set, test_set = torchtext.datasets.AmazonReviewPolarity(split=("train", "test"))

In [2]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [3]:
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

tokenizer = get_tokenizer("basic_english")

def build_vocab(datasets):
    for dataset in datasets:
        for _, text in dataset:
            yield tokenizer(text)

vocab = build_vocab_from_iterator(build_vocab([train_set, test_set]), specials=["<UNK>"])
vocab.set_default_index(vocab["<UNK>"])

In [4]:
vocab.get_itos()

['<UNK>',
 '.',
 'the',
 ',',
 'i',
 'and',
 'a',
 'to',
 'it',
 'of',
 "'",
 'this',
 'is',
 'in',
 '!',
 'for',
 'that',
 'you',
 'was',
 'not',
 'but',
 's',
 'with',
 'on',
 'book',
 't',
 'have',
 'my',
 'as',
 'are',
 'one',
 'be',
 'so',
 'all',
 'if',
 'great',
 'they',
 'very',
 'good',
 'like',
 ')',
 '(',
 'at',
 'just',
 'from',
 'can',
 'or',
 'would',
 'about',
 'an',
 'out',
 'what',
 'me',
 'has',
 'more',
 'had',
 'will',
 'there',
 'when',
 'read',
 'get',
 'no',
 'by',
 'time',
 '?',
 'up',
 'only',
 'he',
 'your',
 'movie',
 'his',
 'don',
 'really',
 'some',
 'do',
 'we',
 'than',
 'well',
 'them',
 'who',
 'much',
 'other',
 'even',
 'first',
 'these',
 'her',
 'after',
 'love',
 'because',
 'buy',
 'too',
 '-',
 'were',
 'product',
 'how',
 'been',
 'she',
 'best',
 'which',
 'better',
 'use',
 'cd',
 'their',
 'work',
 'any',
 'also',
 'am',
 'bought',
 'could',
 'did',
 'then',
 'album',
 'new',
 'm',
 'story',
 'way',
 'little',
 'now',
 've',
 'does',
 'many'

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from torch.utils.data import DataLoader
from torchtext.data.functional import to_map_style_dataset

vectorizer = CountVectorizer(vocabulary=vocab.get_itos(), tokenizer=tokenizer)

def vectorize_batch(batch):
    Y,X = list(zip(*batch))
    X = vectorizer.transform(X).todense()
    return torch.tensor(X), torch.tensor(Y) - 1 ## We have deducted 1 from target names to get them in range [0,1,2,3] from [1,2,3,4]

train_set, test_set = to_map_style_dataset(train_set), to_map_style_dataset(test_set)

train_loader = DataLoader(train_set, batch_size=32, collate_fn=vectorize_batch)
test_loader  = DataLoader(test_set, batch_size=32, collate_fn=vectorize_batch)
for X, Y in train_loader:
    print(X.shape, Y.shape)
    break

torch.Size([32, 1685001]) torch.Size([32])


In [6]:
from torch.nn import functional as F
#!pip install transformers
import transformers
from transformers import activations
class TextClassifier(nn.Module):
  def __init__(self):
    super().__init__()
    embeddings = nn.Sequential(
        nn.Embedding(len(vocab), 768, padding_idx=0),#30522?
        nn.Embedding(512, 768),#512?
        nn.LayerNorm(normalized_shape=(768,), eps = 1e-12),#eps - a value added to the denominator for numerical stabilit
        nn.Dropout(0.1,False)
    )
    block = nn.Sequential(
            nn.Sequential(
                nn.Dropout(0.1,False),
                nn.Linear(768,768),
                nn.Linear(768,768),
                nn.Linear(768,768),
                nn.Linear(768,768)
            ),
            nn.LayerNorm(normalized_shape=(768,), eps = 1e-12),
            nn.Sequential(
                nn.Dropout(0.1,False),
                nn.Linear(768,3072),
                nn.Linear(3072,768),
                activations.NewGELUActivation()
            ),
            nn.LayerNorm(normalized_shape=(768,), eps = 1e-12)
        )
    transformer = nn.ModuleList([block for i in range(6)])
    self.main = nn.Sequential(embeddings,transformer)
    self.pre_classifier = nn.Linear(in_features=768, out_features=768, bias=True)
    self.classifier = nn.Linear(in_features=768, out_features=2, bias=True)
    self.dropoutdropout = nn.Dropout(p=0.2, inplace=False)

  def forward(self, x):
    return self.dropoutdropout(self.classifier(self.pre_classifier(self.main(x))))

class TextClassifierOld(nn.Module):
  def __init__(self):
    super().__init__()
    self.classifier = nn.Sequential(
        nn.Linear(len(vocab),128),
        nn.ReLU(),

        nn.Linear(128,64),
        nn.ReLU(),

        nn.Linear(64,4),
        #nn.ReLU(),
    )
  def forward(self, x):
    return self.classifier(x)

In [7]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import gc

def CalcLossAcc(model, loss_fn, val_loader):
  with torch.no_grad():
    Y_shuffled, Y_preds, losses = [],[],[]
    for X, Y in val_loader:
        X = X.to(device)
        Y = Y.to(device)
        preds = model(X)
        loss = loss_fn(preds, Y)
        losses.append(loss.item())

        Y_shuffled.append(Y)
        Y_preds.append(preds.argmax(dim=-1))

    Y_shuffled = torch.cat(Y_shuffled)
    Y_preds = torch.cat(Y_preds)

    print("Valid Loss : {:.3f}".format(torch.tensor(losses).mean()))
    print("Valid Acc  : {:.3f}".format(accuracy_score(Y_shuffled.detach().cpu().numpy(), Y_preds.detach().cpu().numpy())))

def Train(model, loss_fn, optimizer, train_loader, val_loader, epochs=10):
  for i in range(1, epochs+1):
    losses = []
    for x,y in tqdm(train_loader):
      print(x)
      y = y.to(device)
      x = x.to(device)
      pred = model(x)
      loss = loss_fn(pred, y)
      losses.append(loss.item())

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
    print("Train Loss : {:.3f}".format(torch.tensor(losses).mean()))
    CalcLossAcc(model, loss_fn, val_loader)

In [None]:
from torch.optim import Adam

epochs = 3
learning_rate = 1e-5

loss_fn = nn.CrossEntropyLoss()
text_classifier = TextClassifier().to(device)
optimizer = Adam(text_classifier.parameters(), lr=learning_rate)

Train(text_classifier, loss_fn, optimizer, train_loader, test_loader, epochs)