In [None]:
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0   9.9M      0  0:00:08  0:00:08 --:--:-- 16.3M


In [None]:
# Root folder of the extracted archive, relative to the working directory.
DATA_DIR = "aclImdb"

In [None]:
# Peek at one raw positive training review (note the literal <br /> markup).
!cat aclImdb/train/pos/4077_10.txt

I first saw this back in the early 90s on UK TV, i did like it then but i missed the chance to tape it, many years passed but the film always stuck with me and i lost hope of seeing it TV again, the main thing that stuck with me was the end, the hole castle part really touched me, its easy to watch, has a great story, great music, the list goes on and on, its OK me saying how good it is but everyone will take there own best bits away with them once they have seen it, yes the animation is top notch and beautiful to watch, it does show its age in a very few parts but that has now become part of it beauty, i am so glad it has came out on DVD as it is one of my top 10 films of all time. Buy it or rent it just see it, best viewing is at night alone with drink and food in reach so you don't have to stop the film.<br /><br />Enjoy

In [None]:
# Number of negative training reviews; built from DATA_DIR so the path is
# configured in one place only.
len(os.listdir(os.path.join(DATA_DIR, "train", "neg")))

12500

In [None]:
def create_dataset(path_):
  """Read every file in ``path_`` and return the contents as a list of strings.

  Files are visited in sorted name order so the result is identical on every
  run and platform (``os.listdir`` returns entries in arbitrary order), and
  they are decoded as UTF-8 explicitly instead of with the locale default.

  Args:
    path_: Directory whose entries are all readable text files.

  Returns:
    list[str]: One string per file, ordered by file name.
  """
  dataset = []

  for file_name in sorted(os.listdir(path_)):
    with open(os.path.join(path_, file_name), encoding="utf-8") as f:
      dataset.append(f.read())
  return dataset

# Load all positive and negative training reviews into memory; the positive
# reviews come first in the combined list.
pos_ = create_dataset(os.path.join(DATA_DIR, "train", "pos"))
neg_ = create_dataset(os.path.join(DATA_DIR, "train", "neg"))
dataset = pos_ + neg_

In [None]:
# Sanity check: total number of training reviews loaded (pos + neg).
len(dataset)

25000

In [None]:
import string
from tqdm import tqdm

class Vectorizer:
  """Minimal word-level text vectorizer.

  After ``make_vocabulary`` the instance exposes:
    vocabulary:          token -> index, with ``""`` at 0 and ``"[UNK]"`` at 1.
    inverse_vocabulary:  index -> token, the exact inverse of ``vocabulary``.
  """

  # Index handed out for tokens that are not in the vocabulary.
  UNK_INDEX = 1

  def standardize(self, text):
    """Lower-case ``text`` and drop every ``string.punctuation`` character."""
    text = text.lower()
    return "".join(char for char in text
                  if char not in string.punctuation)

  def tokenize(self, text):
    """Standardize ``text`` and split it on whitespace into a list of tokens."""
    return self.standardize(text).split()

  def make_vocabulary(self, dataset, progress=True):
    """(Re)build the vocabulary from an iterable of raw texts.

    Tokens receive consecutive indices in order of first appearance, starting
    at 2. Any previously built vocabulary is discarded.

    Args:
      dataset: Iterable of strings.
      progress: Wrap ``dataset`` in a ``tqdm`` progress bar when true.
    """
    self.vocabulary = {"": 0, "[UNK]": self.UNK_INDEX}
    texts = tqdm(dataset) if progress else dataset
    for text in texts:
      # tokenize() already standardizes, so no separate standardize() call.
      for token in self.tokenize(text):
        if token not in self.vocabulary:
          self.vocabulary[token] = len(self.vocabulary)
    # Built once after the scan: rebuilding it per text is quadratic and
    # would leave the attribute unset for an empty dataset.
    self.inverse_vocabulary = {index: token
                               for token, index in self.vocabulary.items()}

  def encode(self, text):
    """Turn ``text`` into a list of token indices (unknown tokens -> 1)."""
    return [self.vocabulary.get(token, self.UNK_INDEX)
            for token in self.tokenize(text)]

  def decode(self, int_sequence):
    """Join the tokens for ``int_sequence`` with spaces (unknown ids -> "[UNK]")."""
    return " ".join(self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

In [None]:
class Vocabulary:
  """Word-level vocabulary with special tokens and a minimum-frequency cut-off.

  Attributes:
    stoi: token -> index. Indices 0-3 are reserved for "<pad>", "<SOS>",
      "<EOS>" and "<UNK>".
    itos: index -> token, kept in sync with ``stoi``.
    freq: Number of occurrences a token needs before it is given an index.
  """

  def __init__(self, freq=1):
    self.stoi = {"<pad>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
    self.itos = {0: "<pad>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}

    self.freq = freq

  def standardize(self, text):
    """Lower-case ``text`` and drop every ``string.punctuation`` character."""
    text = text.lower()
    return "".join(char for char in text
                  if char not in string.punctuation)

  def tokenize(self, text):
    """Standardize ``text`` and split it on whitespace into a list of tokens."""
    return self.standardize(text).split()

  def make_vocabulary(self, dataset, progress=True):
    """Add every token that occurs at least ``freq`` times in ``dataset``.

    A token is indexed at the moment its running count reaches the threshold,
    so ids follow the order in which tokens become frequent enough. Tokens
    that already have an index are left alone, which makes re-running this
    on the same instance (e.g. re-executing a notebook cell) harmless.

    Args:
      dataset: Iterable of raw text strings.
      progress: Wrap ``dataset`` in a ``tqdm`` progress bar when true.
    """
    counts = {}
    # A counted token has occurred at least once, so thresholds below 1
    # behave like 1 instead of silently producing an empty vocabulary.
    min_count = max(1, self.freq)

    texts = tqdm(dataset) if progress else dataset
    for text in texts:
      # tokenize() already standardizes, so no separate standardize() call.
      for token in self.tokenize(text):
        counts[token] = counts.get(token, 0) + 1

        if counts[token] == min_count and token not in self.stoi:
          indx = len(self.stoi)
          self.stoi[token] = indx
          self.itos[indx] = token

  def encode(self, text):
    """Return ``<SOS>`` + token ids + ``<EOS>``; unknown tokens map to ``<UNK>``."""
    unk = self.stoi["<UNK>"]
    return ([self.stoi["<SOS>"]]
            + [self.stoi.get(token, unk) for token in self.tokenize(text)]
            + [self.stoi["<EOS>"]])

  def decode(self, int_sequence):
    """Join the tokens for ``int_sequence`` with spaces (unknown ids -> "<UNK>")."""
    return " ".join(self.itos.get(i, "<UNK>") for i in int_sequence)

In [None]:
# Build a vocabulary with the default freq=1, i.e. every token seen in the
# training reviews gets an index.
tokenizer = Vocabulary()
tokenizer.make_vocabulary(dataset)

In [None]:
# Round-trip one review through encode/decode to eyeball the tokenization.
text_index = tokenizer.encode(dataset[10])
tokenizer.decode(text_index)

'<SOS> a strong woman oriented subject after long director krishna vamsis shakti the power the desi version of the hollywood hit not without my daughter is actress sridevis first homeproduction a story about a womans fight against harsh injusticebr br the story of the film revolves around nandini karisma kapoor who lives in canada with her two uncles tiku talsania jaspal bhatti there she meets shekhar sanjay kapoor falls in love with him and they soon marry their family is complete when nandini has a boy raja master jai gidwani but their happiness is short lived as the news of shekhars ailing mother deepti navalmakes them leave their perfect life in canada and come to india and thats when the problems start from the moment they reachbr br india both are shocked to see the pollution and the vast throngs of people everywhere they take a crowded train to reach shekhars village and when they finally reach the station they have to catch a long bus drive to his village the filthy sweaty bus 

In [None]:
from glob import glob
import random
from typing import List

def build_vocab(data_dir: str, tokenizer: "Vocabulary") -> "Vocabulary":
  """Fit ``tokenizer`` on every training review found under ``data_dir``.

  Reads ``<data_dir>/train/pos/*.txt`` followed by ``<data_dir>/train/neg/*.txt``
  as UTF-8 and passes the texts to ``tokenizer.make_vocabulary``. The file
  lists are sorted because ``glob`` returns paths in arbitrary order and the
  token ids depend on the order in which texts are seen.

  Args:
    data_dir: Root folder of the extracted dataset.
    tokenizer: Object providing ``make_vocabulary(list_of_texts)``; it is
      modified in place.

  Returns:
    The same ``tokenizer`` object, now fitted.
  """
  train_dir = os.path.join(data_dir, "train")
  path_list = (sorted(glob(os.path.join(train_dir, "pos", "*.txt"))) +
               sorted(glob(os.path.join(train_dir, "neg", "*.txt"))))

  dataset = []
  for data_path in path_list:
    with open(data_path, encoding="utf-8") as f:
      dataset.append(f.read())

  tokenizer.make_vocabulary(dataset)
  return tokenizer


class IMDBDataset(Dataset):
  """Map-style dataset over the review files of one split.

  Each item is ``(token_ids, label)`` where ``token_ids`` is a 1-D long
  tensor produced by ``tokenizer.encode`` and ``label`` is a one-element bool
  tensor that is True for reviews from the ``pos`` folder.

  Args:
    data_dir: Root folder containing ``train/`` and ``test/``.
    tokenizer: Object providing ``encode(text) -> list[int]``.
    train: Read ``train/`` when true, ``test/`` otherwise.
    transform: Optional callable applied to the token tensor of each item.
    target_transform: Optional callable applied to the label tensor.
    random_state: Seed for the one-off shuffle of the file list.
  """

  def __init__(self, data_dir, tokenizer: "Vocabulary",
               train=True, transform=None, target_transform=None,
               random_state=42):

    self.data_dir = data_dir
    self.nlp = tokenizer
    # These were previously accepted but silently ignored.
    self.transform = transform
    self.target_transform = target_transform

    path_ = os.path.join(data_dir, "train" if train else "test")

    self.data = []
    for label in ["pos", "neg"]:
      pattern = os.path.join(path_, label, "*.txt")
      # glob order is arbitrary; sorting first is what makes the seeded
      # shuffle below reproducible across runs and machines.
      for file_path in sorted(glob(pattern)):
        self.data.append((file_path, label == "pos"))

    random.Random(random_state).shuffle(self.data)

  def __len__(self):
    """Number of review files in the selected split."""
    return len(self.data)

  def __getitem__(self, indx):
    """Read, encode and return the review at position ``indx``."""
    path_data, label = self.data[indx]

    with open(path_data, "r", encoding="utf-8") as f:
      text_data = f.read()

    data = torch.tensor(self.nlp.encode(text_data), dtype=torch.long)
    target = torch.tensor([label])

    if self.transform is not None:
      data = self.transform(data)
    if self.target_transform is not None:
      target = self.target_transform(target)
    return data, target


In [None]:
# Rebuild the vocabulary, this time indexing only tokens that occur at least
# twice. build_vocab returns the tokenizer it was given, so `nlp` and
# `tokenizer` refer to the same object.
tokenizer=Vocabulary(freq=2)
nlp = build_vocab(DATA_DIR, tokenizer)

100%|██████████| 25000/25000 [00:13<00:00, 1893.63it/s]


In [None]:
# Vocabulary size, including the four special tokens.
len(nlp.stoi)

57553

In [None]:
# Both splits are encoded with the vocabulary fitted on the training reviews.
train_dataset = IMDBDataset(DATA_DIR, tokenizer=nlp, train=True)
print("train dataset size:", len(train_dataset))
test_dataset = IMDBDataset(DATA_DIR, tokenizer=nlp, train=False)
print("test dataset size:", len(test_dataset))


train dataset size: 25000
test dataset size: 25000


In [None]:
from torch.nn.utils.rnn import pad_sequence

def pad_collate(x):
  """Collate ``(token_ids, label)`` pairs into one zero-padded batch.

  Args:
    x: List of ``(tokens, label)`` items, where ``tokens`` is a 1-D tensor of
      token ids and ``label`` is a scalar or one-element tensor.

  Returns:
    tuple: ``(padded, labels)`` where ``padded`` has shape
    ``(longest_sequence, batch)`` and is filled with 0 after the end of each
    shorter sequence, and ``labels`` has shape ``(batch,)``.
  """
  sequences = [item[0] for item in x]
  labels = [torch.as_tensor(item[1]).reshape(-1) for item in x]
  # pad_sequence already returns (time, batch) for 1-D inputs. The previous
  # unsqueeze + bare squeeze() round-trip also removed the batch axis for a
  # single-item batch (and the time axis for length-1 sequences).
  padded_data = pad_sequence(sequences, padding_value=0)
  return padded_data, torch.cat(labels)

In [None]:
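# TODO: improve padding, e.g. batch reviews of similar length together so a
# batch is not padded all the way up to its single longest review.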

In [None]:
# Only the training loader shuffles; pad_collate pads each batch to the
# length of its longest review.
train_dl = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=pad_collate)
test_dl = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=pad_collate)

In [None]:
# Inspect the tensor shapes of a single training batch, then stop.
for item, label in train_dl:
  print(f"data size {item.size()}")
  print(f"label size {label.size()}")
  break


data size torch.Size([924, 32])
label size torch.Size([32])


In [None]:
# Shape check: an embedding layer adds a trailing 256-dim axis to a batch of
# token ids. padding_idx=0 is the index of "<pad>" in the vocabulary.
features, _ = next(iter(train_dl))
embedding_ = nn.Embedding(len(nlp.stoi), 256, padding_idx=0)
embedding_(features).size()

torch.Size([486, 32, 256])

In [None]:
class LSTMModel(nn.Module):
    """Embedding -> multi-layer LSTM -> dropout -> linear -> sigmoid classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        output_size: Number of sigmoid outputs per sequence.
        hidden_size: LSTM hidden state width.
        embedding_size: Embedding vector width.
        n_layers: Number of stacked LSTM layers.
        dropout: Dropout between LSTM layers (only meaningful for
            ``n_layers > 1``).
    """

    def __init__(self, vocab_size, output_size, hidden_size=128,
                 embedding_size=400, n_layers=2, dropout=0.2):

        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        # nn.LSTM warns about non-zero dropout with a single layer, where it
        # has no effect anyway.
        self.lstm = nn.LSTM(embedding_size, hidden_size, n_layers,
                            dropout=dropout if n_layers > 1 else 0.0)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Score a batch of padded token-id sequences.

        Args:
            x: Long tensor of shape ``(time, batch)``. Assumes the padding
                index (0) only appears after the end of each sequence.

        Returns:
            Tensor of shape ``(batch, output_size)`` with values in (0, 1).
        """
        # True length of each sequence = number of non-pad tokens.
        # pack_padded_sequence needs the lengths on the CPU and >= 1.
        pad_index = self.embedding.padding_idx
        lengths = (x != pad_index).sum(dim=0).clamp(min=1).cpu()

        embedded = self.embedding(x)
        packed = nn.utils.rnn.pack_padded_sequence(embedded, lengths,
                                                   enforce_sorted=False)
        # Packing stops the recurrence at each sequence's last real token.
        # Indexing the padded output at [-1] instead read the state after
        # running over every trailing <pad> step of the batch.
        _, (hidden, _) = self.lstm(packed)
        x = hidden[-1]  # final hidden state of the top LSTM layer
        x = self.dropout(x)
        x = self.fc(x)
        x = self.sigmoid(x)

        return x

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


In [None]:
# model hyperparameters
vocab_size = len(nlp.stoi)
output_size = 1
embedding_size = 256
hidden_size = 512
grad_clip = 4  # max gradient norm passed to clip_grad_norm_ in the training loop
n_layers = 2
epochs_num = 10

# model initialization
# LSTMModel takes hidden_size before embedding_size, so the positional order
# below matters.
model = LSTMModel(vocab_size, output_size, hidden_size, embedding_size,
                  n_layers).to(device)
print(model)

LSTMModel(
  (embedding): Embedding(57553, 256, padding_idx=0)
  (lstm): LSTM(256, 512, num_layers=2, dropout=0.2)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=512, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [None]:
# Smoke test: run one batch through the untrained model.
data, label = next(iter(train_dl))
model(data.to(device))

tensor([[0.5085],
        [0.5084],
        [0.5105],
        [0.5108],
        [0.5085],
        [0.5099],
        [0.5092],
        [0.5061],
        [0.5067],
        [0.5104],
        [0.5076],
        [0.5067],
        [0.5072],
        [0.5058],
        [0.5096],
        [0.5122],
        [0.5076],
        [0.5061],
        [0.5081],
        [0.5107],
        [0.5133],
        [0.5063],
        [0.5089],
        [0.5083],
        [0.5070],
        [0.5096],
        [0.5126],
        [0.5035],
        [0.5056],
        [0.5092],
        [0.5092],
        [0.5085]], grad_fn=<SigmoidBackward0>)

In [None]:
# BCELoss expects probabilities, which matches the sigmoid at the end of
# LSTMModel.forward.
lr = 0.001
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Train for `epochs_num` epochs, printing the running mean loss of the
# current epoch every 200 steps.
for epoch in range(epochs_num):

    model.train()

    train_loss = 0.0

    # `step` rather than `id`, so the builtin id() is not shadowed for the
    # rest of the notebook session.
    for step, (data, label) in enumerate(tqdm(train_dl)):
        data, label = data.to(device), label.to(device)

        optimizer.zero_grad()

        # forward pass
        prediction = model(data)

        # loss: flatten both sides to (batch,). A bare squeeze() would turn a
        # single-item batch into a 0-d tensor that no longer matches the label.
        loss = criterion(prediction.reshape(-1), label.float().reshape(-1))
        loss.backward()
        train_loss += loss.item()
        # Clip after backward() and before step() so the update uses the
        # clipped gradients.
        nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        optimizer.step()
        if step % 200 == 199:
            print(f"\n Epoch {epoch+1}/{epochs_num}| step {step+1}/{len(train_dl)} train_loss: {train_loss/(step + 1):.4f}")

  0%|          | 1/782 [02:11<28:36:27, 131.87s/it]