# IMDB Reviews Classification
### Language Model vs End-to-End

##### Setup

In [1]:
import os
import re
from collections import Counter
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_sequence, unpack_sequence
from torch.utils.data import Dataset, DataLoader

from nltk.tokenize import word_tokenize, sent_tokenize

from spacy.tokens import Token
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

##### Load the data

In [2]:
def read_reviews(dir_path, max_read=None):
    review_files = os.listdir(dir_path)
    review_files = random.sample(review_files, max_read) if max_read else review_files
    print(f"Read {len(review_files)} reviews from {dir_path}")

    reviews = []
    for review_file in review_files:
        with open(os.path.join(dir_path, review_file)) as f:
            reviews.append(f.read())

    return reviews, [int(re.findall(r'_(\d+)', filename)[0]) for filename in review_files]

In [3]:
TRAIN_PATH = r"aclImdb\train"
TEST_PATH = r"aclImdb\test"
NEGATIVE_DIR = "neg"
POSITIVE_DIR = "pos"
UNLABELED_DIR = "unsup"

SAMPLE_SIZE = 10

In [4]:
train_pos_text, train_pos_rating = read_reviews(os.path.join(TRAIN_PATH, POSITIVE_DIR), SAMPLE_SIZE)
train_neg_text, train_neg_rating = read_reviews(os.path.join(TRAIN_PATH, NEGATIVE_DIR), SAMPLE_SIZE)
train_unlabeled_text, _ = read_reviews(os.path.join(TRAIN_PATH, UNLABELED_DIR), SAMPLE_SIZE)
test_pos_text, test_pos_rating = read_reviews(os.path.join(TEST_PATH, POSITIVE_DIR), SAMPLE_SIZE)
test_neg_text, test_neg_rating = read_reviews(os.path.join(TEST_PATH, NEGATIVE_DIR), SAMPLE_SIZE)

Read 10 reviews from aclImdb\train\pos
Read 10 reviews from aclImdb\train\neg
Read 10 reviews from aclImdb\train\unsup
Read 10 reviews from aclImdb\test\pos
Read 10 reviews from aclImdb\test\neg


## 1 Language Modeling

### 1.1 Data Exploration & Analysis

Check for line break tags

In [5]:
LINE_BREAK = '<br />'

count = 10
for text in train_pos_text:
    if LINE_BREAK in text:
        print(text)
        count -= 1
        if count == 0:
            break

- SMALL SPOILER HEREIN! - <br /><br />When I looked at the votes for "Creep" today, I was surprised about so many IMDb-users rating this movie "1". I am wondering: what do people expect of such kind of movie? Are there so many people watching a movie without knowing anything about it?<br /><br />"Creep" is a HORROR movie and it is a pretty good one! This automatically means: it has a absurd story full of holes, outrageous hazards and simple one-side-characters. So why complain about it? Just take some popcorn and coke, make yourself comfortable in your seat and then... ...enjoy to be scared to death! The first 60 minutes when there is almost nothing else than Kate and a lonely subway station are incredibly scaring. There is suspense and fear in every corner of the screen and you will give some jerks just because of a sudden sound of a blinking neon lamp in the back of you. (In my opinion the sound editor did the best job in this movie.)<br /><br />When Kate meet her pursuer the quality

In [6]:
def count_line_break_usage(reviews):
    count = 0
    for i, t in enumerate(reviews):
        if LINE_BREAK in t:
            count += 1
    return count

In [7]:
print(count_line_break_usage(train_pos_text),
      count_line_break_usage(train_neg_text))

5 5


Some many line breaks.. we will need to deal with them.\
We will surround them with spaces and remove spaces inside the tags.

In [8]:
NEW_LINE_BREAK = ' ~br~ '

train_pos_text = [text.replace(LINE_BREAK, NEW_LINE_BREAK) for text in train_pos_text]
train_neg_text = [text.replace(LINE_BREAK, NEW_LINE_BREAK) for text in train_neg_text]

In [9]:
print(count_line_break_usage(train_pos_text),
      count_line_break_usage(train_neg_text))

0 0


Let's tokenize the reviews

In [None]:
tokenizer = Tokenizer(English().vocab)

In [None]:
def tokenize_reviews(reviews):
    return [tokenizer(review) for review in reviews]

In [None]:
train_pos_tokens = tokenize_reviews(train_pos_text)
train_neg_tokens = tokenize_reviews(train_neg_text)

Let's count tokens

In [None]:
all_tokens = list(set([
    tok.text
    for review in train_pos_tokens + train_neg_tokens
    for tok in review
]))

tok2id = { tok: i for i, tok in enumerate(all_tokens) }

In [None]:
def count_tokens(tokenized_reviews):
    return Counter([
        tok2id[tok.text]
        for review in tokenized_reviews
        for tok in review
    ])

In [None]:
pos_token_counter = count_tokens(train_pos_tokens)
neg_token_counter = count_tokens(train_neg_tokens)

Let's compare the usage of some meaningful words

In [None]:
# meaningful_words = ['best', 'love', 'fantastic', 'terrific', 'horrible', 'worst', 'bad']

# for word in meaningful_words:
#     pos = pos_token_counter[tok2id[word]]
#     neg = neg_token_counter[tok2id[word]]
#     print(f"{word}: {pos=}, {neg=}")

### 1.2 Dataset & Dataloader

In [12]:
class LanguageModelDataset(Dataset):
    def __init__(self, texts):

        # Special characters
        self.start = '<'
        self.end = '>'

        # Actual pairs of samples
        self.X = []
        self.y = []

        # Construct the actual dataset
        self.__create_dataset(texts)

    def __create_dataset(self, texts):        

        # Construct token hierarchy
        tokens = [
            [self.start] + word_tokenize(s) + [self.end] 
            for text in texts
            for s in sent_tokenize(text)
        ]

        # Build dictionary
        self.id2word = []
        for s in tokens:
            self.id2word = list(set(self.id2word + s))

        self.word2id = {word: i for i, word in enumerate(self.id2word)}
        self.id2word = {i: word for i, word in enumerate(self.id2word)}

        # For every sentence
        for s in tokens:
            
            # For every target word
            for i, wi in enumerate(s[1:], start=1):

                # word ids
                self.X.append([self.word2id[w] for w in s[:i]])

                # target word id
                self.y.append(self.word2id[s[i]])

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]

In [13]:
full_text = train_pos_text + train_neg_text + train_unlabeled_text
imdb_train_dataset = LanguageModelDataset(full_text)

In [15]:
print(len(imdb_train_dataset))
for i in range(10):
    x, y = imdb_train_dataset[i]
    print(x, y)

print(imdb_train_dataset.word2id['<'])
print(imdb_train_dataset.word2id['>'])

8147
[365] 1355
[365, 1355] 343
[365, 1355, 343] 658
[365, 1355, 343, 658] 69
[365, 1355, 343, 658, 69] 1567
[365, 1355, 343, 658, 69, 1567] 53
[365] 1355
[365, 1355] 320
[365, 1355, 320] 320
[365, 1355, 320, 320] 780
365
53


In [16]:
def my_collate_fn(batch):
    features = pack_sequence([torch.tensor(sample[0], dtype=torch.long) for sample in batch], enforce_sorted=False)
    labels = torch.tensor([sample[1] for sample in batch], dtype=torch.long)
    return features, labels

In [21]:
imdb_train_dataloader = DataLoader(imdb_train_dataset, batch_size=64, shuffle=True, collate_fn=my_collate_fn)

### 1.3 Model Definition & Training

In [22]:
class LanguageModel(nn.Module):
    def __init__(self, vocab_size, input_dim, cell_dim, num_layers=1):
        super().__init__()
        
        # Params
        self.vocab_size = vocab_size
        self.input_dim = input_dim
        self.cell_dim = cell_dim
        self.num_layers = num_layers

        # Net
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=input_dim)
        self.lstm = nn.LSTM(input_size=input_dim, hidden_size=cell_dim, num_layers=num_layers)
        self.fc1 = nn.Linear(in_features=cell_dim, out_features=min(cell_dim * 2, (cell_dim + vocab_size) // 2))
        self.fc2 = nn.Linear(in_features=min(cell_dim * 2, (cell_dim + vocab_size) // 2), out_features=vocab_size)

    def forward(self, x):

        # Select embeddings
        unpacked_x = unpack_sequence(x)
        batch_size = len(unpacked_x)
        embeddings = [self.embedding(sample) for sample in unpacked_x]
        packed_embeddings = pack_sequence(embeddings, enforce_sorted=False)

        # Initialize hidden & cell states for LSTM
        h_0 = torch.zeros(self.num_layers, batch_size, self.cell_dim)
        c_0 = torch.zeros(self.num_layers, batch_size, self.cell_dim)

        # Forward LSTM
        output, (h_n, c_n) = self.lstm(packed_embeddings, (h_0, c_0))

        # Forward Fully-Connected
        pred = F.relu(self.fc1(c_n[0]))
        pred = self.fc2(pred)

        return pred
        

In [23]:
imdb_lm = LanguageModel(vocab_size=len(imdb_train_dataset.id2word), input_dim=100, cell_dim=100, num_layers=1)

In [24]:
len(imdb_train_dataloader)

128

In [25]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(imdb_lm.parameters())

imdb_lm.train()
num_epochs = 10
verbose_freq = 32

for e in range(1, num_epochs+1):
    print(f"--------------- epoch {e} ---------------")

    running_loss = 0.
    last_loss = 0.

    for i, (X, y) in enumerate(imdb_train_dataloader, start=1):

        # Reset optimizer
        optimizer.zero_grad()

        # Forward
        predictions = imdb_lm(X)

        # Backward
        loss = loss_fn(predictions, y)
        loss.backward()
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        if i % verbose_freq == 0:
            last_loss = running_loss / verbose_freq
            running_loss = 0.
            print(f'  batch {i}/{len(imdb_train_dataloader)} loss: {last_loss}')

--------------- epoch 1 ---------------
  batch 32/128 loss: 7.221545219421387
  batch 64/128 loss: 6.406396195292473
  batch 96/128 loss: 6.209994316101074
  batch 128/128 loss: 6.228965267539024
--------------- epoch 2 ---------------
  batch 32/128 loss: 5.587084472179413
  batch 64/128 loss: 5.532309025526047
  batch 96/128 loss: 5.49144184589386
  batch 128/128 loss: 5.470043316483498
--------------- epoch 3 ---------------
  batch 32/128 loss: 5.049678564071655
  batch 64/128 loss: 4.969946637749672
  batch 96/128 loss: 5.050805062055588
  batch 128/128 loss: 4.99445204436779
--------------- epoch 4 ---------------
  batch 32/128 loss: 4.5196719616651535
  batch 64/128 loss: 4.519476473331451
  batch 96/128 loss: 4.482130333781242
  batch 128/128 loss: 4.374301441013813
--------------- epoch 5 ---------------
  batch 32/128 loss: 3.842666707932949
  batch 64/128 loss: 3.8551588356494904
  batch 96/128 loss: 3.8436590805649757
  batch 128/128 loss: 3.7223614379763603
-------------

In [37]:
def generate(gpt, start, end, id2word, word2id):
    generated = []

    tok = word2id[start]
    word = start
    while True:

        # generate
        t = torch.tensor([tok])
        p = pack_sequence([t])
        tok = torch.argmax(gpt(p)).item()
        word = id2word[tok]
        
        if word == end:
            break

        generated.append(word)
    
    return ' '.join(generated)

generate(imdb_lm, 
         imdb_train_dataset.start,
         imdb_train_dataset.end,
         imdb_train_dataset.id2word,
         imdb_train_dataset.word2id)

'The Plot : Sixteen-year-old Alice Palmer and Alessi Boni ( cute , art .'

### 1.4 Evaluation

## 2 Text Classification

### 2.1 Dataset & Dataloader

### 2.2 Model Definition

### 2.3 Training

#### 2.3-A Language Model

#### 2.3-B End-to-End

### 2.4 Evaluation

#### 2.4-A Language Model

#### 2.4-B End-to-End