# Discriminative Model: RNN

## Data
We use the "Large Movie Review Dataset" (http://ai.stanford.edu/~amaas/data/sentiment/).

## Preprocessing
We used one-hot encoding. To keep the results small, the vocabulary consists of only the top 25,000 words in the review corpus, an "unknown" token for every other word, plus a token for RNN input padding.

## Network Structure
Our network has a three-layer structure: an embedding layer, an RNN layer, and a linear activation layer.

The embedding layer transforms one-hot vectors to dense vectors.

The neural network we built takes a vanilla RNN structure. At each time step, it takes the current word embedding as well as the hidden state from the previous word, and produces the current hidden state.

The final hidden state is then passed through a linear layer to produce the prediction.

## 1 Preprocessing

In [43]:
import torch
from torchtext.legacy import data

# Fixed seed so that repeated runs are reproducible.
SEED = 0

# Ask cuDNN for deterministic behaviour and seed PyTorch's RNG.
torch.backends.cudnn.deterministic = True
torch.manual_seed(SEED)

# Field objects describing how labels and review text are processed.
LABEL = data.LabelField(dtype=torch.float)
TEXT = data.Field()

# TEXT = data.Field(tokenize = 'spacy',
#                   tokenizer_language = 'en_core_web_sm')

In [44]:
from torchtext.legacy import datasets

# Load the IMDB reviews as a (train, test) pair of datasets built on the
# TEXT and LABEL fields.
train_data, test_data = datasets.IMDB.splits(text_field=TEXT,
                                             label_field=LABEL)

In [45]:
import random

# random.seed() returns None, so passing its result as ``random_state`` only
# worked through the side effect of seeding the global generator. Seed first,
# then hand the resulting generator state to split() explicitly so the
# train/validation partition is visibly tied to SEED.
random.seed(SEED)
train_data, valid_data = train_data.split(random_state=random.getstate())

### View Data

In [46]:
# print(f'Number of training examples: {len(train_data)}')
# print(f'Number of testing examples: {len(test_data)}')

In [47]:
# print(vars(train_data.examples[0]))
# print(train_data.examples[0].label == "pos")

### Build Vocabulary

In [48]:
# Upper bound on the number of words kept in the text vocabulary.
MAX_VOCAB_SIZE = 25000

# Both vocabularies are built from the training split only.
LABEL.build_vocab(train_data)
TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE)

## 2 RNN Model

### Model Structure

In [49]:
import torch.nn as nn

class RNN(nn.Module):
    """Vanilla RNN classifier: embedding -> ``nn.RNN`` -> linear layer.

    The final hidden state of the recurrent layer is projected by a linear
    layer to ``output_dim`` values per batch element.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        """Create the three sub-modules.

        input_dim: number of rows in the embedding table (vocabulary size).
        embedding_dim: size of each token embedding.
        hidden_dim: size of the RNN hidden state.
        output_dim: number of values produced by the final linear layer.
        """
        super().__init__()

        # Attribute names are part of the saved state_dict keys.
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.activation = nn.Linear(hidden_dim, output_dim)

    def forward(self, text):
        """Return the linear projection of the RNN's final hidden state.

        text: integer tensor of token indices; with ``nn.RNN``'s default
        layout this is (sequence length, batch size).
        """
        outputs, final_hidden = self.rnn(self.embedding(text))

        # Drop the leading dimension of the hidden state.
        last_state = final_hidden.squeeze(0)

        # Sanity check: the last time step of the output sequence must be
        # the same tensor as the returned hidden state.
        assert torch.equal(outputs[-1, :, :], last_state)

        return self.activation(last_state)

### Data Dimensions

In [50]:
# Model hyper-parameters. The input dimension is the size of the text
# vocabulary, so every vocabulary index has an embedding row.
OUTPUT_DIM = 1
HIDDEN_DIM = 256
EMBEDDING_DIM = 100
INPUT_DIM = len(TEXT.vocab)

model = RNN(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
)

In [51]:
# print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
# print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

### Iterators

In [52]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 64

# One BucketIterator per split, returned in the same order as the datasets.
(train_iterator,
 valid_iterator,
 test_iterator) = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
)

### Training

In [53]:
import torch.optim as optim
# Binary cross-entropy computed on raw logits (the sigmoid is applied inside
# the loss), optimised with plain stochastic gradient descent.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [54]:
def binary_accuracy(preds, y):
    """
    Fraction of predictions in a batch that match the labels.

    preds: raw model outputs (logits); they are passed through a sigmoid and
           rounded to 0 or 1 before comparison.
    y:     target labels, compared element-wise with the rounded predictions.

    Returns a tensor holding the accuracy as a ratio, e.g. 0.8 for 8/10.
    """
    probabilities = torch.sigmoid(preds)
    predicted_labels = torch.round(probabilities)

    # Boolean matches are cast to float so they can be summed and divided.
    hits = (predicted_labels == y).float()
    return hits.sum() / len(hits)

In [55]:
def train(model, iterator, optimizer, criterion):
    """Run one training pass over ``iterator`` and update the model.

    For every batch the gradients are reset, the loss from ``criterion`` is
    back-propagated and ``optimizer`` takes one step.

    Returns a ``(loss, accuracy)`` tuple: the per-batch values averaged over
    the number of batches in ``iterator``.
    """
    model.train()  # switch the model to training mode

    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()

        # The model emits one value per example in a trailing dimension of
        # size 1; drop it so predictions line up with the labels.
        logits = model(batch.text).squeeze(1)

        batch_loss = criterion(logits, batch.label)
        batch_acc = binary_accuracy(logits, batch.label)

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        running_acc += batch_acc.item()

    return running_loss / len(iterator), running_acc / len(iterator)

In [56]:
def evaluate(model, iterator, criterion):
    """Score the model on ``iterator`` without updating its parameters.

    Returns a ``(loss, accuracy)`` tuple: the per-batch values averaged over
    the number of batches in ``iterator``.
    """
    model.eval()  # switch the model to evaluation mode

    running_loss = 0
    running_acc = 0

    # No gradients are needed while scoring.
    with torch.no_grad():
        for batch in iterator:
            # Drop the trailing size-1 dimension so predictions line up
            # with the labels.
            logits = model(batch.text).squeeze(1)

            batch_loss = criterion(logits, batch.label)
            batch_acc = binary_accuracy(logits, batch.label)

            running_loss += batch_loss.item()
            running_acc += batch_acc.item()

    return running_loss / len(iterator), running_acc / len(iterator)

In [57]:
import time

def epoch_time(start_time, end_time):
    """Split the duration between two timestamps into minutes and seconds.

    start_time, end_time: timestamps in seconds.

    Returns ``(minutes, seconds)`` as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    minutes = int(duration / 60)
    # Whatever is left after removing the whole minutes.
    seconds = int(duration - 60 * minutes)
    return minutes, seconds

In [58]:
# N_EPOCHS = 1

# best_valid_loss = float('inf')

# for epoch in range(N_EPOCHS):
#     train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
#     valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
#     if valid_loss < best_valid_loss:
#         best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut1-model.pt')
    
#     print(f'Train Loss: {train_loss:.2f}, Train Accuracy: {train_acc*100:.2f}%')
#     print(f'Valid Loss: {valid_loss:.2f}, Valid Accuracy: {valid_acc*100:.2f}%')

# One pass over the training data, then score on the held-out validation split.
train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)

# Persist the trained weights so they can be reloaded for testing later.
torch.save(model.state_dict(), 'rnn-imdb-sentiment.pt')

print(f'Train Loss: {train_loss:.2f}, Train Accuracy: {train_acc*100:.2f}%')
# These numbers come from valid_iterator, so they are reported as validation
# metrics; the test split is only evaluated after reloading the saved model.
print(f'Valid Loss: {valid_loss:.2f}, Valid Accuracy: {valid_acc*100:.2f}%')

Train Loss: 0.69, Train Accuracy: 49.78%
Valid Loss: 0.69, Valid Accuracy: 50.41%


## Use saved model states

In [60]:
# Restore the weights written by the training cell.
saved_state = torch.load('rnn-imdb-sentiment.pt')
model.load_state_dict(saved_state)

# Score the restored model on the test split.
test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.2f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.70 | Test Acc: 46.64%


## 3 Bayes

In [None]:
# from collections import Counter

# def get_corpus(corpus, sentiment):
#     """
#     get sentences in corpus based on sentiment 

#     corpus: list of review sentences
#     sentiment: sentiment label of sentence, 'pos' or 'neg'
#     """
#     ans = []
#     for data in corpus.example:
#         if data.label == sentiment:
#             ans += data
#     return ans

# def count_text(text):
#     """
#     Count the number of words in a text.
#     """
#     return Counter(text)

# # get positive and negative reviews
# negative_corpus = get_corpus(train_data, 'neg')
# positive_corpus = get_corpus(train_data, 'pos')

# print(f"Number of negative reviews { len(negative_corpus) }")
# print(f"Number of positive reviews { len(positive_corpus) }")

### sklearn Naive Bayes

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn import metrics

# nb = MultinomialNB()

# # features = [train_data.examples[i].text for i in len(train_data)]
# # labels = train_data.examples[0].label for i in len(train_data)]

# # print(features)
# # print(labels)

# # nb.fit(features, labels)
