# Introduction

based on : https://github.com/bentrevett/pytorch-sentiment-analysis

one-layer RNN Classifier with IMDB datasets
used custom RNN

# 0. Set Environment

In [133]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torchtext
torchtext.disable_torchtext_deprecation_warning()
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from pprint import pprint

import subprocess
import os
import sys

import datasets

In [134]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Selected device:", device)

Selected device: cuda


In [135]:
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [136]:
model_dir = './models/Custom_RNN_Classifier_model.pth'
pretrained_embedding_dir = './models/Glove_pretrained.pth'

# 1. Data processing

## 1-1. Get Data

In [137]:
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])

In [138]:
print(train_data, test_data)

Dataset({
    features: ['text', 'label'],
    num_rows: 25000
}) Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})


In [139]:
pprint(train_data[0])

{'label': 0,
 'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the '
         'controversy that surrounded it when it was first released in 1967. I '
         'also heard that at first it was seized by U.S. customs if it ever '
         'tried to enter this country, therefore being a fan of films '
         'considered "controversial" I really had to see this for myself.<br '
         '/><br />The plot is centered around a young Swedish drama student '
         'named Lena who wants to learn everything she can about life. In '
         'particular she wants to focus her attentions to making some sort of '
         'documentary on what the average Swede thought about certain '
         'political issues such as the Vietnam War and race issues in the '
         'United States. In between asking politicians and ordinary denizens '
         'of Stockholm about their opinions on politics, she has sex with her '
         'drama teacher, classmates, and married men.<br

## 1-2. Tokenize

In [140]:
tokenizer = get_tokenizer("basic_english")

In [141]:
pprint(tokenizer("Hello world! How are you doing today? I'm doing fantastic!"))

['hello',
 'world',
 '!',
 'how',
 'are',
 'you',
 'doing',
 'today',
 '?',
 'i',
 "'",
 'm',
 'doing',
 'fantastic',
 '!']


In [142]:
def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(example["text"])[:max_length]
    length = len(tokens)
    return {"tokens": tokens, "length": length}

In [143]:
max_length = 256

train_data = train_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
test_data = test_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

In [144]:
print(train_data, test_data)

Dataset({
    features: ['text', 'label', 'tokens', 'length'],
    num_rows: 25000
}) Dataset({
    features: ['text', 'label', 'tokens', 'length'],
    num_rows: 25000
})


In [145]:
pprint(train_data[0])

{'label': 0,
 'length': 256,
 'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the '
         'controversy that surrounded it when it was first released in 1967. I '
         'also heard that at first it was seized by U.S. customs if it ever '
         'tried to enter this country, therefore being a fan of films '
         'considered "controversial" I really had to see this for myself.<br '
         '/><br />The plot is centered around a young Swedish drama student '
         'named Lena who wants to learn everything she can about life. In '
         'particular she wants to focus her attentions to making some sort of '
         'documentary on what the average Swede thought about certain '
         'political issues such as the Vietnam War and race issues in the '
         'United States. In between asking politicians and ordinary denizens '
         'of Stockholm about their opinions on politics, she has sex with her '
         'drama teacher, classmates, and

## 1-3. Build Vocab 

In [146]:
min_freq = 5
special_tokens = ["<unk>", "<pad>"]

vocab = build_vocab_from_iterator(train_data['tokens'],
                                  min_freq = min_freq,
                                  specials = special_tokens)

In [147]:
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

vocab.set_default_index(unk_index)

In [148]:
print(len(vocab))

24897


In [149]:
print(unk_index, pad_index)

0 1


## 1-4. Numericalize Text

In [150]:
def numericalize_example(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}

In [151]:
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

In [152]:
train_data = train_data.with_format(type="torch", columns=["ids", "label", "length"])
test_data = test_data.with_format(type="torch", columns=["ids", "label", "length"])

In [153]:
print(train_data, test_data)

Dataset({
    features: ['text', 'label', 'tokens', 'length', 'ids'],
    num_rows: 25000
}) Dataset({
    features: ['text', 'label', 'tokens', 'length', 'ids'],
    num_rows: 25000
})


In [154]:
pprint(train_data[0])

{'ids': tensor([   12,  1242,    12,   220,     0,    44,    61,   362,  1035,    90,
            7,    37,     2,  7142,    15,  3319,    11,    60,    11,    17,
           80,   569,    13,  7559,     3,    12,    99,   508,    15,    38,
           80,    11,    17, 24372,    40,  1095,     3,    16,     3, 10340,
           52,    11,   125,   747,     8,  2389,    14,   644,     4,  1644,
          123,     5,   314,     7,   116,  1121,  3029,    12,    68,    72,
            8,    73,    14,    21,   496,     3,     2,   114,    10,  5778,
          195,     5,   182,  3517,   442,  1306,   726,  5178,    42,   509,
            8,   865,   293,    63,    59,    47,   126,     3,    13,   859,
           63,   509,     8,  1157,    51, 11838,     8,   263,    55,   457,
            7,   606,    27,    54,     2,   811,     0,   190,    47,   805,
         1045,  1284,   145,    19,     2,  2353,   331,     6,  1506,  1284,
           13,     2,  2238,  1530,     3,    13,   215,

## 1-5. Word Embedding

In [155]:
if not os.path.exists(pretrained_embedding_dir):
    vectors = torchtext.vocab.GloVe()
    pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
    torch.save(pretrained_embedding, pretrained_embedding_dir)
else:
    pretrained_embedding = torch.load(pretrained_embedding_dir)

In [156]:
print(pretrained_embedding.size())

torch.Size([24897, 300])


## 1-6. Prepare for Data Loading

In [157]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        return item

In [158]:
train_dataset = CustomDataset(train_data)
test_dataset = CustomDataset(test_data)

In [159]:
def custom_collate_fn(batch):
    
    batch_inputs = [sample['ids'] for sample in batch]
    batch_labels = [sample['label'] for sample in batch]
    
    collate_inputs = pad_sequence(batch_inputs, 
                                  padding_value = pad_index, 
                                  batch_first = True)
    collate_labels = torch.tensor(batch_labels)
    
    return collate_inputs, collate_labels

In [160]:
batch_size = 128
pad_index = pad_index
shuffle = True

trainloader = DataLoader(dataset = train_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)
testloader = DataLoader(dataset = test_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

# 2. Define Model

## 2-1. Model Structure

In [161]:
class CustomRNNCell(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.Wi = nn.Linear(input_dim + hidden_dim, hidden_dim)
        
    def forward(self, inputs, hidden):
        h = hidden
        concat_ih = torch.cat((inputs,h), 1)

        h = self.Wi(concat_ih)
        
        return h

In [162]:
class CustomRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layer = num_layers
        self.Layers = nn.ModuleList([CustomRNNCell(input_dim, hidden_dim)])
        for _ in range(1, num_layers):
            self.Layers.append(CustomRNNCell(hidden_dim, hidden_dim))

    def forward(self, inputs, hidden):
        
        batch_size = inputs.size(0)
        seq_length = inputs.size(1)

        h0 = hidden

        output_h = torch.zeros(self.num_layers, batch_size, seq_length, self.hidden_dim).to(device)

        for layer_idx, layer in enumerate(self.Layers):

            if layer_idx == 0:
                layer_inputs = inputs
            else:
                layer_inputs = output_h[layer_idx - 1, :, :, :]
            
            h = h0[layer_idx, : :]
            for t in range(seq_length):
                h = layer(layer_inputs[:, t, :], h)
                output_h[layer_idx, :, t, :] = h

        return output_h[-1, :, :, :], output_h[:, :, -1, :]

In [163]:
class CustomRNNClassifier(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers, pad_index):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_index)
        self.embedding.requires_grad_(False)

        self.rnn = CustomRNN(embedding_dim, hidden_dim, num_layers)

        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):

        batch_size = inputs.size(0)
        
        x = self.embedding(inputs)
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)

        output, h = self.rnn(x, h0)

        logit = self.fc(h[-1, :, :])

        return logit

## 2-2. Hyperparameter & functions

In [164]:
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 128
num_layers = 1
output_dim = 1
pad_index = pad_index
lr = 5e-4

model = CustomRNNClassifier(vocab_size, embedding_dim, hidden_dim, output_dim, num_layers, pad_index)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 2-3. Weight Initialization

In [165]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, CustomRNNCell):
        for name, param in m.named_parameters():
            if "bias" in name:
                nn.init.zeros_(param)
            elif "Wi" in name:
                nn.init.orthogonal_(param)

In [166]:
model.apply(initialize_weights)

CustomRNNClassifier(
  (embedding): Embedding(24897, 300, padding_idx=1)
  (rnn): CustomRNN(
    (Layers): ModuleList(
      (0): CustomRNNCell(
        (Wi): Linear(in_features=428, out_features=128, bias=True)
      )
      (1-2): 2 x CustomRNNCell(
        (Wi): Linear(in_features=256, out_features=128, bias=True)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [167]:
model.embedding.weight.data = pretrained_embedding

In [168]:
pprint("Model's state_dict:")
for name, param in model.named_parameters():
    print(f"Parameter name: {name}")
    print(f"    Size : {param.size()}")
    print(f"    Value: {param}")

"Model's state_dict:"
Parameter name: embedding.weight
    Size : torch.Size([24897, 300])
    Value: Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2720, -0.0620, -0.1884,  ...,  0.1302, -0.1832,  0.1323],
        ...,
        [ 0.6701, -0.2717,  0.4766,  ...,  0.2786,  0.3312,  0.0230],
        [-0.1503,  0.5624, -0.5622,  ..., -0.4224, -0.6836,  0.0726],
        [ 1.1741, -0.4386,  0.3310,  ...,  0.3193, -0.2292, -0.0887]])
Parameter name: rnn.Layers.0.Wi.weight
    Size : torch.Size([128, 428])
    Value: Parameter containing:
tensor([[ 0.0305, -0.0240,  0.0291,  ..., -0.0672,  0.0179,  0.0439],
        [-0.0524,  0.0628,  0.0119,  ...,  0.0646, -0.0570,  0.0005],
        [ 0.0324, -0.1108, -0.0452,  ..., -0.0172,  0.0443, -0.0367],
        ...,
        [ 0.0388,  0.0423,  0.0163,  ..., -0.1018, -0.0417, -0.0136],
        [ 0.0046, -0.0277,  0.1003,  ..., -

In [169]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 120,833 trainable parameters


In [170]:
model.to(device)
criterion.to(device)

BCEWithLogitsLoss()

## 2-4. Tensorboard

In [171]:
writer = SummaryWriter()

# 3. Train Model

In [172]:
def train_model(model, criterion, optimizer, trainloader, num_epochs):
    print("-----Training Started------")
    for epoch in range(num_epochs):
        
        model.train()
        
        running_loss = 0.0
        
        for inputs, labels in tqdm(trainloader):
            
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1), labels.float())
            loss.backward()
            optimizer.step()

            writer.add_scalar('Loss/train', loss.item(), epoch)

            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(trainloader.dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss: .4f}")

        torch.save(model.state_dict(), model_dir)
    
    writer.close()
    
    print("-----Training Completed-----")

In [173]:
num_epochs = 16

train_model(model, criterion, optimizer, trainloader, num_epochs)

-----Training Started------


100%|██████████| 196/196 [01:26<00:00,  2.25it/s]


Epoch [1/16], Loss:  0.6936


 17%|█▋        | 33/196 [00:14<01:12,  2.24it/s]


KeyboardInterrupt: 

# 4. Test

In [None]:
def test_model(model, testloader):
    model.eval()
    correct = 0
    total = 0
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            predictions = torch.round(F.sigmoid(outputs))
            total += labels.size(0)

            predictions, labels = predictions.view(-1).cpu(), labels.cpu()
            
            correct += (predictions == labels).sum().item()

            TP += ((predictions == 1) & (labels == 1)).sum().item()
            TN += ((predictions == 0) & (labels == 0)).sum().item()
            FP += ((predictions == 1) & (labels == 0)).sum().item()
            FN += ((predictions == 0) & (labels == 1)).sum().item()

    accuracy = correct / total
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = TP / (TP + FN) if TP + FN != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    print(f"Accuracy on test set: {accuracy:.4f}")
    print(f"Precision on test set: {precision:.4f}")
    print(f"Recall on test set: {recall:.4f}")
    print(f"F1 Score on test set: {f1:.4f}")


In [None]:
model.load_state_dict(torch.load(model_dir))
model.to(device)

CustomRNNClassifier(
  (embedding): Embedding(24897, 300, padding_idx=1)
  (rnn): CustomRNN(
    (Layers): ModuleList(
      (0): CustomRNNCell(
        (Wi): Linear(in_features=428, out_features=128, bias=True)
      )
      (1): CustomRNNCell(
        (Wi): Linear(in_features=256, out_features=128, bias=True)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
test_model(model, testloader)

Accuracy on test set: 0.5097
Precision on test set: 0.5280
Recall on test set: 0.1834
F1 Score on test set: 0.2723


# 5. Inference

In [None]:
def inference(text, model, tokenizer, vocab):
    tokens = tokenizer(text)
    ids = vocab.lookup_indices(tokens)
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    output = model(tensor)

    prediction = torch.round(F.sigmoid(output))

    return prediction

In [None]:
model.load_state_dict(torch.load(model_dir))
model.to(device)

CustomRNNClassifier(
  (embedding): Embedding(24897, 300, padding_idx=1)
  (rnn): CustomRNN(
    (Layers): ModuleList(
      (0): CustomRNNCell(
        (Wi): Linear(in_features=428, out_features=128, bias=True)
      )
      (1): CustomRNNCell(
        (Wi): Linear(in_features=256, out_features=128, bias=True)
      )
    )
  )
  (fc): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
text = 'This film is terrible!'

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

0.0


In [None]:
text = 'This film is great!'

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

0.0


In [None]:
text = 'The best film I have ever seen!'

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

1.0


In [None]:
text = "This film is not terrible, it's great!"

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

1.0


# Limitation

padding 토큰 연산 제외 / dropout / validation