# Introduction

based on : https://github.com/bentrevett/pytorch-sentiment-analysis

one-layer RNN Classifier with IMDB datasets
used pytorch LSTM

# 0. Set Environment

In [126]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
import torchtext
torchtext.disable_torchtext_deprecation_warning()
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.tensorboard import SummaryWriter

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from pprint import pprint

import subprocess
import os
import sys

import datasets

In [127]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Selected device:", device)

Selected device: cuda


In [128]:
seed = 42

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

In [129]:
model_dir = './models/LSTM_Classifier_model.pth'
pretrained_embedding_dir = './models/Glove_pretrained.pth'

# 1. Data processing

## 1-1. Get Data

In [130]:
train_data, test_data = datasets.load_dataset("imdb", split=["train", "test"])

## 1-2. Tokenize

In [131]:
tokenizer = get_tokenizer("basic_english")

In [132]:
def tokenize_example(example, tokenizer, max_length):
    tokens = tokenizer(example["text"])[:max_length]
    length = len(tokens)
    return {"tokens": tokens, "length": length}

In [133]:
max_length = 256

train_data = train_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)
test_data = test_data.map(
    tokenize_example, fn_kwargs={"tokenizer": tokenizer, "max_length": max_length}
)

## 1-3. Build Vocab 

In [134]:
min_freq = 5
special_tokens = ["<unk>", "<pad>"]

vocab = build_vocab_from_iterator(train_data['tokens'],
                                  min_freq = min_freq,
                                  specials = special_tokens)

In [135]:
unk_index = vocab["<unk>"]
pad_index = vocab["<pad>"]

vocab.set_default_index(unk_index)

## 1-4. Numericalize Text

In [136]:
def numericalize_example(example, vocab):
    ids = vocab.lookup_indices(example["tokens"])
    return {"ids": ids}

In [137]:
train_data = train_data.map(numericalize_example, fn_kwargs={"vocab": vocab})
test_data = test_data.map(numericalize_example, fn_kwargs={"vocab": vocab})

In [138]:
train_data = train_data.with_format(type="torch", columns=["ids", "label", "length"])
test_data = test_data.with_format(type="torch", columns=["ids", "label", "length"])

## 1-5. Word Embedding

In [139]:
if not os.path.exists(pretrained_embedding_dir):
    vectors = torchtext.vocab.GloVe()
    pretrained_embedding = vectors.get_vecs_by_tokens(vocab.get_itos())
    torch.save(pretrained_embedding, pretrained_embedding_dir)
else:
    pretrained_embedding = torch.load(pretrained_embedding_dir)

## 1-6. Prepare for Data Loading

In [140]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        return item

In [141]:
train_dataset = CustomDataset(train_data)
test_dataset = CustomDataset(test_data)

In [142]:
def custom_collate_fn(batch):
    
    batch_inputs = [sample['ids'] for sample in batch]
    batch_labels = [sample['label'] for sample in batch]
    
    collate_inputs = pad_sequence(batch_inputs, 
                                  padding_value = pad_index, 
                                  batch_first = True)
    collate_labels = torch.tensor(batch_labels)
    
    return collate_inputs, collate_labels

In [143]:
batch_size = 128
pad_index = pad_index
shuffle = True

trainloader = DataLoader(dataset = train_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)
testloader = DataLoader(dataset = test_dataset,
                         batch_size = batch_size,
                         collate_fn = custom_collate_fn,
                         shuffle = shuffle)

# 2. Define Model

## 2-1. Model Structure

In [144]:
class DeepLSTM(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, num_layers, output_dim, pad_index):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx = pad_index)
        self.embedding.requires_grad_(False)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first = True)

        self.fc1 = nn.Linear(hidden_dim, embedding_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):

        batch_size = inputs.size(0)
        seq_length = inputs.size(1)

        x = self.embedding(inputs)
            
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        c0 = torch.zeros(self.num_layers, batch_size, self.hidden_dim).to(device)
        hidden = (h0, c0)

        output, (h, c) = self.lstm(x, hidden)

        logit = self.fc2(h[-1, :, :])

        return logit

## 2-2. Hyperparameter & functions

In [145]:
vocab_size = len(vocab)
embedding_dim = 300
hidden_dim = 128
num_layers = 3
output_dim = 1
pad_index = pad_index
lr = 5e-4

model = DeepLSTM(vocab_size, embedding_dim, hidden_dim, num_layers, output_dim, pad_index)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

## 2-3. Weight Initialization

In [146]:
def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight)
        nn.init.zeros_(m.bias)
    elif isinstance(m, nn.LSTM):
        for name, param in m.named_parameters():
            if "bias" in name:
                nn.init.zeros_(param)
            else:
                nn.init.orthogonal_(param)

In [147]:
model.apply(initialize_weights)

DeepLSTM(
  (embedding): Embedding(24897, 300, padding_idx=1)
  (lstm): LSTM(300, 128, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=128, out_features=300, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

In [148]:
model.embedding.weight.data = pretrained_embedding

In [149]:
pprint("Model's state_dict:")
for name, param in model.named_parameters():
    print(f"Parameter name: {name}")
    print(f"    Size : {param.size()}")
    print(f"    Value: {param}")

"Model's state_dict:"
Parameter name: embedding.weight
    Size : torch.Size([24897, 300])
    Value: Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.2720, -0.0620, -0.1884,  ...,  0.1302, -0.1832,  0.1323],
        ...,
        [ 0.6701, -0.2717,  0.4766,  ...,  0.2786,  0.3312,  0.0230],
        [-0.1503,  0.5624, -0.5622,  ..., -0.4224, -0.6836,  0.0726],
        [ 1.1741, -0.4386,  0.3310,  ...,  0.3193, -0.2292, -0.0887]])
Parameter name: lstm.weight_ih_l0
    Size : torch.Size([512, 300])
    Value: Parameter containing:
tensor([[ 0.0455, -0.0546, -0.0071,  ..., -0.0428,  0.0080,  0.0017],
        [-0.0767, -0.0010, -0.0131,  ..., -0.0353,  0.0438,  0.0575],
        [-0.0591, -0.0995,  0.0224,  ..., -0.0677,  0.0174, -0.0077],
        ...,
        [-0.0056, -0.0513,  0.0004,  ..., -0.0784,  0.0689,  0.0528],
        [ 0.0969, -0.0112, -0.0040,  ...,  0.019

In [150]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f"The model has {count_parameters(model):,} trainable parameters")

The model has 523,181 trainable parameters


In [151]:
model.to(device)
criterion.to(device)

BCEWithLogitsLoss()

## 2-4. Tensorboard

In [152]:
writer = SummaryWriter()

# 3. Train Model

In [153]:
def train_model(model, criterion, optimizer, trainloader, num_epochs):
    print("-----Training Started------")
    for epoch in range(num_epochs):
        
        model.train()
        
        running_loss = 0.0
        
        for inputs, labels in tqdm(trainloader):
            
            inputs, labels = inputs.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.view(-1), labels.float())
            loss.backward()
            optimizer.step()

            writer.add_scalar('Loss/train', loss.item(), epoch)

            running_loss += loss.item() * inputs.size(0)

        epoch_loss = running_loss / len(trainloader.dataset)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss: .4f}")

        torch.save(model.state_dict(), model_dir)
    
    writer.close()
    
    print("-----Training Completed-----")

In [154]:
num_epochs = 16

train_model(model, criterion, optimizer, trainloader, num_epochs)

-----Training Started------


100%|██████████| 196/196 [00:09<00:00, 20.24it/s]


Epoch [1/16], Loss:  0.6783


100%|██████████| 196/196 [00:10<00:00, 19.46it/s]


Epoch [2/16], Loss:  0.6588


100%|██████████| 196/196 [00:09<00:00, 19.64it/s]


Epoch [3/16], Loss:  0.6886


100%|██████████| 196/196 [00:10<00:00, 19.19it/s]


Epoch [4/16], Loss:  0.6929


100%|██████████| 196/196 [00:10<00:00, 19.27it/s]


Epoch [5/16], Loss:  0.6612


100%|██████████| 196/196 [00:10<00:00, 19.45it/s]


Epoch [6/16], Loss:  0.6774


100%|██████████| 196/196 [00:10<00:00, 19.34it/s]


Epoch [7/16], Loss:  0.6948


100%|██████████| 196/196 [00:10<00:00, 19.41it/s]


Epoch [8/16], Loss:  0.6937


100%|██████████| 196/196 [00:10<00:00, 19.43it/s]


Epoch [9/16], Loss:  0.6919


100%|██████████| 196/196 [00:10<00:00, 19.55it/s]


Epoch [10/16], Loss:  0.6732


 39%|███▉      | 76/196 [00:03<00:05, 22.36it/s]


KeyboardInterrupt: 

# 4. Test

In [None]:
def test_model(model, testloader):
    model.eval()
    correct = 0
    total = 0
    TP = 0  # True Positives
    TN = 0  # True Negatives
    FP = 0  # False Positives
    FN = 0  # False Negatives

    with torch.no_grad():
        for inputs, labels in testloader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            predictions = torch.round(F.sigmoid(outputs))
            total += labels.size(0)

            predictions, labels = predictions.view(-1).cpu(), labels.cpu()
            
            correct += (predictions == labels).sum().item()

            TP += ((predictions == 1) & (labels == 1)).sum().item()
            TN += ((predictions == 0) & (labels == 0)).sum().item()
            FP += ((predictions == 1) & (labels == 0)).sum().item()
            FN += ((predictions == 0) & (labels == 1)).sum().item()

    accuracy = correct / total
    precision = TP / (TP + FP) if TP + FP != 0 else 0
    recall = TP / (TP + FN) if TP + FN != 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall != 0 else 0

    print(f"Accuracy on test set: {accuracy:.4f}")
    print(f"Precision on test set: {precision:.4f}")
    print(f"Recall on test set: {recall:.4f}")
    print(f"F1 Score on test set: {f1:.4f}")


In [None]:
model.load_state_dict(torch.load(model_dir))
model.to(device)

DeepLSTM(
  (embedding): Embedding(24897, 300, padding_idx=1)
  (lstm): LSTM(300, 128, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=128, out_features=300, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
test_model(model, testloader)

Accuracy on test set: 0.8368
Precision on test set: 0.8195
Recall on test set: 0.8638
F1 Score on test set: 0.8411


# 5. Inference

In [None]:
def inference(text, model, tokenizer, vocab):
    tokens = tokenizer(text)
    ids = vocab.lookup_indices(tokens)
    tensor = torch.LongTensor(ids).unsqueeze(dim=0).to(device)

    output = model(tensor)

    prediction = torch.round(F.sigmoid(output))

    return prediction

In [None]:
model.load_state_dict(torch.load(model_dir))
model.to(device)

DeepLSTM(
  (embedding): Embedding(24897, 300, padding_idx=1)
  (lstm): LSTM(300, 128, num_layers=3, batch_first=True)
  (fc1): Linear(in_features=128, out_features=300, bias=True)
  (fc2): Linear(in_features=128, out_features=1, bias=True)
)

In [None]:
text = 'This film is terrible!'

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

0.0


In [None]:
text = 'This film is great!'

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

1.0


In [None]:
text = 'The best film I have ever seen!'

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

1.0


In [None]:
text = "This film is not terrible, it's great!"

prediction = inference(text, model, tokenizer, vocab)

print(prediction.item())

0.0


# Limitation

padding 토큰 연산 제외 / dropout / validation