# Lab4: Sequential Data Modeling (RNN & Transformers)



## Colab Setup

In [16]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
"""
Change directory to where this file is located
"""
%cd '/content/drive/...'

In [19]:
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0
!pip install torchtext==0.16.0
!pip install portalocker>=2.0.0

Collecting torchvision==0.16.0
  Downloading torchvision-0.16.0-cp38-cp38-macosx_10_13_x86_64.whl.metadata (6.6 kB)
Collecting torchaudio==2.1.0
  Downloading torchaudio-2.1.0-cp38-cp38-macosx_10_13_x86_64.whl.metadata (5.7 kB)
Downloading torchvision-0.16.0-cp38-cp38-macosx_10_13_x86_64.whl (1.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m[31m5.3 MB/s[0m eta [36m0:00:01[0mm
[?25hDownloading torchaudio-2.1.0-cp38-cp38-macosx_10_13_x86_64.whl (3.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m:01[0mm
[?25hInstalling collected packages: torchvision, torchaudio
  Attempting uninstall: torchvision
    Found existing installation: torchvision 0.17.2
    Uninstalling torchvision-0.17.2:
      Successfully uninstalled torchvision-0.17.2
  Attempting uninstall: torchaudio
    Found exist

In [4]:
import math
import pickle
from pathlib import Path
import sys
import random
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets
import torchtext
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

# from data.data import prepareData

In [5]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [6]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

print("Using Pytorch version: {}, Device: {}".format(torch.__version__, DEVICE))
print("Using torchtext version: {}".format(torchtext.__version__))

Using Pytorch version: 2.1.0, Device: mps
Using torchtext version: 0.16.0


## RNNs for Sequential Data

### AG News Dataset

- News text dataset with **4 classes (news topics)**, single-labeled.
    - Word (1), Sports (2), Business (3), Sci/Tech (4)
- 120,000 training examples, 7,600 test examples
- Details: <a src="https://pytorch.org/text/stable/datasets.html#ag-news">https://pytorch.org/text/stable/datasets.html#ag-news</a>

In [7]:
train_data, test_data = torchtext.datasets.AG_NEWS(root='../data')
labels = [_, 'World', 'Sports', 'Business', 'Sci/Tech']

In [8]:
"""
Print the 1st element of the train data. Use the variable "labels" to get the label information.
"""
y, x = next(iter(train_data))
print(labels[y])
print(x)

Business
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


In [9]:
print(f"Classes in train data: {set([label for (label, text) in train_data])}")
print(f"Classes in test data: {set([label for (label, text) in test_data])}")



Classes in train data: {1, 2, 3, 4}
Classes in test data: {1, 2, 3, 4}


### Text Data Preprocessing

- Tokenizer
    - Splits the sentence inti lowercase **tokens**
    - Exclude **stopwords** (if necessary)
        - ex\) the, of, this, oh, ...    

In [10]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')
tokenizer("Hi, my name is Joonseok!")

['hi', ',', 'my', 'name', 'is', 'joonseok', '!']

In [12]:
"""
Tokenize the sentence with the "get_tokenizer" function.
"""

from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

sample_sentence = "I love MLDL1 class!" # Modify the sample and see what the function does.
tokenizer(sample_sentence)

['i', 'love', 'mldl1', 'class', '!']

- Vocabulary Encoder
    - Represents a token as **integer index**.
    - Vocabulary: tokens in train data
    - New tokens: replace with \<unk\>

In [13]:
from torchtext.vocab import build_vocab_from_iterator

def tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text) # yield: returns a generator instead of a list (faster when applying a function to a list)

encoder = build_vocab_from_iterator(tokens(train_data), specials=["<unk>"])
encoder.set_default_index(encoder["<unk>"])
encoder(tokenizer("Hi, my name is Joonseok <unk> !"))

[24104, 3, 1300, 951, 21, 0, 0, 764]

In [14]:
"""
Encode the tokens with the "build_vocab_from_iterator" function.

    - Reference: https://pytorch.org/text/stable/vocab.html#build-vocab-from-iterator
"""

from torchtext.vocab import build_vocab_from_iterator

def tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

train_iterator = tokens(train_data)
encoder = build_vocab_from_iterator(train_iterator, specials=["<unk>"])
encoder.set_default_index(encoder["<unk>"])

encoder(tokenizer("I love MLDL1 <unk> class !"))

[282, 2320, 0, 0, 2644, 764]

- Text preprocessing pipeline
    - Tokenizer: input sentence &rarr; tokens
    - Encoder: tokens &rarr; integer index

In [15]:
text_pipeline = lambda x: encoder(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [17]:
"""
Get the processed data of the 1st element in train_data using text pipeline and label_pipeline.
"""

text_pipeline = lambda x: encoder(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

print("Before preprocessing")
y, x = next(iter(train_data))
print(y)
print(x)

print("After preprocessing")
x = text_pipeline(x)
y = label_pipeline(y)
print(y)
print(x)

Before preprocessing
3
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
After preprocessing
2
[431, 425, 1, 1605, 14838, 113, 66, 2, 848, 13, 27, 14, 27, 15, 50725, 3, 431, 374, 16, 9, 67507, 6, 52258, 3, 42, 4009, 783, 325, 1]


### Text Data Batch Preprocessing

- RNN can process <u>input with any length</u>!
- However, to pass a **batch of inputs** to RNN, each input in the batch should have the same length to be converted as a tensor.

### Customized collate_batch

In [18]:
iterator = iter(train_data)
sample_batch = []
for _ in range(8):
    sample_batch.append(next(iterator))

print(sample_batch)

[(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."), (3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'), (3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."), (3, 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.'), (3, 'Oil prices soar to all-time record, posing new men

In [19]:
MAX_LEN = 32

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        if processed_text.size(0) >= MAX_LEN:
            processed_text = processed_text[:MAX_LEN]
        else:
            processed_text = torch.cat([processed_text, torch.zeros(MAX_LEN - processed_text.size(0))])
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list).long()
    return label_list.to(device), text_list.to(device)

- **Batch** of encoded tokens
    - Token length > MAX_LEN
        - Cut the tails.
    Token length < MAX_LEN
        - Zero-pad.

<br>

- **MAX_LEN** can be
    - Pre-defined
    - Minimum of each batch
    - Maximum of each batch
    - ...

In [20]:
"""
The collate_batch function below is designed to process AG News dataset. What is the problem of this function?

size가 batch는 전부 동일해야 하는데 데이터의 사이즈는 각 다르므로 equal size하게 만들어 줘야 함.
"""

###########################
MAX_LEN = 32

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)

        ###############################
        # make every batch equal size #
        ###############################

        if processed_text.size(0) >= MAX_LEN:
            processed_text = processed_text[:MAX_LEN]
        else:
            processed_text = torch.cat([processed_text, torch.zeros(MAX_LEN - processed_text.size(0))])
            
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list).long()
    return label_list, text_list

collate_batch(sample_batch)

(tensor([2, 2, 2, 2, 2, 2, 2, 2]),
 tensor([[  431,   425,     1,  1605, 14838,   113,    66,     2,   848,    13,
             27,    14,    27,    15, 50725,     3,   431,   374,    16,     9,
          67507,     6, 52258,     3,    42,  4009,   783,   325,     1,     0,
              0,     0],
         [15874,  1072,   854,  1310,  4250,    13,    27,    14,    27,    15,
            929,   797,   320, 15874,    98,     3, 27657,    28,     5,  4459,
             11,   564, 52790,     8, 80617,  2125,     7,     2,   525,   241,
              3,    28],
         [   58,     8,   347,  4582,   151,    16,   738,    13,    27,    14,
             27,    15,  2384,   452,    92,  2059, 27360,     2,   347,     8,
              2,   738,    11,   271,    42,   240, 51953,    38,     2,   294,
            126,   112],
         [   70,  7376,    58,  1810,    29,   905,   537,  2846,    13,    27,
             14,    27,    15,   838,    39,  4978,    58, 68871,    29,     2,
          

### Text Classification Model

In [21]:
def train(model, train_loader, criterion, optimizer, scheduler=None):
    model.train()
    train_loss = 0
    correct = 0
    tqdm_bar = tqdm(train_loader)

    for label, text in tqdm_bar:
        text = text.to(DEVICE)
        label = label.to(DEVICE)
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        prediction = output.max(1, keepdim = True)(1)
        correct += prediction.eq(label.view_as(prediction)).sum().item()
        optimizer.step()
        tqdm_bar.set_description("Epoch {} - train loss: {:.6f}".format(epoch, loss.item()))
    if scheduler is not None:
        scheduler.step()
    train_loss /= len(train_loader.dataset)
    train_acc = 100. * correct / len(train_loader.dataset)
    return train_loss, train_acc

def evaluate(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for label, text in tqdm(test_loader):
            text = text.to(DEVICE)
            label = label.to(DEVICE)
            output = model(text)
            test_loss += criterion(output, label).item()
            prediction = output.max(1, keepdim = True) [1]
            correct += prediction.eq(label.view_as(prediction)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_acc = 100. * correct / len(test_loader.dataset)
    return test_loss, test_acc

In [22]:
from torchtext.data.functional import to_map_style_dataset

BATCH_SIZE = 1024

train_dataset = to_map_style_dataset(train_data)
test_dataset = to_map_style_dataset(test_data)
train_Dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

y, x = next(iter(train_Dataloader))

print(x.shape)
print(y.shape)

torch.Size([1024, 32])
torch.Size([1024])


In [23]:
"""
Use nn.Embedding() to get embedding vectors of x.
"""

vocab_size = len(encoder)
emb_size = 64

#######################
embedding = nn.Embedding(vocab_size, emb_size)
embedded_x = embedding(x)
#######################
print(embedded_x.shape)

torch.Size([1024, 32, 64])


### PyTorch API: Vanilla RNN

<a src='https://pytorch.org/docs/stable/generated/torch.nn.RNN.html'>'https://pytorch.org/docs/stable/generated/torch.nn.RNN.html</a>

In [None]:
import torch.nn as nn

rnn = nn.RNN(10, 20, 2)
input = torch.randn(5, 64, 10)
h0 = torch.randn(2, 64, 20)
output, hn = rnn(input, h0)

In [None]:
"""
Implement RNN layer using given variables.
"""

hidden_dim = 64
num_layers = 1

###########################
rnn = nn.RNN(
    input_size=emb_size,
    hidden_size=hidden_dim,
    num_layers=num_layers,
    batch_first=True
)
###########################

h_0 = torch.randn(num_layers, BATCH_SIZE, hidden_dim)
output, h_n = rnn(embedded_x, h_0)
print(output.shape) # -> torch.Size([BATCH_SIZE, seq_len, hidden_dim])
print(h_n.shape) # -> torch.Size([1, BATCH_SIZE, hidden_dim])

### Text Classification using RNN

In [None]:
class TextClassificationModel(nn.Module):
    
    def __init__(self, vocab_size, hidden, embed, num_class, batch_size):
        super(TextClassificationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed)
        self.rnn = nn.RNN(input_size = embed, hidden_size=hidden, num_layers=1, nonlinearity='tanh', bias=True, batch_first=True)
        self.fc = nn.Linear(hidden, num_class)
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, x):
        x = self.embedding(x)
        x, h = self.rnn(x)
        x = torch.mean(x, dim=1)
        return self.fc(x)

## Pytorch LSTM

- `torch.nn.Embedding, torch.nn.LSTM`[[link]](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)
- For simplicity, we'll use single layer LSTM for encoder & decoder.

<p align="center">
<img src="./img/lstm.png">
</p>

- input_size = length of x_t
- hidden_size = dim of h_t
- Check out `__init__`

<p align="center">
<img src="./img/lstm_2.png">
</p>

<p align="center">
<img src="./img/lstm_input.png">
</p>

<p align="center">
<img src="./img/lstm_output.png">
</p>

- For simplicity, we'll use single layer LSTM for encoder & decoder.

<p align="center">
<img src="./img/lstm_3.png">
</p>

- c_t-1.shape = c_t.shape
- h_t-1.shape = h_t.shape

In [None]:
batch_size = 64
emb_dim = 512
hid_dim = 256
max_length = 10

print(embedded_x.shape) #-> torch.Size([64, 10, 512])
print(hidden_0.shape) #-> torch.Size([1, 64, 256])
print(cell_0.shape) #-> torch.Size([1, 64, 256])

lstm = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)
hiddens, (hidden, cell) = lstm(embedded_X, (hidden_0, cell_0))

print(hiddens.shape) # (A)
print(hidden.shape) # (B)
print(cell.shape) # (C)

NameError: name 'embedded_x' is not defined

- Output contains (h_0, h_1, ..., h_n) 
    - (L, N, D*Hout)when `batch_first=False`
    - **(N, L, D*Hout) when `batch_first=True`**
    - Containing the output features (`h_t`)from the last layser of the LSTM, for each t.