# Lab4: Sequential Data Modeling (RNN & Transformers)



## Colab Setup

In [16]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google'

In [None]:
"""
Change directory to where this file is located
"""
%cd '/content/drive/...'

In [None]:
!pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0
!pip install torchtext==0.16.0
!pip install portalocker>=2.0.0

In [1]:
import importlib
import data
importlib.reload(data)

ModuleNotFoundError: No module named 'data'

In [1]:
import math
import pickle
from pathlib import Path
import sys
import random
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms, datasets
import torchtext
from torch.utils.data import Dataset, DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter

import os

sys.path.append('../data/data')
from data import prepareData

In [2]:
%load_ext tensorboard

In [4]:
%reload_ext tensorboard

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

print("Using Pytorch version: {}, Device: {}".format(torch.__version__, DEVICE))
print("Using torchtext version: {}".format(torchtext.__version__))

Using Pytorch version: 2.1.0, Device: mps
Using torchtext version: 0.16.0


## RNNs for Sequential Data

### AG News Dataset

- News text dataset with **4 classes (news topics)**, single-labeled.
    - Word (1), Sports (2), Business (3), Sci/Tech (4)
- 120,000 training examples, 7,600 test examples
- Details: <a src="https://pytorch.org/text/stable/datasets.html#ag-news">https://pytorch.org/text/stable/datasets.html#ag-news</a>

In [4]:
train_data, test_data = torchtext.datasets.AG_NEWS(root='../data')
labels = [_, 'World', 'Sports', 'Business', 'Sci/Tech']

In [5]:
"""
Print the 1st element of the train data. Use the variable "labels" to get the label information.
"""
y, x = next(iter(train_data))
print(labels[y])
print(x)

Business
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.


In [6]:
print(f"Classes in train data: {set([label for (label, text) in train_data])}")
print(f"Classes in test data: {set([label for (label, text) in test_data])}")



Classes in train data: {1, 2, 3, 4}
Classes in test data: {1, 2, 3, 4}


### Text Data Preprocessing

- Tokenizer
    - Splits the sentence inti lowercase **tokens**
    - Exclude **stopwords** (if necessary)
        - ex\) the, of, this, oh, ...    

In [6]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')
tokenizer("Hi, my name is Joonseok!")

['hi', ',', 'my', 'name', 'is', 'joonseok', '!']

In [7]:
"""
Tokenize the sentence with the "get_tokenizer" function.
"""

from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

sample_sentence = "I love MLDL1 class!" # Modify the sample and see what the function does.
tokenizer(sample_sentence)

['i', 'love', 'mldl1', 'class', '!']

- Vocabulary Encoder
    - Represents a token as **integer index**.
    - Vocabulary: tokens in train data
    - New tokens: replace with \<unk\>

In [8]:
from torchtext.vocab import build_vocab_from_iterator

def tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text) # yield: returns a generator instead of a list (faster when applying a function to a list)

encoder = build_vocab_from_iterator(tokens(train_data), specials=["<unk>"])
encoder.set_default_index(encoder["<unk>"])
encoder(tokenizer("Hi, my name is Joonseok <unk> !"))



[24104, 3, 1300, 951, 21, 0, 0, 764]

In [9]:
"""
Encode the tokens with the "build_vocab_from_iterator" function.

    - Reference: https://pytorch.org/text/stable/vocab.html#build-vocab-from-iterator
"""

from torchtext.vocab import build_vocab_from_iterator

def tokens(data_iter):
    for _, text in data_iter:
        yield tokenizer(text)

train_iterator = tokens(train_data)
encoder = build_vocab_from_iterator(train_iterator, specials=["<unk>"])
encoder.set_default_index(encoder["<unk>"])

encoder(tokenizer("I love MLDL1 <unk> class !"))

[282, 2320, 0, 0, 2644, 764]

- Text preprocessing pipeline
    - Tokenizer: input sentence &rarr; tokens
    - Encoder: tokens &rarr; integer index

In [10]:
text_pipeline = lambda x: encoder(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

In [11]:
"""
Get the processed data of the 1st element in train_data using text pipeline and label_pipeline.
"""

text_pipeline = lambda x: encoder(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

print("Before preprocessing")
y, x = next(iter(train_data))
print(y)
print(x)

print("After preprocessing")
x = text_pipeline(x)
y = label_pipeline(y)
print(y)
print(x)

Before preprocessing
3
Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
After preprocessing
2
[431, 425, 1, 1605, 14838, 113, 66, 2, 848, 13, 27, 14, 27, 15, 50725, 3, 431, 374, 16, 9, 67507, 6, 52258, 3, 42, 4009, 783, 325, 1]


### Text Data Batch Preprocessing

- RNN can process <u>input with any length</u>!
- However, to pass a **batch of inputs** to RNN, each input in the batch should have the same length to be converted as a tensor.

### Customized collate_batch

In [12]:
iterator = iter(train_data)
sample_batch = []
for _ in range(8):
    sample_batch.append(next(iterator))

print(sample_batch)

[(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again."), (3, 'Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\\which has a reputation for making well-timed and occasionally\\controversial plays in the defense industry, has quietly placed\\its bets on another part of the market.'), (3, "Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\\about the economy and the outlook for earnings are expected to\\hang over the stock market next week during the depth of the\\summer doldrums."), (3, 'Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\\flows from the main pipeline in southern Iraq after\\intelligence showed a rebel militia could strike\\infrastructure, an oil official said on Saturday.'), (3, 'Oil prices soar to all-time record, posing new men

In [13]:
MAX_LEN = 32

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        if processed_text.size(0) >= MAX_LEN:
            processed_text = processed_text[:MAX_LEN]
        else:
            processed_text = torch.cat([processed_text, torch.zeros(MAX_LEN - processed_text.size(0))])
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list).long()
    return label_list.to(device), text_list.to(device)

- **Batch** of encoded tokens
    - Token length > MAX_LEN
        - Cut the tails.
    Token length < MAX_LEN
        - Zero-pad.

<br>

- **MAX_LEN** can be
    - Pre-defined
    - Minimum of each batch
    - Maximum of each batch
    - ...

In [14]:
"""
The collate_batch function below is designed to process AG News dataset. What is the problem of this function?

size가 batch는 전부 동일해야 하는데 데이터의 사이즈는 각 다르므로 equal size하게 만들어 줘야 함.
"""

###########################
MAX_LEN = 32

def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _text) in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)

        ###############################
        # make every batch equal size #
        ###############################

        if processed_text.size(0) >= MAX_LEN:
            processed_text = processed_text[:MAX_LEN]
        else:
            processed_text = torch.cat([processed_text, torch.zeros(MAX_LEN - processed_text.size(0))])
            
        text_list.append(processed_text)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.stack(text_list).long()
    return label_list, text_list

collate_batch(sample_batch)

(tensor([2, 2, 2, 2, 2, 2, 2, 2]),
 tensor([[  431,   425,     1,  1605, 14838,   113,    66,     2,   848,    13,
             27,    14,    27,    15, 50725,     3,   431,   374,    16,     9,
          67507,     6, 52258,     3,    42,  4009,   783,   325,     1,     0,
              0,     0],
         [15874,  1072,   854,  1310,  4250,    13,    27,    14,    27,    15,
            929,   797,   320, 15874,    98,     3, 27657,    28,     5,  4459,
             11,   564, 52790,     8, 80617,  2125,     7,     2,   525,   241,
              3,    28],
         [   58,     8,   347,  4582,   151,    16,   738,    13,    27,    14,
             27,    15,  2384,   452,    92,  2059, 27360,     2,   347,     8,
              2,   738,    11,   271,    42,   240, 51953,    38,     2,   294,
            126,   112],
         [   70,  7376,    58,  1810,    29,   905,   537,  2846,    13,    27,
             14,    27,    15,   838,    39,  4978,    58, 68871,    29,     2,
          

### Text Classification Model

In [15]:
def train(model, train_loader, criterion, optimizer, scheduler=None):
    model.train()
    train_loss = 0
    correct = 0
    tqdm_bar = tqdm(train_loader)

    for label, text in tqdm_bar:
        text = text.to(DEVICE)
        label = label.to(DEVICE)
        optimizer.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()
        train_loss += loss.item()
        prediction = output.max(1, keepdim = True)[1]
        correct += prediction.eq(label.view_as(prediction)).sum().item()
        optimizer.step()
        tqdm_bar.set_description("Epoch {} - train loss: {:.6f}".format(epoch, loss.item()))
    if scheduler is not None:
        scheduler.step()
    train_loss /= len(train_loader.dataset)
    train_acc = 100. * correct / len(train_loader.dataset)
    return train_loss, train_acc

def evaluate(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for label, text in tqdm(test_loader):
            text = text.to(DEVICE)
            label = label.to(DEVICE)
            output = model(text)
            test_loss += criterion(output, label).item()
            prediction = output.max(1, keepdim = True) [1]
            correct += prediction.eq(label.view_as(prediction)).sum().item()

    test_loss /= len(test_loader.dataset)
    test_acc = 100. * correct / len(test_loader.dataset)
    return test_loss, test_acc

In [16]:
from torchtext.data.functional import to_map_style_dataset

BATCH_SIZE = 1024

train_dataset = to_map_style_dataset(train_data)
test_dataset = to_map_style_dataset(test_data)
train_Dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

y, x = next(iter(train_Dataloader))

print(x.shape)
print(y.shape)

torch.Size([1024, 32])
torch.Size([1024])


In [17]:
"""
Use nn.Embedding() to get embedding vectors of x.
"""

vocab_size = len(encoder)
emb_size = 64

#######################
embedding = nn.Embedding(vocab_size, emb_size)
embedded_x = embedding(x)
#######################
print(embedded_x.shape)

torch.Size([1024, 32, 64])


### PyTorch API: Vanilla RNN

<a src='https://pytorch.org/docs/stable/generated/torch.nn.RNN.html'>'https://pytorch.org/docs/stable/generated/torch.nn.RNN.html</a>

In [19]:
import torch.nn as nn

rnn = nn.RNN(10, 20, 2)
input = torch.randn(5, 64, 10)
h0 = torch.randn(2, 64, 20)
output, hn = rnn(input, h0)

: 

: 

In [18]:
"""
Implement RNN layer using given variables.
"""

hidden_dim = 64
num_layers = 1

###########################
rnn = nn.RNN(
    input_size=emb_size,
    hidden_size=hidden_dim,
    num_layers=num_layers,
    batch_first=True
)
###########################

h_0 = torch.randn(num_layers, BATCH_SIZE, hidden_dim)
output, h_n = rnn(embedded_x, h_0)
print(output.shape) # -> torch.Size([BATCH_SIZE, seq_len, hidden_dim])
print(h_n.shape) # -> torch.Size([1, BATCH_SIZE, hidden_dim])

: 

: 

### Text Classification using RNN

In [272]:
"""
Implement TextClassificationModel.
"""
class TextClassificationModel(nn.Module):
    
    def __init__(self, vocab_size, hidden, embed, num_class, batch_size):
        """
        - Define self.embedding and self.rnn layer same as in Q(2) and Q(3).
        - self.fc layer will be used to map the average of output hidden units to the target classes.
        """
        super(TextClassificationModel, self).__init__()
        ################################
        self.embedding = nn.Embedding(vocab_size, embed)
        self.rnn = nn.RNN(input_size = embed, hidden_size=hidden, num_layers=1, nonlinearity='tanh', bias=True, batch_first=True)
        self.fc = nn.Linear(hidden, num_class)
        ################################
        self.init_weights()

    def init_weights(self):
        initrange = 0.5
        self.embedding.weight.data.uniform_(-initrange, initrange)
        self.fc.weight.data.uniform_(-initrange, initrange)
        self.fc.bias.data.zero_()

    def forward(self, x):
        """
        1) Get embedded_x using self.embedding layer.
        2) Get output hidden units using self.rnn layer.
        3) Calculate the average of output hidden layers.
        4) Map the average of output hidden layers to the target classes.
        """
        ################################
        embed_x = self.embedding(x)
        out, h_n = self.rnn(embed_x)
        out = torch.mean(out, dim=1)

        out = self.fc(out)
        return out
        ################################

In [174]:
"""
Train the model and visualize your experiments with TensorBoard (Train/Validation Loss and Accuracy)
"""

################################
EPOCHS = 10
LR = 1
BATCH_SIZE = 64
num_class = len(set([label for (label, text) in train_data]))
vocab_size = len(encoder)
emsize = 64
hidden_dim = 32
################################

model = TextClassificationModel(vocab_size, hidden_dim, emsize, num_class, BATCH_SIZE).to(DEVICE)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
total_accu = None

train_dataset = to_map_style_dataset(train_data)
test_dataset = to_map_style_dataset(test_data)
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)

################################
writer = SummaryWriter(log_dir="./logs")
for epoch in range(1, EPOCHS + 1):
    train_loss, train_acc = train(model, train_dataloader, criterion, optimizer, scheduler)
    val_loss, val_acc = evaluate(model, valid_dataloader, criterion)
    print("\n[EPOCH: {}], \tTrain Loss: {:.4f}, \tTrain Accuracy: {:.2f} %, \tValid Loss: {:.4f}, \tValid Accuracy: {:.2f} % \n", (epoch, train_loss, train_acc, val_loss, val_acc))
    writer.add_scalar("Loss/train", train_loss, epoch)
    writer.add_scalar("Accuracy/train", train_acc, epoch)
    writer.add_scalar("Loss/val", val_loss, epoch)
    writer.add_scalar("Accuracy/val", val_acc, epoch)
writer.flush()
writer.close()
################################

  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (1, 0.011447340461363396, 70.11333333333333, 0.007273229789969168, 83.61842105263158)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (2, 0.005933680584405859, 86.77333333333333, 0.006343153954336518, 85.67105263157895)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (3, 0.005679939291688303, 87.42083333333333, 0.006291553011458171, 85.64473684210526)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (4, 0.005654195760438839, 87.46, 0.006310545302143223, 85.67105263157895)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (5, 0.005650984058404962, 87.46583333333334, 0.006300391478365973, 85.6842105263158)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (6, 0.005650680286809802, 87.46583333333334, 0.006300517773549808, 85.6842105263158)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (7, 0.005650652291066945, 87.46583333333334, 0.006297598161587589, 85.6842105263158)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (8, 0.005650650832615792, 87.46583333333334, 0.006301394701004028, 85.6842105263158)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (9, 0.005650650793189804, 87.46583333333334, 0.00629421738417525, 85.6842105263158)


  0%|          | 0/1875 [00:00<?, ?it/s]

  0%|          | 0/119 [00:00<?, ?it/s]


[EPOCH: {}], 	Train Loss: {:.4f}, 	Train Accuracy: {:.2f} %, 	Valid Loss: {:.4f}, 	Valid Accuracy: {:.2f} % 
 (10, 0.005650650782883167, 87.46583333333334, 0.0063026312150453265, 85.6842105263158)


In [219]:
%tensorboard --logdir ./logs 

Reusing TensorBoard on port 6006 (pid 96588), started 1 day, 0:53:56 ago. (Use '!kill 96588' to kill it.)

## Pytorch LSTM

- `torch.nn.Embedding, torch.nn.LSTM`[[link]](https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html)
- For simplicity, we'll use single layer LSTM for encoder & decoder.

<p align="center">
<img src="./img/lstm.png">
</p>

- input_size = length of x_t
- hidden_size = dim of h_t
- Check out `__init__`

<p align="center">
<img src="./img/lstm_2.png">
</p>

<p align="center">
<img src="./img/lstm_input.png">
</p>

<p align="center">
<img src="./img/lstm_output.png">
</p>

- For simplicity, we'll use single layer LSTM for encoder & decoder.

<p align="center">
<img src="./img/lstm_3.png">
</p>

- c_t-1.shape = c_t.shape
- h_t-1.shape = h_t.shape

In [246]:
batch_size = 64
emb_dim = 512
hid_dim = 256
max_length = 10

print(embedded_x.shape) #-> torch.Size([64, 10, 512])
print(hidden_0.shape) #-> torch.Size([1, 64, 256])
print(cell_0.shape) #-> torch.Size([1, 64, 256])

lstm = nn.LSTM(input_size=emb_dim, hidden_size=hid_dim, batch_first=True)
hiddens, (hidden, cell) = lstm(embedded_X, (hidden_0, cell_0))

print(hiddens.shape) # (A)
print(hidden.shape) # (B)
print(cell.shape) # (C)

torch.Size([1024, 32, 64])


NameError: name 'hidden_0' is not defined

- Output contains (h_0, h_1, ..., h_n) 
    - (L, N, D*Hout)when `batch_first=False`
    - **(N, L, D*Hout) when `batch_first=True`**
    - Containing the output features (`h_t`)from the last layser of the LSTM, for each t.

In [113]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.2 (from transformers)
  Downloading huggingface_hub-0.24.6-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.7.24-cp38-cp38-macosx_10_9_x86_64.whl.metadata (40 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting safetensors>=0.4.1 (from transformers)
  Downloading safetensors-0.4.4-cp38-cp38-macosx_10_12_x86_64.whl.metadata (3.8 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp38-cp38-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [273]:
SEED = 1111

np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

### Dataset

[About the Dataset]
- Translation Task
    - French(src) -> English(trg)
    - Sequence to Sequence
- Language tokens
    - src: 4,345 words in our dictionary
    - trg: 2,803 words in out dictionary
- Set max length to 10
- 10,599 pairs

In [274]:
MAX_LENGTH = 10
BATCH_SIZE = 64

TRAIN_RATIO = 0.7 # train dataset ratio, should be a float in (0, 0.8]
VALID_RATIO = 0.8 - TRAIN_RATIO

SOS_token = 0
EOS_token = 1

In [275]:
print(prepareData.__code__.co_varnames)
# ['lang1', 'lang2', 'max_length', 'reverse']


('lang1', 'lang2', 'max_length', 'reverse', 'input_lang', 'output_lang', 'pairs', 'pair')


In [276]:
class TranslateDataset(Dataset):
    def __init__(self, max_length=10, fra2eng=True):

        self.input_lang, self.output_lang, self.pairs = prepareData('eng', 'fra', max_length = max_length, reverse=fra2eng)
        
        self.max_length=max_length

        self.input_lang.addWord('PAD')
        self.output_lang.addWord('PAD')
        self.input_lang_pad = self.input_lang.word2index['PAD']
        self.output_lang_pad = self.output_lang.word2index['PAD']

    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, idx):
        pair = self.pairs[idx]
        x, y = self._tensorsFromPair(pair)
        return x, y
    
    def _tensorsFromSentence(self, lang, sentence):
        indexes = [lang.word2index[word] for word in sentence.split(' ')]
        indexes.append(EOS_token)
        return torch.tensor(indexes, dtype=torch.long).view(-1, 1)
    
    def _tensorsFromPair(self, pair):
        input_tensor = self._tensorsFromSentence(self.input_lang, pair[0])
        target_tensor = self._tensorsFromSentence(self.output_lang, pair[1])
        return (input_tensor, target_tensor)
    
    def collate_fn(self, data):
        x_batch = []; y_batch = []

        for x, y in data:
            if x.shape[0] < self.max_length-1:
                x = torch.cat([x, self.input_lang_pad*torch.ones((self.max_length-1 - x.shape[0], 1), dtype=x.dtype)])
            elif x.shape[0] > self.max_length-1:
                x = x[:self.max_length-1]
            if y.shape[0] < self.max_length-1:
                y = torch.cat([y, self.output_lang_pad*torch.ones((self.max_length-1 - y.shape[0], 1), dtype=y.dtype)])
            elif y.shape[0] > self.max_length-1:
                y = y[:self.max_length-1]

            x_batch.append(torch.cat([torch.tensor([SOS_token]), x.squeeze(1)]))
            y_batch.append(torch.cat([torch.tensor([SOS_token]), y.squeeze(1)]))

        return torch.stack(x_batch), torch.stack(y_batch)

In [277]:
dataset = TranslateDataset(max_length=MAX_LENGTH)

print("\n")
print("This is data example")
print(random.choice(dataset.pairs))

train_size = int(len(dataset)*TRAIN_RATIO)
valid_size = int(len(dataset)*VALID_RATIO)
train_data, valid_data, test_data = random_split(dataset, [train_size, valid_size, len(dataset)-(train_size+valid_size)],)
print("\n")
print(f"This is dataset_size: {len(dataset)}")
print(f"train_size: {train_size}")
print(f"valid_data: {valid_size}")
print(f"test_data: {len(test_data)}")

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True)
valid_dataloader = DataLoader(valid_data, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, collate_fn=dataset.collate_fn, shuffle=True)

Reading lines...
Read 135842 sentence pairs
Trimmed to 10599 sentence pairs
Counting words...
Counted words:
fra 4345
eng 2803


This is data example
['tu travailles dur .', 'you re working hard .']


This is dataset_size: 10599
train_size: 7419
valid_data: 1059
test_data: 2121
