In [1]:
import sys
sys.path.append('../')
sys.path.append('../tokenization')

# Data Preparation

## Download to .txt

In [2]:
import os
import requests

def get_iwsltenvi_data(store_dir: str = './'):
    """Download the IWSLT'15 English-Vietnamese train/test files as .txt.

    The files are saved under ``<store_dir>/data/iwsltenvi/`` as
    ``train_en.txt``, ``train_vi.txt``, ``test_en.txt`` and ``test_vi.txt``.
    A file that already exists on disk is not downloaded again.

    Args:
        store_dir: Directory in which the ``data/iwsltenvi/`` folder is
            created. A trailing slash is optional.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    base_url = 'https://nlp.stanford.edu/projects/nmt/data/iwslt15.en-vi/'
    # remote file name -> local file name
    files = {
        'train.en': 'train_en.txt',
        'train.vi': 'train_vi.txt',
        'tst2013.en': 'test_en.txt',
        'tst2013.vi': 'test_vi.txt',
    }

    # The trailing '' keeps the printed path ending in a separator.
    store_pth = os.path.join(store_dir, 'data', 'iwsltenvi', '')
    os.makedirs(store_pth, exist_ok=True)
    print('The Data would be stored in: ', store_pth)

    for remote_name, local_name in files.items():
        target = os.path.join(store_pth, local_name)
        if os.path.exists(target):
            continue

        response = requests.get(base_url + remote_name, timeout=60)
        # Without this check an HTTP error page would be saved as the dataset
        # and then never re-downloaded because the file exists.
        response.raise_for_status()

        # Write the raw bytes so the text is stored exactly as served, with no
        # dependence on charset guessing or the platform's default encoding.
        # The temporary name ensures an interrupted download is not mistaken
        # for a complete file on the next run.
        partial = target + '.part'
        with open(partial, 'wb') as f:
            f.write(response.content)
        os.replace(partial, target)

    print('Done!')

In [3]:
# Fetch the four en-vi text files into ./data/iwsltenvi/ (files already on disk are skipped).
get_iwsltenvi_data()

The Data would be stored in:  ./data/iwsltenvi/
Done!


## Load .txt and get corpus

In [2]:
import html

# Load & Clean the data (Convert HTML-encoded characters to normal)
def load_iwsltenvi_data(train: bool = True,
                        test: bool = True,
                        data_dir: str = './data/iwsltenvi/'):
    """Read the downloaded .txt files and decode HTML entities.

    Each split is read into its own lists, so test lines never end up in the
    training data and either split can be loaded without the other.

    Args:
        train: Load ``train_en.txt`` / ``train_vi.txt`` when True.
        test: Load ``test_en.txt`` / ``test_vi.txt`` when True.
        data_dir: Directory holding the files; it is concatenated directly
            with the file name, so it must end with a path separator.

    Returns:
        ``(train_data, test_data)``. Each is a dict with the keys ``'en'`` and
        ``'vi'`` mapping to a list of lines, or an empty dict when that split
        was not requested. A file ending in a newline yields a final empty
        string, because the text is split on ``'\\n'``.

    Raises:
        FileNotFoundError: If a requested file does not exist.
    """
    def _read_lines(file_name):
        # Explicit UTF-8 so the Vietnamese text does not depend on the
        # platform's default encoding.
        with open(f'{data_dir}{file_name}', 'r', encoding='utf-8') as f:
            return html.unescape(f.read()).split('\n')

    train_data = {}
    test_data = {}

    if train:
        train_data['en'] = _read_lines('train_en.txt')
        train_data['vi'] = _read_lines('train_vi.txt')

    if test:
        test_data['en'] = _read_lines('test_en.txt')
        test_data['vi'] = _read_lines('test_vi.txt')

    return train_data, test_data

In [3]:
# Load both splits; each result is a dict with 'en' and 'vi' lists of lines.
train_data, test_data = load_iwsltenvi_data()

In [None]:
# Build the tokenizer corpus: every English and Vietnamese line of both splits,
# in the order train-en, train-vi, test-en, test-vi.
corpus = [
    *train_data['en'],
    *train_data['vi'],
    *test_data['en'],
    *test_data['vi'],
]

# Tokenizer

In [1]:
import sys
sys.path.append('../')
sys.path.append('../tokenization')

import html

from tokenization.tools import *
from tokenization.tokenizer import *

# Load & Clean the data (Convert HTML-encoded characters to normal)
def load_iwsltenvi_data(train: bool = True,
                        test: bool = True,
                        data_dir: str = './data/iwsltenvi/'):
    """Read the downloaded .txt files and decode HTML entities.

    Each split is read into its own lists, so test lines never end up in the
    training data and either split can be loaded without the other.

    Args:
        train: Load ``train_en.txt`` / ``train_vi.txt`` when True.
        test: Load ``test_en.txt`` / ``test_vi.txt`` when True.
        data_dir: Directory holding the files; it is concatenated directly
            with the file name, so it must end with a path separator.

    Returns:
        ``(train_data, test_data)``. Each is a dict with the keys ``'en'`` and
        ``'vi'`` mapping to a list of lines, or an empty dict when that split
        was not requested. A file ending in a newline yields a final empty
        string, because the text is split on ``'\\n'``.

    Raises:
        FileNotFoundError: If a requested file does not exist.
    """
    def _read_lines(file_name):
        # Explicit UTF-8 so the Vietnamese text does not depend on the
        # platform's default encoding.
        with open(f'{data_dir}{file_name}', 'r', encoding='utf-8') as f:
            return html.unescape(f.read()).split('\n')

    train_data = {}
    test_data = {}

    if train:
        train_data['en'] = _read_lines('train_en.txt')
        train_data['vi'] = _read_lines('train_vi.txt')

    if test:
        test_data['en'] = _read_lines('test_en.txt')
        test_data['vi'] = _read_lines('test_vi.txt')

    return train_data, test_data

# Load both splits; each result is a dict with 'en' and 'vi' lists of lines.
train_data, test_data = load_iwsltenvi_data()



## Train

In [None]:
# Rebuild the corpus here: this section starts from a fresh kernel and only
# reloads train_data / test_data, so `corpus` would otherwise be undefined
# for this cell and for the training cell that follows.
corpus = [
    *train_data['en'],
    *train_data['vi'],
    *test_data['en'],
    *test_data['vi'],
]
type(corpus)

In [None]:
# Train a BPE tokenizer on the combined corpus.
# NOTE(review): 16000 is presumably the target vocabulary size — confirm against BPETokenizer's signature.
bpe_tokenizer = BPETokenizer(corpus, 16000)
bpe_tokenizer.train()

## Save & Load

In [None]:
# Persist the trained tokenizer in the current directory.
# NOTE(review): presumably this writes './iwslt2013_tokenizer.pkl', the path the load cell reads — confirm.
bpe_tokenizer.save(name='iwslt2013_tokenizer', dir_pth='./')

In [5]:
# Load the previously saved tokenizer instead of retraining it.
# NOTE(review): the '.pkl' extension suggests load() unpickles the file — only load
# tokenizer files you created yourself; confirm in BPETokenizer.load.
bpe_tokenizer = BPETokenizer()
bpe_tokenizer.load(file_path = './iwslt2013_tokenizer.pkl')

In [8]:
# Inspect the tokenizer's padding id (0 in the recorded output).
bpe_tokenizer.pad_id

0

# DataSet

In [1]:
import sys
sys.path.append('../')
sys.path.append('../tokenization')

import html

from tokenization.tools import *
from tokenization.tokenizer import *

# Load & Clean the data (Convert HTML-encoded characters to normal)
def load_iwsltenvi_data(train: bool = True,
                        test: bool = True,
                        data_dir: str = './data/iwsltenvi/'):
    """Read the downloaded .txt files and decode HTML entities.

    Each split is read into its own lists, so test lines never end up in the
    training data and either split can be loaded without the other.

    Args:
        train: Load ``train_en.txt`` / ``train_vi.txt`` when True.
        test: Load ``test_en.txt`` / ``test_vi.txt`` when True.
        data_dir: Directory holding the files; it is concatenated directly
            with the file name, so it must end with a path separator.

    Returns:
        ``(train_data, test_data)``. Each is a dict with the keys ``'en'`` and
        ``'vi'`` mapping to a list of lines, or an empty dict when that split
        was not requested. A file ending in a newline yields a final empty
        string, because the text is split on ``'\\n'``.

    Raises:
        FileNotFoundError: If a requested file does not exist.
    """
    def _read_lines(file_name):
        # Explicit UTF-8 so the Vietnamese text does not depend on the
        # platform's default encoding.
        with open(f'{data_dir}{file_name}', 'r', encoding='utf-8') as f:
            return html.unescape(f.read()).split('\n')

    train_data = {}
    test_data = {}

    if train:
        train_data['en'] = _read_lines('train_en.txt')
        train_data['vi'] = _read_lines('train_vi.txt')

    if test:
        test_data['en'] = _read_lines('test_en.txt')
        test_data['vi'] = _read_lines('test_vi.txt')

    return train_data, test_data

# Load both splits; each result is a dict with 'en' and 'vi' lists of lines.
train_data, test_data = load_iwsltenvi_data()

# Load the previously saved tokenizer instead of retraining it.
# NOTE(review): the '.pkl' extension suggests load() unpickles the file — only load
# tokenizer files you created yourself; confirm in BPETokenizer.load.
bpe_tokenizer = BPETokenizer()
bpe_tokenizer.load(file_path = './iwslt2013_tokenizer.pkl')

In [2]:
from modules import *
from torch.utils.data import DataLoader

In [3]:
# DataLoader settings
batch_size, num_workers, pin_memory = 8, 2, True

# Wrap the training split in the project dataset, then batch and shuffle it.
train_dataset = IWSLTDataset(tokenizer=bpe_tokenizer, data=train_data)
train_loader = DataLoader(train_dataset,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=num_workers,
                          pin_memory=pin_memory)

In [4]:
# Create one iterator over the loader so batches can be pulled manually.
data_iter = iter(train_loader)

In [5]:
# Pull one batch from the loader (the next cell pulls another one and inspects it).
batch = next(data_iter)

In [7]:
# Pull another batch and print every source/target pair it contains.
batch = next(data_iter)
src, trg = batch
print('src: Type: ', str(type(src)), 'Shape: ', str(len(src)))
print('trg: Type: ', str(type(trg)), 'Shape: ', str(len(trg)))
for idx, src_sentence in enumerate(src):
    print(src_sentence)
    print(trg[idx])

src: Type:  <class 'list'> Shape:  8
trg: Type:  <class 'list'> Shape:  8
Maybe we don 't all quite move the same way .
Có thể là chúng ta không hoàn toàn di chuyển theo cùng một cách .
So the first thing we did was , we compared , what 's different in the brain of someone with depression and someone who is normal , and what we did was PET scans to look at the blood flow of the brain , and what we noticed is that in patients with depression compared to normals , areas of the brain are shut down , and those are the areas in blue .
Vậy điều đầu tiên chúng tôi làm được là chúng tôi so sánh sự khác nhau trong não bộ của người trầm cảm với não bộ của người khoẻ mạnh , và điều mà chúng tôi làm là chụp cắt lớp phát xạ để xem dòng chảy máu của não bộ , và điều mà chúng tôi nhận ra là não bệnh nhân trầm cảm khi so sánh với não bộ người khoẻ mạnh thì các vùng trên não đóng lại , và những vùng đó được biểu thị bằng màu xanh .
I kind of think of it as this cartoon devil or angel sitting on our sho

# Train

## I/O Debug

In [1]:
# Configure a model
from transformer import *

# NOTE(review): both vocabulary sizes are 16001 while the tokenizer was built with
# 16000 — presumably one extra id is reserved (e.g. for padding); confirm.
model = Transformer(embed_dim = 512, 
                    s_vocab_size = 16001, 
                    t_vocab_size = 16001, 
                    max_seq_len = 256, 
                    num_layers = 6, 
                    expansion_factor = 4,
                    n_heads = 8)

In [9]:
# Dynamic Padding with max_len
def dynamic_padding(batch, 
                    pad_id: int = 0, 
                    max_seq_len: int = 256):
    """Pad or truncate every sequence in ``batch`` to one common length.

    The common length is the length of the longest sequence in the batch,
    capped at ``max_seq_len``. Longer sequences are cut off at that length,
    shorter ones are filled up with ``pad_id``.

    Args:
        batch: List of token-id lists. It is updated in place.
        pad_id: Value used to fill short sequences.
        max_seq_len: Upper bound for the common length.

    Returns:
        The same ``batch`` list, now holding equal-length sequences. An empty
        batch is returned unchanged.
    """
    # max() raises ValueError on an empty sequence, so handle that case first.
    if not batch:
        return batch

    # Find the longest sequence in the batch
    max_len = min(max_seq_len, max(len(x) for x in batch))

    for i, seq in enumerate(batch):
        # Truncate the sequence > max_len
        trimmed = seq[:max_len]

        # Fill the rest of the sequence with padding
        batch[i] = trimmed + [pad_id] * (max_len - len(trimmed))

    return batch

In [2]:
# Smoke test: feed two toy batches of token ids (2 sequences of length 3) through the model.
# NOTE(review): `torch` is not imported explicitly in this section; it presumably
# arrives via `from transformer import *` — an explicit `import torch` would be safer.
in_tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])
in_d_tensor = torch.tensor([[1, 22, 3], [14, 5, 6]])

opt = model(in_tensor, in_d_tensor)

In [6]:
import torch.nn as nn

# NOTE(review): no ignore_index is passed, so once padded batches are used the
# padding positions (pad id 0) would count toward the loss — consider ignore_index.
criterion = nn.CrossEntropyLoss()


In [8]:
# Flatten the model output to 2-D (keeping its last dimension) and the targets to 1-D.
# NOTE(review): the target here is the decoder input itself, not a shifted copy — fine
# for an I/O check, but a real training step presumably needs shifted targets.
loss = criterion(opt.view(-1, opt.size(-1)), in_d_tensor.view(-1))

In [14]:
# The criterion was built with its default reduction, so `loss` should already be a
# scalar and .mean() leaves the value unchanged.
loss.mean()

tensor(9.6804, grad_fn=<MeanBackward0>)

## Train Loop

# Test