# import libraries

In [15]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import torch
import torch.nn as nn
from transformers import PreTrainedTokenizerFast

In [16]:
ds = load_dataset('thainq107/iwslt2015-en-vi')

In [17]:
ds['train']['en']

['Rachel Pike : The science behind a climate headline',
 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .',
 'I &apos;d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper .',
 'Headlines that look like this when they have to do with climate change , and headlines that look like this when they have to do with air quality or smog .',
 'They are both two branches of the same field of atmospheric science .',
 'Recently the headlines looked like this when the Intergovernmental Panel on Climate Change , or IPCC , put out their report on the state of understanding of the atmospheric system .',
 'That report was written by 620 scientists from 40 countries .',
 'They wrote almost a thousand pages

In [18]:
import os
from tokenizers import Tokenizer, pre_tokenizers, trainers, models

tokenizer_en= Tokenizer(models.WordLevel(unk_token='<unk>'))
tokenizer_vi = Tokenizer(models.WordLevel(unk_token='<unk>'))

tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace()
tokenizer_vi.pre_tokenizer= pre_tokenizers.Whitespace()

trainer = trainers.WordLevelTrainer(
    vocab=10000,
    min_frequency=2,
    special_tokens=['<pad>','<unk>','<bos>','<eos>']
)

tokenizer_en.train_from_iterator(ds['train']['en'], trainer)
tokenizer_vi.train_from_iterator(ds['train']['vi'],trainer)

tokenizer_en.save('tokenizer_en.json')
tokenizer_vi.save('tokenizer_vi.json')


In [19]:
tokenizer_en = PreTrainedTokenizerFast(
    tokenizer_file ='tokenizer_en.json',
    unk_token='<unk>',pad_token='<pad>', bos_token= '<bos>', eos_token='<eos>'
)

tokenizer_vi= PreTrainedTokenizerFast(
    tokenizer_file = 'tokenizer_vi.json',
    unk_token='<unk>', pad_token='<pad>', bos_token='<bos>',eos_token='<eos'
)

In [6]:
MAX_LEN= max([len(text) for text in ds['train']['vi']])

In [7]:
MAX_LEN=50

In [8]:
def preprocess_function(examples):
    src_texts = examples['en']
    tgt_texts= ['<bos>'+ sent + '<eos>' for sent in examples['vi']]
    
    src_encodings= tokenizer_en(
        src_texts, padding='max_length', truncation=True, max_length= MAX_LEN
    )['input_ids']
    
    tgt_encodings =tokenizer_vi(
        tgt_texts, padding='max_length', truncation=True, max_length=MAX_LEN
    )['input_ids']
    
    return {
        'input_ids': src_encodings,
        'labels': tgt_encodings
    }

In [9]:
preprocessed_ds = ds.map(preprocess_function,batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map: 100%|██████████| 133317/133317 [00:12<00:00, 10618.10 examples/s]
Map: 100%|██████████| 1268/1268 [00:00<00:00, 9657.84 examples/s] 
Map: 100%|██████████| 1268/1268 [00:00<00:00, 4937.85 examples/s]


In [None]:
from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqRNNConfig(PretrainedConfig):
    def __init__(self, vocab_size_src=10000, vocab_size_tgt=10000, embedding_dim=128, hidden_size=128, dropout=0.1
             ,    **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_src= vocab_size_src
        self.vocab_size_tgt= vocab_size_tgt
        self.embedding_dim = embedding_dim
        self.hidden_size= hidden_size
        self.dropout = dropout
    

class EncoderRNN(nn.Module):
    def __init__(self, input_size, embedding_dim, hidden_size, dropout_p= 0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, embedding_dim)
        self.gru = nn.GRU(input_size, embedding_dim)
        self.dropout = nn.Dropout(dropout_p)
        
    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        
        output,hidden = self.gru(embedded)
        return output, hidden
    
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, embedding_dim, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding= nn.Embedding(output_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_size, batch_first =True)
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward(self, input, hidden):
        embedded= self.embedding(input)
        output, hidden = self.gru(embedded,hidden)
        output = self.out(output)
        return output, hidden
    
class Seq2SeqRNNModel(PreTrainedModel):
    config_class = Seq2SeqRNNConfig
    
    def __init__(self, config, tokenizer_en):
        super().__init__(config)
        self.encoder = EncoderRNN(
            config.vocab_size_src, config.embedding_dim,
            config.hidden_size, config.dropout
        )
        self.decoder = DecoderRNN(config.hidden_size, config.embedding_dim, config.vocab_size_tgt)
        self.BOS_IDX= tokenizer_en.bos_token_id
        self.loss_fn = nn.CrossEntropyLoss(ignore_index=0)
    
    def forward(self, input_ids, labels):
        tgt_input = labels[:, :-1]
        tgt_output = labels[:,1:]
        
        batch_size, seq_len = tgt_output.shape
        decoder_input = torch.full((batch_size, 1),self.BOS_IDX, dtype=torch.long).to(input_ids.device)
        
        encoder_output, decoder_hidden = self.encoder(input_ids)
        
        decoder_outputs= []
        for i in range(seq_len):
            decoder_output, decoder_hidden =self.decoder(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)
            decoder_input = tgt_output[:,i].unsqueeze(1)
        
        #loss
        logits = torch.cat(decoder_outputs, dim=1)
        loss = self.loss_fn(logits.view(-1, logits.shape[-1]), tgt_output.view(-1))
        return {'loss': loss, 'logits': logits}
        
        

In [13]:
config=  Seq2SeqRNNConfig(
    vocab_size_src= len(tokenizer_en), vocab_size_tgt=len(tokenizer_vi)
)
model = Seq2SeqRNNModel(config, tokenizer_en)

In [16]:
from transformers import Trainer, TrainingArguments

train_args = TrainingArguments(
    output_dir='./en-vi-translation',
    logging_dir ='logs',
    eval_strategy='epoch',
    save_strategy ='epoch',
    logging_strategy='epoch',
    per_device_train_batch_size=512,
    per_device_eval_batch_size= 512,
    num_train_epochs=5,
    learning_rate=2e-5,
    save_total_limit=1
)

trainer =Trainer(
    model=model,
    args= train_args,
    train_dataset= preprocessed_ds['train'],
    eval_dataset = preprocessed_ds['validation']
)

In [None]:
def generate_square_subsequent_mask(sz, device):
    mask = (torch.triu(torch,torch.ones((sz,sz)),device=device)== 1).transpose(0,1)
    mask = mask.float().masked_fill(mask==0, float('-inf')).masked_fill(mask ==1, float(0,0))
    return mask

def create_mask(src, tgt):
    src_seq_len = src.shape[1]
    tgt_seq_len = tgt.shape[1]
    device= src.device
    
    tgt_mask= generate_square_subsequent_mask(tgt_seq_len, device).to(torch.bool)
    src_mask = torch.zeros((src_seq_len, src_seq_len), device= device).type(torch.bool)
    src_padding_mask = (src==0)
    tgt_padding_mask = (tgt ==0)
    return src_mask, tgt_mask , src_padding_mask, tgt_padding_mask

from transformers import PreTrainedModel, PretrainedConfig

class Seq2SeqTransformerConfig(PretrainedConfig):
    def __init__(self, vocab_size_src=10000, vocab_size_tgt=10000, max_seq_length=50, d_model=256,
                 num_heads=8, num_layers=6,
                 dropout=0.1
             ,    **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_src= vocab_size_src
        self.vocab_size_tgt= vocab_size_tgt
        self.max_seq_length = max_seq_length
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout
    
    
class Seq2SeqTransformerModel(PreTrainedModel):
    config_class = Seq2SeqTransformerConfig
    
    def __init__(self, config):
        super().__init__(config)
        
        self.embedding_src = nn.Embedding(
            config.vocab_size_src, config.d_model
        )
        self.embedding_tgt = nn.Embedding(
            config.vocab_size_tgt, config.d_model
        )
        self.position_embedding_src = nn.Embedding(
            config.max_seq_length, config.d_model
        )
        self.position_embedding_tgt = nn.Embedding(
            config.max_seq_length, config.d_model
        )
        
        self.transformer = nn.Transformer(
            d_model= config.d_model,
            nhead= config.num_heads,
            num_encoder_layers = config.num_layers,
            num_decoder_layers = config.num_layers,
            dropout= config.dropout,
            batch_first=True
        )
        
        self.generator= nn.Linear(
            config.d_model, config.vocav_size_tgt
        )

        self.losss_fn= nn.CrossEntropyLoss(ignore_index=0)
        
    def forward(self, input_ids, labels):
        tgt_input = labels[:,:-1]
        tgt_output = labels[:,1:]
        batch_size, seq_len_src = input_ids.shape
        _, seq_len_tgt = tgt_input.shape
        
        src_positions = torch.arange(seq_len_src, device = input_ids.device).unsqueeze(0)
        tgt_positions = torch.arange(seq_len_tgt, device = input_ids.device).unsqueeze(0)
        
        src_embedded = self.embedding_src(input_ids) + self.position_embedding_src(src_positions)
        tgt_embedded = self.embedding_tgt(tgt_input) +self.position_embedding_tgt(tgt_positions)
        
        src_mask, tgt_mask, src_leu_padding_mask, tgt_key_padding_mask= create_mask(input_ids, tgt_input)
        
        
        outs = self.transformer(
            src_embedded, tgt_embedded, src_mask, tgt_mask,
            src_key_padding_mask= src_leu_padding_mask, tgt_key_padding_mask= tgt_key_padding_mask
            )
        logits =self.generator(outs)
        loss =self.loss_fn(logits.permute(0,2,1),tgt_output)
        
        return {
            'loss': loss,
            'logits': logits
        }
    
    def encode(self,src, src_mask):
        batch_size, seq_len_src = src.shape
        src_positions = torch.arange(seq_len_src, device= 'cuda').unsqueeze(0)
        src_embedded = self.embedding_src(src) + self.position_embedding_src(src_positions)
        return self.transformer.encoder(src_embedded, src_mask)
    
    def decode(self, tgt, encoder_output, tgt_mask):
        _, seq_len_tgt = tgt.shape
        tgt_positions = torch.arange(seq_len_tgt, device= 'cuda').unsqueeze(0)
        tgt_embedded = self.embedding_tgt(tgt) + self.position_embedding_src(tgt_positions)        
        return self.transformer.decoder(tgt_embedded, encoder_output, tgt_mask)

In [None]:
config = Seq2SeqTransformerConfig(
    vocab_size_src = len(tokenizer_en), vocab_size_tgt = len(tokenizer_vi), max_seq_length= 50
)

model =Seq2SeqTransformerModel(config)

In [None]:
from transformers import Trainer, TrainingArguments

train_args = TrainingArguments(
    output_dir='./en-vi-translation',
    logging_dir ='logs',
    eval_strategy='epoch',
    save_strategy ='epoch',
    logging_strategy='epoch',
    per_device_train_batch_size=512,
    per_device_eval_batch_size= 512,
    num_train_epochs=5,
    learning_rate=2e-5,
    save_tootal_limit=1
)

trainer =Trainer(
    model=model,
    args= train_args,
    train_datset= preprocessed_ds['train'],
    eval_dataset = preprocessed_ds['validation']
)

# Practice

In [1]:
from datasets import load_dataset
from tokenizers import pre_tokenizers, Tokenizer, trainers, models
import torch
import torch.nn as nn
from transformers import PreTrainedModel,  PretrainedConfig
from transformers import PreTrainedTokenizerFast

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
ds = load_dataset('thainq107/iwslt2015-en-vi')

Using the latest cached version of the dataset since thainq107/iwslt2015-en-vi couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at C:\Users\pikke\.cache\huggingface\datasets\thainq107___iwslt2015-en-vi\default\0.0.0\4abfde30435395d5d3f031fea5e3f6600543521e (last modified on Fri Mar 21 16:21:56 2025).


In [3]:
ds

DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 133317
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 1268
    })
})

In [5]:
ds['train']

Dataset({
    features: ['en', 'vi'],
    num_rows: 133317
})

In [4]:
tokenizer_vi = Tokenizer(models.WordLevel( unk_token='<unk>'))
tokenizer_en = Tokenizer(models.WordLevel( unk_token='<unk>'))

tokenizer_vi.pre_tokenizer= pre_tokenizers.Whitespace()
tokenizer_en.pre_tokenizer = pre_tokenizers.Whitespace()

trainer = trainers.WordLevelTrainer(vocab_size = 10000, min_frequency = 2, special_tokens= ['<pad>','<unk>','<bos>','<eos']) 
# If you set min_frequency=5, then any token that appears less than 5 times in the dataset will be excluded from the vocabulary.

tokenizer_vi.train_from_iterator(ds['train']['vi'],trainer)
tokenizer_en.train_from_iterator(ds['train']['en'],trainer)

# tokenizer_vi.save('tokenizer_vi.json')
# tokenizer_en.save('tokenizer_en.json')

In [5]:
tokenizer_vi = PreTrainedTokenizerFast(tokenizer_file = 'tokenizer_en.json' , bos_token='<bos>', eos='<eos>', unk_token='<unk>', pad_token='<pad>')
tokenizer_en = PreTrainedTokenizerFast(tokenizer_file = 'tokenizer_vi.json' , bos_token='<bos>', eos='<eos>', unk_token='<unk>', pad_token='<pad>')

In [6]:
max_len=100

In [7]:
def preprocessing(examples):
    source_text = examples['en']
    target_text = ['<bos>'+ txt + '<eos>' for txt in examples['vi']]
    
    source_input = tokenizer_en(source_text, truncation=True, padding='max_length', max_length = max_len)['input_ids']
    target=  tokenizer_vi(target_text, truncation=True, padding= 'max_length',  max_length= max_len)['input_ids']
    
    return {
        'input_ids': source_input,
        'labels': target
            }

In [8]:
preprocessed_ds =ds.map(preprocessing, batched=True)

Map:   0%|          | 0/133317 [00:00<?, ? examples/s]

Map: 100%|██████████| 133317/133317 [00:16<00:00, 8143.64 examples/s]
Map: 100%|██████████| 1268/1268 [00:00<00:00, 7598.72 examples/s]
Map: 100%|██████████| 1268/1268 [00:00<00:00, 6871.77 examples/s]


In [9]:
train_ds = preprocessed_ds['train']
val_ds = preprocessed_ds['validation']
test_ds = preprocessed_ds['test']

In [11]:
min([len(input) for input in train_ds['input_ids']])

100

In [12]:
class ModelConfig(PretrainedConfig):
    def __init__(self,vocab_size_source=10000, vocab_size_target=10000, embedding_dim=128, hidden_size=256, num_layers=4, **kwargs):
        super().__init__(**kwargs)
        self.vocab_size_source = vocab_size_source
        self.vocab_size_target= vocab_size_target
        self.embedding_dim = embedding_dim
        self.hidden_size= hidden_size
        self.num_layers = num_layers
        

class Encoder(nn.Module):
    def __init__(self, output_size, embedding_dim ,hidden_size,  num_layers, dropout_p=0.1):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim,hidden_size, num_layers,batch_first=True, bidirectional=True)
        self.fc = nn.Sequential(
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size,256),
            nn.LeakyReLU(),
            nn.Linear(256,output_size)
        )   
        
    
    def forward(self, input):
        embeddings = self.embedding(input)
        outputs, hidden =  self.gru(embeddings)
        hidden= hidden[-1,:,:] #  LxNxC
        hidden= hidden.squeeze(0) # NxC
        output = self.fc(hidden)
        
        return output
    

class Decoder(nn.Module):
    def __init__(self,output_size, embedding_dim, hidden_size, num_layers, dropout_p =0.2):
        super().__init__()
        self.embedding = nn.Embedding(output_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim,hidden_size, num_layers,batch_first = True)
        self.fc = nn.Sequential(
            nn.Dropout(dropout_p),
            nn.Linear(hidden_size,256),
            nn.LeakyReLU(),
            nn.Linear(256,output_size)
        )   
        
    def forward(self, input, hidden):
        embeddings = self.embedding(input)
        outputs, hidden =  self.rnn(embeddings,hidden)
        outputs = self.fc(outputs)
        return outputs, hidden

class Seq2SeqModel(PreTrainedModel):
    config_class = ModelConfig
    
    def __init__(self, config, tokenizer_en):
        super().__init__(config)
        self.encoder = Encoder(config.vocab_size_source,config.embedding_dim,config.hidden_size, config.num_layers,0.2)
        self.decoder = Decoder(config.vocab_size_target,config.embedding_dim, config.hidden_size, config.num_layers, 0.2)
        self.loss_fn= nn.CrossEntropyLoss()
        self.bos_idx = tokenizer_en.convert_tokens_to_ids('<bos>')
        
    def forward(self, input_ids, labels):
        inputs = labels[:,:-1]
        label = labels[:,1:]
        
        abstract_feature = self.encoder(input_ids) # NxC
        
        batch_size, seq_len = labels.shape
        outputs=[]
        input_decoder = torch.full((batch_size,1), self.bos_idx,device= torch.device('cuda'))
        
        for i in range(seq_len):
            
            output, hidden = self.decoder(input_decoder,abstract_feature)
            outputs.append(output)
            input_decoder = input[:,i].unsqueeze(1)
            abstract_feature=hidden
            
        logits = torch.cat(outputs, dim=1)
        loss = self.loss_fn(logits.view(-1, logits.shape[-1]), label.view(-1))
        return {'loss': loss, 'logits': logits}
            
    
        
        
    
        

In [13]:
config = ModelConfig(10000,10000,128,256,4)
model =Seq2SeqModel(config,tokenizer_en)


In [14]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir='./translation-model',
    save_strategy ='epoch',
    eval_strategy='epoch',
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    learning_rate=1e-5,
    weight_decay=0.2,
    num_train_epochs=2,
    remove_unused_columns=False
    )

trainer = Trainer(model, args, train_dataset=train_ds,eval_dataset=val_ds)

In [15]:
trainer.train()

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
