In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
import re
from typing import Iterable
from torchtext.data import get_tokenizer
import underthesea
from torchtext.vocab import build_vocab_from_iterator

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
df = pd.read_csv('Sentence pairs in English-Vietnamese - 2025-11-12.tsv', sep='\t')
df.columns = ['eng_id', 'english', 'vie_id', 'vietnamese']
df = df[['english', 'vietnamese']]
df.head()

Unnamed: 0,english,vietnamese
0,Today is June 18th and it is Muiriel's birthday!,"Hôm nay là ngày 18 tháng sáu, và cũng là ngày ..."
1,Muiriel is 20 now.,Bây giờ Muiriel được 20 tuổi.
2,"The password is ""Muiriel"".","Mật mã là ""Muiriel""."
3,I'm at a loss for words.,Tôi hết lời để nói.
4,I'm at a loss for words.,Tôi không biết nói gì.


In [None]:
df['eng_tokens'] = df['english'].apply(nltk.word_tokenize) 
df['vie_tokens'] = df['vietnamese'].apply(underthesea.word_tokenize) 
df.head()

Please install SpaCy. See the docs at https://spacy.io for more information.


ModuleNotFoundError: No module named 'spacy'

In [None]:
SRC_LANGUAGE = 'en'
TGT_LANGUAGE = 'vi'

def data_iterator(df) -> Iterable:
    for _, row in df.iterrows():
        yield (row['english'], row['vietnamese'])


In [None]:
token_transform = {}
token_transform[SRC_LANGUAGE] = get_tokenizer('spacy', language='en_core_web_sm')
token_transform[TGT_LANGUAGE] = lambda x: underthesea.word_tokenize(x, format='text').split()

In [None]:
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ['<unk>', '<pad>', '<bos>', '<eos>']

vocab_transform = {}

def yield_tokens(data_iter: Iterable, language: str):
    language_index = {SRC_LANGUAGE: 0, TGT_LANGUAGE: 1}
    for data_sample in data_iter:
        yield token_transform[language](data_sample[language_index[language]])

for ln in [SRC_LANGUAGE, TGT_LANGUAGE]:
    vocab_transform[ln] = build_vocab_from_iterator(
        yield_tokens(data_iterator(df), ln),
        min_freq=1,
        specials=special_symbols,
        special_first=True
    )
    vocab_transform[ln].set_default_index(UNK_IDX)