In [64]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [65]:
df = pd.read_csv('Sentence pairs in English-Vietnamese - 2025-11-12.tsv', sep='\t')
df.columns = ["id_en", "en", "id_vi", "vi"]
df.head()

Unnamed: 0,id_en,en,id_vi,vi
0,1280,Today is June 18th and it is Muiriel's birthday!,5665,"H√¥m nay l√† ng√†y 18 th√°ng s√°u, v√† c≈©ng l√† ng√†y ..."
1,1282,Muiriel is 20 now.,5667,B√¢y gi·ªù Muiriel ƒë∆∞·ª£c 20 tu·ªïi.
2,1283,"The password is ""Muiriel"".",5668,"M·∫≠t m√£ l√† ""Muiriel""."
3,1286,I'm at a loss for words.,5671,T√¥i h·∫øt l·ªùi ƒë·ªÉ n√≥i.
4,1286,I'm at a loss for words.,3481583,T√¥i kh√¥ng bi·∫øt n√≥i g√¨.


In [66]:
df['en_word_count'] = df['en'].str.split().str.len()
df['en_word_count'].max()


np.int64(95)

In [67]:
df['vi_word_count'] = df['vi'].str.split().str.len()
df['vi_word_count'].max()

np.int64(122)

In [68]:
SOS_token = 0
EOS_token = 1

class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [69]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )
import regex as re
# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = s.lower().strip()

    # Replace special punctuation forms
    s = s.replace("‚Ä¶", ".")
    s = s.replace("‚Äú", '"').replace("‚Äù", '"').replace("‚Äô", "'")

    # Add space around .?! to tokenize clearly
    s = re.sub(r"([.!?])", r" \1 ", s)

    # Keep letters (any language), numbers, and .?!
    s = re.sub(r"[^\p{L}\p{N}.!?']+", " ", s)

    # Collapse multiple spaces
    s = re.sub(r"\s+", " ", s).strip()

    return s

In [70]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    filepath = "Sentence pairs in English-Vietnamese - 2025-11-12.tsv"

    lines = open(filepath, encoding='utf-8').read().strip().split("\n")

    pairs = []
    for l in lines:
        parts = l.split("\t")
        if len(parts) < 4:
            continue
        eng = normalizeString(parts[1])
        vie = normalizeString(parts[3])
        pairs.append([eng, vie])

    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs


In [71]:
MAX_LENGTH = 123
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))

    # REMOVE filtering if you want full dataset
    # pairs = filterPairs(pairs)
    # print("Trimmed to %s sentence pairs" % len(pairs))

    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])

    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)

    return input_lang, output_lang, pairs

In [72]:
input_lang, output_lang, pairs = prepareData('eng', 'vie', False)
print(random.choice(pairs))


Reading lines...
Read 18580 sentence pairs
Counting words...
Counted words:
eng 7493
vie 3860
['it goes without saying that tom is in love with kathy .', 'r√µ r√†ng l√† tom ƒëang y√™u kathy .']


In [73]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

In [74]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [75]:
class BahdanauAttention(nn.Module):
    def __init__(self, hidden_size):
        super(BahdanauAttention, self).__init__()
        self.Wa = nn.Linear(hidden_size, hidden_size)
        self.Ua = nn.Linear(hidden_size, hidden_size)
        self.Va = nn.Linear(hidden_size, 1)

    def forward(self, query, keys):
        scores = self.Va(torch.tanh(self.Wa(query) + self.Ua(keys)))
        scores = scores.squeeze(2).unsqueeze(1)

        weights = F.softmax(scores, dim=-1)
        context = torch.bmm(weights, keys)

        return context, weights

class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, dropout_p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.attention = BahdanauAttention(hidden_size)
        self.gru = nn.GRU(2 * hidden_size, hidden_size, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []
        attentions = []

        for i in range(MAX_LENGTH):
            decoder_output, decoder_hidden, attn_weights = self.forward_step(
                decoder_input, decoder_hidden, encoder_outputs
            )
            decoder_outputs.append(decoder_output)
            attentions.append(attn_weights)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        attentions = torch.cat(attentions, dim=1)

        return decoder_outputs, decoder_hidden, attentions


    def forward_step(self, input, hidden, encoder_outputs):
        embedded =  self.dropout(self.embedding(input))

        query = hidden.permute(1, 0, 2)
        context, attn_weights = self.attention(query, encoder_outputs)
        input_gru = torch.cat((embedded, context), dim=2)

        output, hidden = self.gru(input_gru, hidden)
        output = self.out(output)

        return output, hidden, attn_weights

In [76]:
def indexesFromSentence(lang, sentence):
    return [lang.word2index[word] for word in sentence.split(' ')]

def tensorFromSentence(lang, sentence):
    indexes = indexesFromSentence(lang, sentence)
    indexes.append(EOS_token)
    return torch.tensor(indexes, dtype=torch.long, device=device).view(1, -1)

def tensorsFromPair(pair):
    input_tensor = tensorFromSentence(input_lang, pair[0])
    target_tensor = tensorFromSentence(output_lang, pair[1])
    return (input_tensor, target_tensor)

# def get_dataloader(batch_size):
#     input_lang, output_lang, pairs = prepareData('eng', 'vie', False)

#     n = len(pairs)
    
#     input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
#     target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

#     for idx, (inp, tgt) in enumerate(pairs):
#         inp_ids = indexesFromSentence(input_lang, inp)
#         tgt_ids = indexesFromSentence(output_lang, tgt)
#         inp_ids.append(EOS_token)
#         tgt_ids.append(EOS_token)
#         input_ids[idx, :len(inp_ids)] = inp_ids
#         target_ids[idx, :len(tgt_ids)] = tgt_ids

#     train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
#                                torch.LongTensor(target_ids).to(device))

#     train_sampler = RandomSampler(train_data)
#     train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
#     return input_lang, output_lang, train_dataloader
def get_dataloader(batch_size):
    input_lang, output_lang, pairs = prepareData('eng', 'vie', False)

    n = len(pairs)

    # allocate padded arrays
    input_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)
    target_ids = np.zeros((n, MAX_LENGTH), dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        # convert to index list
        inp_ids = indexesFromSentence(input_lang, inp)
        tgt_ids = indexesFromSentence(output_lang, tgt)

        # üö® REQUIRED: truncate BEFORE adding EOS
        inp_ids = inp_ids[:MAX_LENGTH - 1]
        tgt_ids = tgt_ids[:MAX_LENGTH - 1]

        # append EOS token
        inp_ids.append(EOS_token)
        tgt_ids.append(EOS_token)

        # fill padded arrays (now guaranteed to fit)
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    # create tensor dataset
    train_data = TensorDataset(
        torch.LongTensor(input_ids).to(device),
        torch.LongTensor(target_ids).to(device)
    )

    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return input_lang, output_lang, pairs, train_dataloader


In [77]:
def train_epoch(dataloader, encoder, decoder, encoder_optimizer,
          decoder_optimizer, criterion):

    total_loss = 0
    for data in dataloader:
        input_tensor, target_tensor = data

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, _, _ = decoder(encoder_outputs, encoder_hidden, target_tensor)

        loss = criterion(
            decoder_outputs.view(-1, decoder_outputs.size(-1)),
            target_tensor.view(-1)
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [78]:
import time
import math

def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (asMinutes(s), asMinutes(rs))

In [79]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np

def showPlot(points):
    plt.figure()
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    plt.plot(points)

In [80]:
def train(train_dataloader, encoder, decoder, n_epochs, learning_rate=0.001,
               print_every=100, plot_every=100):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    for epoch in range(1, n_epochs + 1):
        loss = train_epoch(train_dataloader, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if epoch % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / n_epochs),
                                        epoch, epoch / n_epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)

In [81]:
def evaluate(encoder, decoder, sentence, input_lang, output_lang):
    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentence)

        encoder_outputs, encoder_hidden = encoder(input_tensor)
        decoder_outputs, decoder_hidden, decoder_attn = decoder(encoder_outputs, encoder_hidden)

        _, topi = decoder_outputs.topk(1)
        decoded_ids = topi.squeeze()

        decoded_words = []
        for idx in decoded_ids:
            if idx.item() == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[idx.item()])
    return decoded_words, decoder_attn

In [84]:
hidden_size = 128
batch_size = 32

input_lang, output_lang, pairs, train_dataloader = get_dataloader(batch_size)

encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
decoder = AttnDecoderRNN(hidden_size, output_lang.n_words).to(device)

train(train_dataloader, encoder, decoder, 80, print_every=5, plot_every=5)

Reading lines...
Read 18580 sentence pairs
Counting words...
Counted words:
eng 7493
vie 3860
51m 26s (- 771m 40s) (5 6%) 0.3795
101m 10s (- 708m 14s) (10 12%) 0.2362
150m 32s (- 652m 20s) (15 18%) 0.1756
199m 44s (- 599m 14s) (20 25%) 0.1409
256m 52s (- 565m 7s) (25 31%) 0.1188
307m 11s (- 511m 59s) (30 37%) 0.1029
366m 35s (- 471m 20s) (35 43%) 0.0911
421m 28s (- 421m 28s) (40 50%) 0.0817
487m 53s (- 379m 28s) (45 56%) 0.0743
529m 48s (- 317m 53s) (50 62%) 0.0682
573m 13s (- 260m 33s) (55 68%) 0.0630
630m 54s (- 210m 18s) (60 75%) 0.0586
691m 19s (- 159m 32s) (65 81%) 0.0549
738m 40s (- 105m 31s) (70 87%) 0.0516
782m 48s (- 52m 11s) (75 93%) 0.0488
838m 6s (- 0m 0s) (80 100%) 0.0463


In [None]:
def evaluateRandomly(encoder, decoder, n=10):
    for i in range(n):
        pair = random.choice(pairs)
        print('>', pair[0])
        print('=', pair[1])
        output_words, _ = evaluate(encoder, decoder, pair[0], input_lang, output_lang)
        output_sentence = ' '.join(output_words)
        print('<', output_sentence)
        print('')

In [None]:
encoder.eval()
decoder.eval()
evaluateRandomly(encoder, decoder)

> she cried throughout the night
= co ay a khoc suot em
< ban a thay ban ban ban toi a thay ban <EOS>

> he takes care of his appearance
= anh ay cham chut ve be ngoai cua minh
< anh ay la anh ay la anh ay cua anh ay thich cua anh ay <EOS>

> i m good with people
= toi co kha nang tuong tac xa hoi tot
< ban va noi voi nguoi nghi khi ban va noi voi nguoi <EOS>

> this book makes pleasant reading
= quyen sach nay oc that thu vi
< ong vat va muon nen lam viec noi cua ban <EOS>

> what makes you think we won t succeed ?
= ieu gi khien ban nghi rang chung ta se khong thanh cong ?
< ban se nghi se se khong muon lam ieu o se khong ? <EOS>

> yesterday i translated a video and composed subtitles in esperanto and spanish
= hom qua toi a dich va viet phu e cho mot video bang tieng esperanto va tieng tay ban nha
< mot nguoi va va se va va se va va va se va va va se va va va se mot nguoi phap va va se ngu <EOS>

> he is not a bad person
= anh ta khong phai la nguoi xau
< mot nguoi muon song <EOS>
