<a href="https://colab.research.google.com/github/Sirisap22/neural-machine-translation-en2th/blob/main/temp_NMT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Essential Libraries

In [209]:
#!pip install torchtext==0.9.0
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator, Dataset, Example, TabularDataset
import numpy as np
import pandas as pd
import spacy, random

In [1]:
# tokenizer for thai language
def installPyThaiNLP():
    !pip install https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
    !pip install epitran
    !pip install sklearn_crfsuite
installPyThaiNLP()

Collecting https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
[?25l  Downloading https://github.com/PyThaiNLP/pythainlp/archive/dev.zip
[K     - 12.7MB 126kB/s
[?25hCollecting python-crfsuite>=0.9.6
[?25l  Downloading https://files.pythonhosted.org/packages/79/47/58f16c46506139f17de4630dbcfb877ce41a6355a1bbf3c443edb9708429/python_crfsuite-0.9.7-cp37-cp37m-manylinux1_x86_64.whl (743kB)
[K     |████████████████████████████████| 747kB 18.8MB/s 
Collecting tinydb>=3.0
  Downloading https://files.pythonhosted.org/packages/af/cd/1ce3d93818cdeda0446b8033d21e5f32daeb3a866bbafd878a9a62058a9c/tinydb-4.4.0-py3-none-any.whl
Building wheels for collected packages: pythainlp
  Building wheel for pythainlp (setup.py) ... [?25l[?25hdone
  Created wheel for pythainlp: filename=pythainlp-2.3.0b1-cp37-none-any.whl size=11006868 sha256=2e8f60e2d593a03c58577b6963dbbbc73ec08c8082b51068d01dc40629d79196
  Stored in directory: /tmp/pip-ephem-wheel-cache-onr4hjfw/wheels/79/4e/1e/26f3198c6712ecfbee929

In [5]:
def runOnceNltk():
    import nltk
    nltk.download('punkt')
runOnceNltk()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [180]:
from pythainlp import word_tokenize as tokenize_thai
from nltk.tokenize import word_tokenize as tokenize_eng
def tokenizeThai(s):
    return tokenize_thai(s, keep_whitespace=False)
def tokenizeEng(s):
    return tokenize_eng(s)


In [181]:
# import data

df = pd.read_csv('generated_reviews_yn.csv')
df.head()

Unnamed: 0,en_text,th_text
0,We are trying to use them on our Samsung Smart...,เรากำลังพยายามใช้พวกเขาบน Samsung Smart TV ของเรา
1,This thing will not work with Mac OSX 10.4.7.1.,สิ่งนี้จะไม่ทำงานกับ Mac OSX 10.4.7.1
2,We are very happy with our Dyson DC25.,เรามีความสุขมากกับ Dyson DC25 ของเรา
3,It doesn't work with Skype.,มันไม่ทำงานกับ Skype
4,I'll be looking at Cisco next.,ฉันจะดู Cisco ต่อไป


In [182]:
encode = ''
with open('thai_websites.csv') as f:
   encode = f.encoding
df2 = pd.read_csv('thai_websites.csv', encoding=encode)
df2.head()

Unnamed: 0,en_text,th_text
0,CLARINS This intensive replenishing cream hel...,คืนความยืดหยุ่นให้ผิวที่ร่วงโรยตามวัย จากการเป...
1,ONLY@CENTRAL Color : Almond Size : 32 A UK Ca...,ONLY@CENTRAL สี : เบจไซส์ : 32 A UK หมายเหตุ:...
2,ONLY@CENTRAL Color : Almond Size : 32 B UK Ca...,ONLY@CENTRAL สี : เบจไซส์ : 32 B UK หมายเหตุ:...
3,ONLY@CENTRAL Color : Almond Size : 32 C UK Ca...,ONLY@CENTRAL สี : เบจไซส์ : 32 C UK หมายเหตุ:...
4,ONLY@CENTRAL Color : Almond Size : 34 A UK Ca...,ONLY@CENTRAL สี : เบจไซส์ : 34 A UK หมายเหตุ:...


In [184]:
thai = Field(tokenize=tokenizeThai , init_token="<sos>", eos_token="<eos>")
english = Field(tokenize=tokenizeEng, lower=True, init_token="<sos>", eos_token="<eos>")

In [211]:
fields = [('en_text', english), ('th_text', thai)]
train_data, valid_data = TabularDataset.splits(
    path = "./",
    train = 'generated_reviews_yn.csv',
    test = 'thai_websites.csv',
    format = 'csv', 
    fields = fields
)

In [186]:
train_data =  DataFrameDataset(df, {'en_text': english, 'th_text': thai})
valid_data = DataFrameDataset(df2, {'en_text': english, 'th_text': thai})

In [212]:
train_data, valid_data, len(train_data), len(valid_data)

(<torchtext.legacy.data.dataset.TabularDataset at 0x7ff3113f0fd0>,
 <torchtext.legacy.data.dataset.TabularDataset at 0x7ff3113f0390>,
 280209,
 120281)

In [213]:
thai.build_vocab(train_data, max_size=10000, min_freq=3)
english.build_vocab(train_data, max_size=10000, min_freq=3)

In [214]:
print(f'Unique tokens in source (en) vocabulary: {len(english.vocab)}')
print(f'Unique tokens in target (th) vocabulary: {len(thai.vocab)}')

Unique tokens in source (en) vocabulary: 10004
Unique tokens in target (th) vocabulary: 10004


In [197]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

In [224]:
train_iterator, valid_iterator = BucketIterator.splits((train_data, valid_data), 
                                                                      batch_size = BATCH_SIZE, 
                                                                      sort_within_batch=True,
                                                                      sort_key=lambda x: len(x.en_text),
                                                                      device = device)

In [217]:
max_len_thai = []
max_len_eng = []
count = 0
for src, trg in zip(train_data.en_text, train_data.th_text):
  max_len_thai.append(len(src))
  max_len_eng.append(len(trg))
  if count < 10 :
    print("English - ",src, " Length - ", len(src))
    print("thai - ",trg, " Length - ", len(trg))
    print()
  count += 1
print("Maximum Length of English Sentence {} and Thai Sentence {} in the dataset".format(max(max_len_eng),max(max_len_thai)))
print("Minimum Length of English Sentence {} and Thai Sentence {} in the dataset".format(min(max_len_eng),min(max_len_thai)))

English -  ['en_text']  Length -  1
thai -  ['th', '_', 'text']  Length -  3

English -  ['we', 'are', 'trying', 'to', 'use', 'them', 'on', 'our', 'samsung', 'smart', 'tv', '.']  Length -  12
thai -  ['เรา', 'กำลัง', 'พยายาม', 'ใช้', 'พวกเขา', 'บน', 'Samsung', 'Smart', 'TV', 'ของ', 'เรา']  Length -  11

English -  ['this', 'thing', 'will', 'not', 'work', 'with', 'mac', 'osx', '10.4.7.1', '.']  Length -  10
thai -  ['สิ่ง', 'นี้', 'จะ', 'ไม่', 'ทำงาน', 'กับ', 'Mac', 'OSX', '10.4.7.1']  Length -  9

English -  ['we', 'are', 'very', 'happy', 'with', 'our', 'dyson', 'dc25', '.']  Length -  9
thai -  ['เรา', 'มีความสุข', 'มาก', 'กับ', 'Dyson', 'DC', '25', 'ของ', 'เรา']  Length -  9

English -  ['it', 'does', "n't", 'work', 'with', 'skype', '.']  Length -  7
thai -  ['มัน', 'ไม่', 'ทำงาน', 'กับ', 'Skype']  Length -  5

English -  ['i', "'ll", 'be', 'looking', 'at', 'cisco', 'next', '.']  Length -  8
thai -  ['ฉัน', 'จะ', 'ดู', 'Cisco', 'ต่อไป']  Length -  5

English -  ['however', ',', 'i', 

In [218]:
class EncoderLSTM(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
    super(EncoderLSTM, self).__init__()

    # Size of the one hot vectors that will be the input to the encoder
    self.input_size = input_size

    # Output size of the word embedding NN
    self.embedding_size = embedding_size

    # Dimension of the NN's inside the lstm cell/ (hs,cs)'s dimension.
    self.hidden_size = hidden_size

    # Number of layers in the lstm
    self.num_layers = num_layers

    # Regularization parameter
    self.dropout = nn.Dropout(p)
    self.tag = True

    # Shape --------------------> (5376, 300) [input size, embedding dims]
    self.embedding = nn.Embedding(self.input_size, self.embedding_size)
    
    # Shape -----------> (300, 2, 1024) [embedding dims, hidden size, num layers]
    self.LSTM = nn.LSTM(self.embedding_size, hidden_size, num_layers, dropout = p)

  # Shape of x (26, 32) [Sequence_length, batch_size]
  def forward(self, x):

    # Shape -----------> (26, 32, 300) [Sequence_length , batch_size , embedding dims]
    embedding = self.dropout(self.embedding(x))
    
    # Shape --> outputs (26, 32, 1024) [Sequence_length , batch_size , hidden_size]
    # Shape --> (hs, cs) (2, 32, 1024) , (2, 32, 1024) [num_layers, batch_size size, hidden_size]
    outputs, (hidden_state, cell_state) = self.LSTM(embedding)

    return hidden_state, cell_state

input_size_encoder = len(english.vocab)
encoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
encoder_dropout = float(0.5)

encoder_lstm = EncoderLSTM(input_size_encoder, encoder_embedding_size,
                           hidden_size, num_layers, encoder_dropout).to(device)
print(encoder_lstm)

EncoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(10004, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
)


In [219]:
class DecoderLSTM(nn.Module):
  def __init__(self, input_size, embedding_size, hidden_size, num_layers, p, output_size):
    super(DecoderLSTM, self).__init__()

    # Size of the one hot vectors that will be the input to the encoder
    self.input_size = input_size

    # Output size of the word embedding NN
    self.embedding_size = embedding_size

    # Dimension of the NN's inside the lstm cell/ (hs,cs)'s dimension.
    self.hidden_size = hidden_size

    # Number of layers in the lstm
    self.num_layers = num_layers

    # Size of the one hot vectors that will be the output to the encoder (English Vocab Size)
    self.output_size = output_size

    # Regularization parameter
    self.dropout = nn.Dropout(p)
    self.tag = True

    # Shape --------------------> (5376, 300) [input size, embedding dims]
    self.embedding = nn.Embedding(self.input_size, self.embedding_size)

    # Shape -----------> (300, 2, 1024) [embedding dims, hidden size, num layers]
    self.LSTM = nn.LSTM(self.embedding_size, hidden_size, num_layers, dropout = p)

    # Shape -----------> (1024, 4556) [embedding dims, hidden size, num layers]
    self.fc = nn.Linear(self.hidden_size, self.output_size)

  # Shape of x (32) [batch_size]
  def forward(self, x, hidden_state, cell_state):

    # Shape of x (1, 32) [1, batch_size]
    x = x.unsqueeze(0)

    # Shape -----------> (1, 32, 300) [1, batch_size, embedding dims]
    embedding = self.dropout(self.embedding(x))

    # Shape --> outputs (1, 32, 1024) [1, batch_size , hidden_size]
    # Shape --> (hs, cs) (2, 32, 1024) , (2, 32, 1024) [num_layers, batch_size size, hidden_size] (passing encoder's hs, cs - context vectors)
    outputs, (hidden_state, cell_state) = self.LSTM(embedding, (hidden_state, cell_state))

    # Shape --> predictions (1, 32, 4556) [ 1, batch_size , output_size]
    predictions = self.fc(outputs)

    # Shape --> predictions (32, 4556) [batch_size , output_size]
    predictions = predictions.squeeze(0)

    return predictions, hidden_state, cell_state

input_size_decoder = len(thai.vocab)
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
decoder_dropout = float(0.5)
output_size = len(thai.vocab)

decoder_lstm = DecoderLSTM(input_size_decoder, decoder_embedding_size,
                           hidden_size, num_layers, decoder_dropout, output_size).to(device)
print(decoder_lstm)

DecoderLSTM(
  (dropout): Dropout(p=0.5, inplace=False)
  (embedding): Embedding(10004, 300)
  (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  (fc): Linear(in_features=1024, out_features=10004, bias=True)
)


In [220]:
from torch.utils.tensorboard import SummaryWriter
from torchsummary import summary

In [221]:
class Seq2Seq(nn.Module):
  def __init__(self, Encoder_LSTM, Decoder_LSTM):
    super(Seq2Seq, self).__init__()
    self.Encoder_LSTM = Encoder_LSTM
    self.Decoder_LSTM = Decoder_LSTM

  def forward(self, source, target, tfr=0.5):
    # Shape - Source : (10, 32) [(Sentence length English + some padding), Number of Sentences]
    batch_size = source.shape[1]

    # Shape - Source : (14, 32) [(Sentence length Thai + some padding), Number of Sentences]
    target_len = target.shape[0]
    target_vocab_size = len(thai.vocab)
    
    # Shape --> outputs (14, 32, 5766) 
    outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

    # Shape --> (hs, cs) (2, 32, 1024) ,(2, 32, 1024) [num_layers, batch_size size, hidden_size] (contains encoder's hs, cs - context vectors)
    hidden_state_encoder, cell_state_encoder = self.Encoder_LSTM(source)

    # Shape of x (32 elements)
    x = target[0] # Trigger token <SOS>

    for i in range(1, target_len):
      # Shape --> output (32, 5766) 
      output, hidden_state_decoder, cell_state_decoder = self.Decoder_LSTM(x, hidden_state_encoder, cell_state_encoder)
      outputs[i] = output
      best_guess = output.argmax(1) # 0th dimension is batch size, 1st dimension is word embedding
      x = target[i] if random.random() < tfr else best_guess # Either pass the next word correctly from the dataset or use the earlier predicted word

    # Shape --> outputs (14, 32, 5766) 
    return outputs

learning_rate = 0.001
writer = SummaryWriter(f"runs/loss_plot")
step = 0

model = Seq2Seq(encoder_lstm, decoder_lstm).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = thai.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

print(model)

Seq2Seq(
  (Encoder_LSTM): EncoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10004, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
  )
  (Decoder_LSTM): DecoderLSTM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embedding): Embedding(10004, 300)
    (LSTM): LSTM(300, 1024, num_layers=2, dropout=0.5)
    (fc): Linear(in_features=1024, out_features=10004, bias=True)
  )
)


In [222]:

def translate_sentence(model, sentence, english, thai, device, max_length=50):

    if type(sentence) == str:
        tokens = tokenizeEng(sentence)
    else:
        tokens = [token.lower() for token in sentence]
    tokens.insert(0, english.init_token)
    tokens.append(english.eos_token)
    text_to_indices = [english.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.Encoder_LSTM(sentence_tensor)

    outputs = [thai.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.Decoder_LSTM(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == thai.vocab.stoi["<eos>"]:
            break

    translated_sentence = [thai.vocab.itos[idx] for idx in outputs]
    return translated_sentence[1:]

In [233]:
def bleu(data, model, english, thai, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["en_text"]
        trg = vars(example)["th_text"]

        prediction = translate_sentence(model, src, english, thai, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)
def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    print('saving')
    print()
    state = {'model': model,'best_loss': best_loss,'epoch': epoch,'rng_state': torch.get_rng_state(), 'optimizer': optimizer.state_dict(),}
    torch.save(state, '/content/checkpoint-NMT')
    torch.save(model.state_dict(),'/content/checkpoint-NMT-SD')

In [238]:
epoch_loss = 0.0
num_epochs = 100
best_loss = 999999
best_epoch = -1
sentence1 = "This is a dog"
ts1 = []

In [245]:
def loadCheckpoint():
    global epoch_loss, best_epoch, best_loss, model, optimizer, torch
    checkpoint_NMT = torch.load('./checkpoint-NMT')
    model.load_state_dict(checkpoint_NMT['model'].state_dict())
    optimizer.load_state_dict(checkpoint_NMT['optimizer'])
    best_epoch = checkpoint_NMT['epoch']
    best_loss = checkpoint_NMT['best_loss']
    epoch_loss = checkpoint_NMT['best_loss']
    torch.set_rng_state(checkpoint_NMT['rng_state'])
loadCheckpoint()

In [249]:
epoch_loss, best_loss, best_epoch

(37662.80535519123, 37662.80535519123, 0)

In [225]:
continue_epoch = best_epoch + 1
for epoch in range(continue_epoch ,num_epochs):
  print("Epoch - {} / {}".format(epoch+1, num_epochs))
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, english, thai, device, max_length=50)
  print(f"Translated example sentence 1: \n {translated_sentence1}")
  ts1.append(translated_sentence1)

  model.train(True)
  for batch_idx, batch in enumerate(train_iterator):
    input = batch.en_text.to(device)
    target = batch.th_text.to(device)

    # Pass the input and target for model's forward method
    output = model(input, target)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    # Clear the accumulating gradients
    optimizer.zero_grad()

    # Calculate the loss value for every epoch
    loss = criterion(output, target)

    # Calculate the gradients for weights & biases using back-propagation
    loss.backward()

    # Clip the gradient value is it exceeds > 1
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    # Update the weights values using the gradients we calculated using bp 
    optimizer.step()
    step += 1
    epoch_loss += loss.item()
    writer.add_scalar("Training loss", loss, global_step=step)

  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss) 
    if ((epoch - best_epoch) >= 10):
      print("no improvement in 10 epochs, break")
      break
  print("Epoch_Loss - {}".format(loss.item()))
  print()
  
print(epoch_loss / len(train_iterator))

#score = bleu(valid_data[1:100], model, english, thai, device)
#print(f"Bleu score {score*100:.2f}")

Epoch - 1 / 100
Translated example sentence 1: 
 ['Joe', 'หนังสือประวัติศาสตร์', 'ต้อ', '"?', 'คล้อง', 'ต้อ', 'แคตตาล็อก', 'การสืบสวน', 'เกียร์', 'Ethernet', 'เกียร์', 'บอก', 'หวือหวา', 'แม้แต่', '!"', '!"', 'อัตชีวประวัติ', 'กี้', 'กี้', 'เนื่องเพราะ', 'สหภาพโซเวียต', 'เนื่องเพราะ', 'สหภาพโซเวียต', 'pdf', 'pdf', 'pdf', '52', '52', 'เด็กหญิง', 'Iron', 'ตัก', 'ตัก', 'ก็ได้', 'ก็ได้', 'ก็ได้', 'กี้', 'กี้', 'จุดสิ้นสุด', 'มีรส', 'มีรส', 'iBook', 'มีรส', 'iBook', 'กําเนิด', 'เอื้อมมือ', 'มีรส', 'มีรส', 'เอื้อมมือ', 'มีรส', 'Crichton']
saving

Epoch_Loss - 3.5097827911376953

Epoch - 2 / 100
Translated example sentence 1: 
 ['<unk>', '<unk>', '<unk>', '<eos>']
Epoch_Loss - 3.948538064956665

Epoch - 3 / 100
Translated example sentence 1: 
 ['<unk>', '<eos>']
Epoch_Loss - 3.565189838409424

Epoch - 4 / 100
Translated example sentence 1: 
 ['<unk>', 'ไม่', '<eos>']
Epoch_Loss - 4.102074146270752

Epoch - 5 / 100
Translated example sentence 1: 
 ['<unk>', 'มี', '<unk>', 'แผ่', '<eos>']
Epoch_

KeyboardInterrupt: ignored