# Machine Translation

In [1]:


#!pip install data_loader
import utils
import config
import logging
import numpy as np
from tqdm import tqdm

import torch
from torch.utils.data import DataLoader
#%cd Transformer-Pytorch
from data_loader import MTDataset
from model import make_model , LabelSmoothing
from train import MultiGPULossCompute
from utils import france_tokenizer_load
from beam_decoder import beam_search
from model import batch_greedy_decode
import sacrebleu
import warnings
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import json

files = ['train', 'dev', 'test']
fr_path ='./data/corpus.fr'
en_path ='./data/corpus.en'
fr_lines = []
en_lines = []

for file in files:
    corpus = json.load(open('./data/json/' + file +'.json', 'r', encoding="utf8"))
    for item in corpus.values():
        fr_lines.append(item[1]+'\n')
        en_lines.append(item[0] + '\n')

with open(fr_path, "w", encoding="utf8") as fch:
    fch.writelines(fr_lines)

with open(en_path, "w", encoding="utf8") as fen:
    fen.writelines(en_lines)

print("lines of france: ", len(fr_lines))

print("lines of English: ", len(en_lines))
print(" -------- Get Corpus ! -------- ")

lines of france:  112000
lines of English:  112000
 -------- Get Corpus ! -------- 


In [3]:
import sentencepiece as spm

def train(input_file, vocab_size, model_name, model_type, character_coverage):
  
    input_argument = '--input=%s --model_prefix=%s --vocab_size=%s --model_type=%s --character_coverage=%s ' \
                     '--pad_id=0 --unk_id=1 --bos_id=2 --eos_id=3' % (
                         input_file, model_name, vocab_size, model_type, character_coverage
                     )
    spm.SentencePieceTrainer.Train(input_argument)


def piece_run():
    # ===== English =====
    en_input = './data/corpus.en'  # raw input corpus file
    en_vocab_size = 32000
    en_model_name = './tokenizer/eng'
    en_model_type = 'bpe'
    en_character_coverage = 1.0
    train(en_input, en_vocab_size, en_model_name, en_model_type, en_character_coverage)

    # ===== French =====
    fr_input = './data/corpus.fr'
    fr_vocab_size = 32000
    fr_model_name = './tokenizer/fra'
    fr_model_type = 'bpe'
    fr_character_coverage = 0.9995
    train(fr_input, fr_vocab_size, fr_model_name, fr_model_type, fr_character_coverage)


# Run the tokenizer training
#piece_run()

import sentencepiece as spm

# Load the existing models
sp_en = spm.SentencePieceProcessor()
sp_en.Load('./tokenizer/eng.model')

sp_fr = spm.SentencePieceProcessor()
sp_fr.Load('./tokenizer/fra.model')

# Test
print(sp_en.EncodeAsPieces("Hello, how are you?"))
print(sp_fr.EncodeAsPieces("Bonjour, comment allez-vous?"))


['▁Hello', ',', '▁how', '▁are', '▁you', '?']
['▁Bonjour', ',', '▁comment', '▁allez', '-', 'vous', '?']


In [4]:
#!unzip -d ./tokenizer ./tokenizer/token.zip

In [5]:
print("GPU quantity:", torch.cuda.device_count())
print("Whether GPU is available:", torch.cuda.is_available())
import os 
os.environ['CUDA_VISIBLE_DEVICES']='0'

GPU quantity: 1
Whether GPU is available: True


In [6]:
import torch
# Model dimensions
d_model=512
n_heads=8
n_layers=6
d_k=64
d_v=64
d_ff=2048
dropout=0.1

# Special label index
padding_idx=0
bos_idx=2
eos_idx=3

#Vocabulary size
src_vocab_size=32000
tgt_vocab_size=32000
#Training parameters
batch_size=32
epoch_num=2
early_stop=5
lr=3e-5

#Decoding partameters
max_len=60
beam_size=3

#other options
use_smoothing=False
use_naomopt=True
# Data model path
data_dir='./data'
train_data_path='./data/json/train.json'
dev_data_path='./data/json/dev.json'
test_data_path='./data/json/test.json'
model_path='./experiment/model.pth'
pretrain_model_path='./pretrain/model.pth'
log_path='./experiment/train.log'
output_path='./experiment/output.txt'

# GPU and device setting
gpu_id=''
device_id=[0]
#set device
if gpu_id!='':
    device=torch.device(f"cuda:{gpu_id}")
else:
    device=torch.device('cpu')

### Obtain data

In [7]:
import json
import numpy as np
import torch
from torch.autograd import Variable
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from utils import english_tokenizer_load, france_tokenizer_load

from utils import english_tokenizer_load, france_tokenizer_load
import config

DEVICE = config.device


# Generate subsequent positional mask (for decoder)
def subsequent_mask(size):
    attn_shape = (1, size, size)
    subsequent_mask = np.triu(np.ones(attn_shape), k=1).astype("uint8")
    return torch.from_numpy(subsequent_mask) == 0


class Batch:
    def __init__(self, src, trg=None, pad=0, src_text=None, trg_text=None):
        self.src_text = src_text
        self.trg_text = trg_text
        self.src = src.to(DEVICE)
        self.src_mask = (src != pad).unsqueeze(-2)
        if trg is not None:
            self.trg = trg[:, :-1].to(DEVICE)
            self.trg_y = trg[:, 1:].to(DEVICE)
            self.trg_mask = self.make_std_mask(self.trg, pad)
            self.ntokens = (self.trg_y != pad).data.sum()


    @staticmethod
    def make_std_mask(trg, pad):
        trg_mask = (trg != pad).unsqueeze(-2)
        trg_mask = trg_mask & subsequent_mask(trg.size(-1)).type_as(trg_mask.data)
        return trg_mask


class MTDataset(Dataset):
    def __init__(self, data_path):
        # Load English and French sentences
        self.out_en_sent, self.out_fr_sent = self.get_dataset(data_path, sort=True)

        # Load tokenizers
        self.sp_eng = english_tokenizer_load()
        self.sp_fra = france_tokenizer_load()

        # Special tokens
        self.PAD = self.sp_eng.pad_id()
        self.BOS = self.sp_eng.bos_id()
        self.EOS = self.sp_eng.eos_id()

    @staticmethod
    def len_argsort(seq):
        """Sort indexes based on the sequence length."""
        return sorted(range(len(seq)), key=lambda x: len(seq[x]))
    def get_dataset(self, data_path, sort=False):
        dataset = json.load(open(data_path, 'r'))  # load the JSON dict
        out_en_sent = []
        out_fr_sent = []

        # iterate over the dictionary keys in order
        for key in sorted(dataset.keys(), key=int):
            out_en_sent.append(dataset[key][0])  # English sentence
            out_fr_sent.append(dataset[key][1])  # French sentence

        if sort:
            sorted_index = self.len_argsort(out_en_sent)
            out_en_sent = [out_en_sent[i] for i in sorted_index]
            out_fr_sent = [out_fr_sent[i] for i in sorted_index]

        return out_en_sent, out_fr_sent



    def __getitem__(self, idx):
        eng_text = self.out_en_sent[idx]
        fra_text = self.out_fr_sent[idx]
        src = [self.BOS] + self.sp_eng.EncodeAsIds(eng_text) + [self.EOS]
        trg = [self.BOS] + self.sp_fra.EncodeAsIds(fra_text) + [self.EOS]
        return [src, trg, fra_text]  # include raw text

    def __len__(self):
        """Return the number of examples in the dataset"""
        return len(self.out_en_sent)
    def collate_fn(self, batch):
        src_tokens = [torch.tensor(sample[0], dtype=torch.long) for sample in batch]
        tgt_tokens = [torch.tensor(sample[1], dtype=torch.long) for sample in batch]
        trg_text = [sample[2] for sample in batch]

        src_tensor = pad_sequence(src_tokens, batch_first=True, padding_value=0)
        tgt_tensor = pad_sequence(tgt_tokens, batch_first=True, padding_value=0)

        #return Batch(src_tensor, tgt_tensor, trg_text=trg_text)
        return Batch(src_tensor, tgt_tensor, pad=0, trg_text=trg_text)
   



In [8]:
train_dataset = MTDataset(config.train_data_path)
dev_dataset = MTDataset(config.dev_data_path)
test_dataset = MTDataset(config.test_data_path)

# logging.info("
print(" -------- Dataset Build! -")
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=config.batch_size,
collate_fn=train_dataset.collate_fn)
dev_dataloader = DataLoader(dev_dataset, shuffle=False, batch_size=config.batch_size,
collate_fn=dev_dataset.collate_fn)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=config.batch_size,
collate_fn=test_dataset.collate_fn)

 -------- Dataset Build! -


### Define the learning rate initialize model and define loss fn

In [9]:
import torch

class NoamOpt:
    """Optim wrapper that implements the Noam learning rate schedule."""

    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self._step = 0
        self.warmup = warmup
        self.factor = factor
        self.model_size = model_size
        self._rate = 0

    def step(self):
        """Update parameters and rate"""
        self._step += 1
        rate = self.rate()
        for p in self.optimizer.param_groups:
            p['lr'] = rate  # Corrected from 'Ir' to 'lr'
        self._rate = rate
        self.optimizer.step()

    def rate(self, step=None):
        """Compute learning rate at current step"""
        if step is None:
            step = self._step
        return self.factor * (self.model_size ** (-0.5) *
                              min(step ** (-0.5), step * self.warmup ** (-1.5)))


def get_std_opt(model):
    """Standard Noam optimizer for a model"""
    return NoamOpt(
        model_size=model.src_embed[0].d_model,  # Assuming src_embed is a list or nn.ModuleList
        factor=1,
        warmup=10000,
        optimizer=torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9)
    )


# Initialize the model
model = make_model(
    config.src_vocab_size,
    config.tgt_vocab_size,
    config.n_layers,
    config.d_model,
    config.d_ff,
    config.n_heads,
    config.dropout
)

# Load pre-trained weights
model.load_state_dict(torch.load(config.pretrain_model_path))

# Wrap in DataParallel if multiple GPUs
model_par = torch.nn.DataParallel(model)

# Loss function and optimizer
if config.use_smoothing:
    criterion = LabelSmoothing(
        size=config.tgt_vocab_size,
        padding_idx=config.padding_idx,
        smoothing=0.1
    )
    criterion.cuda()
else:
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

if config.use_noamopt:
    optimizer = get_std_opt(model)
else:
    optimizer = torch.optim.AdamW(model.parameters(), lr=config.lr)


### Define training and verify the functions

In [10]:
import torch
from tqdm import tqdm
import logging
import sacrebleu

def run_epoch(data, model, loss_compute):
    total_tokens = 0
    total_loss = 0

    for batch in tqdm(data):
        out = model(batch.src, batch.trg, batch.src_mask, batch.trg_mask)
        loss = loss_compute(out, batch.trg_y, batch.ntokens)
        total_loss += loss
        total_tokens += batch.ntokens
    
    return total_loss / total_tokens


def train(train_data, dev_data, model, model_par, criterion, optimizer):
    """Training loop with early stopping and BLEU evaluation."""
    best_bleu_score = 0.0
    early_stop_counter = config.early_stop

    for epoch in range(1, config.epoch_num + 1):
        # ===== Training =====
        model.train()
        train_loss = run_epoch(
            train_data, 
            model_par,
            MultiGPULossCompute(model.generator, criterion, config.device_id, optimizer)
        )
        print("Epoch: {}, Train loss: {:.4f}".format(epoch, train_loss))

        # ===== Evaluation =====
        model.eval()
        dev_loss = run_epoch(
            dev_data, 
            model_par,
            MultiGPULossCompute(model.generator, criterion, config.device_id, None)
        )

        bleu_score = evaluate(dev_data, model)
        print("Epoch: {}, Dev loss: {:.4f}, BLEU Score: {:.2f}".format(epoch, dev_loss, bleu_score))

        # ===== Checkpointing =====
        if bleu_score > best_bleu_score:
            torch.save(model.state_dict(), config.model_path)
            best_bleu_score = bleu_score
            early_stop_counter = config.early_stop
            print(" -------- Save Best Model! -------- ")
        else:
            early_stop_counter -= 1
            print("Early Stop Left: {}".format(early_stop_counter))
            if early_stop_counter == 0:
                print(" -------- Early Stop! -------- ")
                break


def evaluate(data, model, mode='dev', use_beam=True):
    sp_fra = france_tokenizer_load()
    trg = []
    res = []
    first_batch = True  # only print once
    with torch.no_grad():
        for batch in tqdm(data):
            fr_sent = batch.trg_text
            src = batch.src
            src_mask = (src != 0).unsqueeze(-2)
            if use_beam:
                decode_result, _ = beam_search(
                    model, src, src_mask, config.max_len,
                    config.padding_idx, config.bos_idx, config.eos_idx,
                    config.beam_size, config.device
                )
            else:
                decode_result = batch_greedy_decode(
                    model, src, src_mask, max_len=config.max_len
                )

            # ===== DEBUG: add these lines =====
            if first_batch:
                print("type(decode_result):", type(decode_result))
                print("len(decode_result):", len(decode_result))
                print("type(decode_result[0]):", type(decode_result[0]))
                print("decode_result[0]:", decode_result[0][:5])  # first 5 elements
                if isinstance(decode_result[0], list) and len(decode_result[0]) > 0:
                    print("type(decode_result[0][0]):", type(decode_result[0][0]))
                    print("decode_result[0][0]:", decode_result[0][0])
                first_batch = False
            # ===== END DEBUG =====

            # Then try decoding based on what you see
            translation = [sp_fra.decode_ids([int(x) for x in _s[0]]) for _s in decode_result]
            trg.extend(fr_sent)
            res.extend(translation)

    bleu = sacrebleu.corpus_bleu(res, [trg])
    return float(bleu.score)


In [11]:
print("------------- Get Dataloader-------------")
train(train_dataloader, dev_dataloader, model, model_par,criterion, optimizer)

------------- Get Dataloader-------------


100%|██████████| 3125/3125 [03:54<00:00, 13.31it/s]


Epoch: 1, Train loss: 3.9138


100%|██████████| 63/63 [00:01<00:00, 41.81it/s]
  2%|▏         | 1/63 [00:00<00:10,  5.86it/s]

type(decode_result): <class 'list'>
len(decode_result): 32
type(decode_result[0]): <class 'list'>
decode_result[0]: [[2986, 145, 3], [2179, 145, 3], [2986, 31951, 107]]
type(decode_result[0][0]): <class 'list'>
decode_result[0][0]: [2986, 145, 3]


100%|██████████| 63/63 [00:14<00:00,  4.43it/s]


Epoch: 1, Dev loss: 1.9242, BLEU Score: 23.83
 -------- Save Best Model! -------- 


100%|██████████| 3125/3125 [03:53<00:00, 13.40it/s]


Epoch: 2, Train loss: 1.9159


100%|██████████| 63/63 [00:01<00:00, 41.44it/s]
  2%|▏         | 1/63 [00:00<00:11,  5.27it/s]

type(decode_result): <class 'list'>
len(decode_result): 32
type(decode_result[0]): <class 'list'>
decode_result[0]: [[2179, 145, 3], [2179, 1668, 145], [2179, 70, 145]]
type(decode_result[0][0]): <class 'list'>
decode_result[0][0]: [2179, 145, 3]


100%|██████████| 63/63 [00:15<00:00,  4.17it/s]


Epoch: 2, Dev loss: 1.6383, BLEU Score: 29.58
 -------- Save Best Model! -------- 


### Test the model with the test dataset

In [13]:
def test(data, model, criterion):
    with torch.no_grad():
    # Load the model.
        model.load_state_dict(torch.load(config.model_path))
        model_par =torch.nn.DataParallel(model)
        model.eval()
        
        test_loss = run_epoch(data, model_par,
        MultiGPULossCompute(model.generator, criterion, config.device_id, None))
        bleu_score = evaluate(data, model, 'test')
        
        print('Test loss: {}, Bleu Score: {}'.format(test_loss, bleu_score))

test(test_dataloader, model, criterion)


100%|██████████| 313/313 [00:06<00:00, 49.69it/s]
  0%|          | 1/313 [00:00<00:59,  5.29it/s]

type(decode_result): <class 'list'>
len(decode_result): 32
type(decode_result[0]): <class 'list'>
decode_result[0]: [[2179, 145, 3], [2179, 1668, 145], [2179, 70, 145]]
type(decode_result[0][0]): <class 'list'>
decode_result[0][0]: [2179, 145, 3]


100%|██████████| 313/313 [01:29<00:00,  3.52it/s]


Test loss: 1.4554940462112427, Bleu Score: 31.12699291913909


### Make inference 

In [19]:
from utils import english_tokenizer_load_inf
from utils import france_tokenizer_load_inf
import torch
import numpy as np

def translate(src, model, use_beam=True):
    """Use the trained model to translate a single sentence and print the translation result."""
    sp_fra = france_tokenizer_load_inf()
    with torch.no_grad():
        model.load_state_dict(torch.load(config.model_path))
        model.eval()
        src_mask = (src != 0).unsqueeze(-2)  
        if use_beam:
            decode_result, _ = beam_search(
                model, src, src_mask, config.max_len,
                config.padding_idx, config.bos_idx, config.eos_idx,
                config.beam_size, config.device
            )
            decode_result = [h[0] for h in decode_result]  
        else:
            decode_result = batch_greedy_decode(model, src, src_mask, max_len=config.max_len)
        
        translation = [sp_fra.decode_ids(_s) for _s in decode_result]
        print(translation[0])


def one_sentence_translate(sent, beam_search_flag=True):
    """Translate a single English sentence."""
    # Initialize the model
    model = make_model(
        config.src_vocab_size, config.tgt_vocab_size, config.n_layers,
        config.d_model, config.d_ff, config.n_heads, config.dropout
    )
    BOS = english_tokenizer_load_inf().bos_id()  # typically 2
    EOS = english_tokenizer_load_inf().eos_id()  # typically 3
    src_tokens = [[BOS] + english_tokenizer_load_inf().EncodeAsIds(sent) + [EOS]]
    batch_input = torch.LongTensor(np.array(src_tokens)).to(config.device)
    translate(batch_input, model, use_beam=beam_search_flag)


def translate_example():
    """Translation result example of a single sentence"""
    sent = "He believes that he can prove it"
    one_sentence_translate(sent, beam_search_flag=True)


translate_example()


Il croit cela peut le prouver.
