# installations

In [1]:
# !pip install -U 'spacy[cuda-autodetect]' -q

In [None]:
!python -m spacy download en_core_web_sm
!pip install datasets

# !python -m spacy download es_core_news_sm

# imports

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import math
import copy
from functools import partial
import pandas as pd

from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [5]:
random_seed = 42

# multi head attention

In [6]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, Q, K, V, mask=None):
        attn_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, -1e9)

        attn_probs = torch.softmax(attn_scores, dim=-1)
        output = torch.matmul(attn_probs, V)
        return output

    def split_heads(self, x):
        batch_size, seq_len, d_model = x.size()
        return x.reshape(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)

    def combine_heads(self, x):
        batch_size, _, seq_len, d_k = x.size()
        return x.transpose(1, 2).reshape(batch_size, seq_len, self.d_model)

    def forward(self, Q, K, V, mask=None):
        Q = self.split_heads(self.W_q(Q))
        K = self.split_heads(self.W_k(K))
        V = self.split_heads(self.W_v(V))

        attn_output = self.scaled_dot_product_attention(Q, K, V, mask)
        output = self.W_o(self.combine_heads(attn_output))
        return output


# PositionWiseFeedForward

In [7]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(PositionWiseFeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

# positional encoding

In [8]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_seq_length):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_seq_length, d_model, device=device)
        position = torch.arange(0, max_seq_length, dtype=torch.float, device=device).unsqueeze(1)
        div_term = torch.pow(10_000, (-torch.arange(0, d_model, 2, device=device).float() / d_model))

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        return self.register_buffer('pe', pe.unsqueeze(0))


    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

# encoder

In [9]:
class EncoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.self_attn(x, x, x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x


# decoder layer

In [10]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()

        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)


    def forward(self, x, enc_output, src_mask, tgt_mask):
        attn_output = self.self_attn(x, x, x, tgt_mask)
        x = self.norm1(x + self.dropout(attn_output))
        attn_output = self.cross_attn(x, enc_output, enc_output, src_mask)
        x = self.norm2(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

# transformer

In [11]:
class Transformer(nn.Module):
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout):
        super(Transformer, self).__init__()
        self.encoder_embedding = nn.Embedding(src_vocab_size, d_model)
        self.decoder_embedding = nn.Embedding(tgt_vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model, max_seq_length)

        self.encoder_layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])

        self.fc = nn.Linear(d_model, tgt_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def generate_mask(self, src, tgt):
        src_mask = (src != 0).unsqueeze(1).unsqueeze(2)
        tgt_mask = (tgt != 0).unsqueeze(1).unsqueeze(3)
        seq_length = tgt.size(1)
        nopeak_mask = (1 - torch.triu(torch.ones(1, seq_length, seq_length, device=device), diagonal=1)).bool()
        tgt_mask = tgt_mask & nopeak_mask
        return src_mask, tgt_mask

    def forward(self, src, tgt):
        src_mask, tgt_mask = self.generate_mask(src, tgt)

        src_embedded = self.dropout(self.positional_encoding(self.encoder_embedding(src)))
        tgt_embedded = self.dropout(self.positional_encoding(self.decoder_embedding(tgt)))

        enc_output = src_embedded
        for enc_layer in self.encoder_layers:
            enc_output = enc_layer(enc_output, src_mask)

        dec_output = tgt_embedded
        for dec_layer in self.decoder_layers:
            dec_output = dec_layer(dec_output, enc_output, src_mask, tgt_mask)

        output = self.fc(dec_output)
        return output

# data

## change these paths

In [12]:
lines_en = open('/kaggle/input/translation-data-full/train.en.txt', encoding='utf-8').\
        read().strip().split('\n')
lines_vi = open('/kaggle/input/translation-data-full/train.vi.txt', encoding='utf-8').\
        read().strip().split('\n')

lines_en_val = open('/kaggle/input/translation-data-full/val.en.txt', encoding='utf-8').\
        read().strip().split('\n')
lines_vi_val = open('/kaggle/input/translation-data-full/val.vi.txt', encoding='utf-8').\
        read().strip().split('\n')

lines_en_test = open('/kaggle/input/translation-data-full/test.en.txt', encoding='utf-8').\
        read().strip().split('\n')
lines_vi_test = open('/kaggle/input/translation-data-full/test.vi.txt', encoding='utf-8').\
        read().strip().split('\n')


In [13]:
lines_train = []
lines_val = []
lines_test = []

for i in range(len(lines_en)):
    try:
        lmao = '\t'.join([lines_en[i], lines_vi[i]])
        lines_train.append(lmao)
    except Exception as e:
        print(f"Unexpected error: {e}")
        continue
for i in range(len(lines_en_val)):
    try:
        lmao = '\t'.join([lines_en_val[i], lines_vi_val[i]])
        lines_val.append(lmao)
    except Exception as e:
        print(f"Unexpected error: {e}")
        continue
for i in range(len(lines_en_val)):
    try:
        lmao = '\t'.join([lines_en_test[i], lines_vi_test[i]])
        lines_test.append(lmao)
    except Exception as e:
        print(f"Unexpected error: {e}")
        continue

train_lines = lines_train
val_lines = lines_val
test_lines = lines_test

In [16]:
import re
def clean_text(sentence):
    sentence = re.sub(r" &quot;", "\"", sentence)  # Handle &quot;
    sentence = re.sub(r" &apos;", "'", sentence)  # Handle &apos;
    sentence = re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(sentence))
    sentence = re.sub(r"[ ]+", " ", sentence)
    sentence = re.sub(r"\!+", "!", sentence)
    sentence = re.sub(r"\,+", ",", sentence)
    sentence = re.sub(r"\?+", "?", sentence)

    # Lowercase the sentence
    sentence = sentence.lower()

    return sentence

In [17]:
train_lines = [clean_text(line) for line in train_lines]
val_lines = [clean_text(line) for line in val_lines]
test_lines = [clean_text(line) for line in test_lines]

In [18]:
test_lines[:5]

['borders between the bosniak territory and the bosnian serb territory have calmed down .\tbiên giới giữa lãnh thỗ người bosnia theo đạo hồi và lãnh thổ người bosnia serbi đã bình ổn .',
 'we created the responsive room where the lights music and blinds adjusted to your state .\tchúng tôi đã tạo ra một phòng cảm ứng trong đó ánh sáng , nhạc và rèm cửa được tuỳ chỉnh theo trạng thái của bạn .',
 'we are seeing the rise of female sexual expression .\tchúng ta đang thấy sự trỗi dậy trong việc biểu lộ giới tính của phụ nữ .',
 'now , the first of these transformations is going to happen anyway .\thiện giờ , cuộc biến đổi đầu tiên rốt cuộc cũng sắp sửa xảy ra .',
 'do you know what it is ? anyone ?\tcác bạn có biết là gì không ? có ai biết không ?']

# Preprocess Data

In [19]:
SRC_LANGUAGE = "en"
TGT_LANGUAGE = "vi"

In [20]:
tokenizer = {}
tokenizer[SRC_LANGUAGE] = get_tokenizer("spacy", "en_core_web_sm")
tokenizer[TGT_LANGUAGE] = get_tokenizer("spacy", "en_core_web_sm")

## make dataset

In [21]:
class SentencePairDataset(Dataset):
    def __init__(self, lines, src_tokenizer, tgt_tokenizer):
        super(SentencePairDataset, self).__init__()

        self.lines = lines
        self.src_tokenizer = src_tokenizer
        self.tgt_tokenizer = tgt_tokenizer

    def __len__(self):
        return len(self.lines)

    def __getitem__(self, idx):
        line = self.lines[idx]

        src, tgt = line.split('\t')
        src_tokens = self.src_tokenizer(src)
        tgt_tokens = self.tgt_tokenizer(tgt)

        return src_tokens, tgt_tokens

In [22]:
train_ds = SentencePairDataset(train_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])
val_ds = SentencePairDataset(val_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])
test_ds = SentencePairDataset(test_lines, tokenizer[SRC_LANGUAGE], tokenizer[TGT_LANGUAGE])

## vocabulary

In [23]:
vocab = {}

In [24]:
src_vocab_size = 10_000
tgt_vocab_size = 10_000
max_seq_len = 100

PAD_IDX = 0
UNK_IDX = 1
BOS_IDX = 2
EOS_IDX = 3

special_symbols = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']

In [25]:
def yield_tokens(dataset, lang_idx=0):
    n = len(dataset)
    i = 0

    while i < n:
        yield dataset[i][lang_idx]
        i += 1

In [26]:
src_iterator = yield_tokens(train_ds, lang_idx=0)
tgt_iterator = yield_tokens(train_ds, lang_idx=1)

In [27]:
vocab[SRC_LANGUAGE] = build_vocab_from_iterator(
    src_iterator,
    min_freq=1,
    specials=special_symbols,
    special_first=True,
    max_tokens=src_vocab_size,
)
vocab[TGT_LANGUAGE] = build_vocab_from_iterator(
    tgt_iterator,
    min_freq=1,
    specials=special_symbols,
    special_first=True,
    max_tokens=tgt_vocab_size,
)

In [28]:
vocab[SRC_LANGUAGE].set_default_index(UNK_IDX)
vocab[TGT_LANGUAGE].set_default_index(UNK_IDX)

In [29]:
def collate_fn(batch, vocab):
    batch_size = len(batch)
    srcs, tgts = zip(*batch)
    src_vectors = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)
    tgt_vectors = torch.zeros((batch_size, max_seq_len), dtype=torch.long, device=device)

    for i in range(batch_size):
        src_vectors[i] = torch.tensor(([BOS_IDX] + vocab[SRC_LANGUAGE](srcs[i]) + [EOS_IDX] + [0] * (max_seq_len - len(srcs[i])))[:max_seq_len], dtype=torch.long, device=device)
        tgt_vectors[i] = torch.tensor(([BOS_IDX] + vocab[TGT_LANGUAGE](tgts[i]) + [EOS_IDX] + [0] * (max_seq_len - len(tgts[i])))[:max_seq_len], dtype=torch.long, device=device)

    return src_vectors, tgt_vectors

In [30]:
train_dataloader = DataLoader(train_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn, vocab=vocab))
val_dataloader = DataLoader(val_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn, vocab=vocab))
test_dataloader = DataLoader(test_ds, batch_size=64, shuffle=True, collate_fn=partial(collate_fn, vocab=vocab))

# train

In [31]:
src_vocab_size = 10000
tgt_vocab_size = 10000
d_model = 512
num_heads = 4
num_layers = 6
d_ff = 2048
max_seq_length = 100
dropout = 0.1

transformer = Transformer(src_vocab_size, tgt_vocab_size, d_model, num_heads, num_layers, d_ff, max_seq_length, dropout).to(device)

criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(transformer.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [None]:
num_epochs = 5

transformer.train()

for epoch in range(num_epochs):
    print(f"Epoch: {epoch+1}\n------------------------------")
    transformer.train()
    for data in train_dataloader:
        src_data, tgt_data = data
        optimizer.zero_grad()
        output = transformer(src_data, tgt_data[:, :-1])
        loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
        loss.backward()
        optimizer.step()
        print(f"Epoch: {epoch+1}, Training Loss: {loss.item()}")

    transformer.eval()
    with torch.no_grad():
        for data in val_dataloader:
            src_data, tgt_data = data
            output = transformer(src_data, tgt_data[:, :-1])
            loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
            print(f"Epoch: {epoch+1}, Validation Loss: {loss.item()}")

    torch.save(transformer.state_dict(), f'./transformer_state_dict_epoch_{epoch+1}')

In [None]:
transformer.eval()
with torch.no_grad():
    for data in val_dataloader:
        src_data, tgt_data = data
        output = transformer(src_data, tgt_data[:, :-1])
        loss = criterion(output.contiguous().view(-1, tgt_vocab_size), tgt_data[:, 1:].contiguous().view(-1))
        print(f"Test Loss: {loss.item()}")

In [34]:
transformer.eval()


Transformer(
  (encoder_embedding): Embedding(10000, 512)
  (decoder_embedding): Embedding(10000, 512)
  (positional_encoding): PositionalEncoding()
  (encoder_layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (W_q): Linear(in_features=512, out_features=512, bias=True)
        (W_k): Linear(in_features=512, out_features=512, bias=True)
        (W_v): Linear(in_features=512, out_features=512, bias=True)
        (W_o): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): PositionWiseFeedForward(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (decoder_layers): ModuleList(
    (0-5): 6 x DecoderLayer(

In [35]:
def translate(src):
    src_tokens = tokenizer[SRC_LANGUAGE](src)
    tgt_tokens = ["<BOS>"]

    src_vectors = torch.tensor(([BOS_IDX] + vocab[SRC_LANGUAGE](src_tokens) + [EOS_IDX] + [0] * (max_seq_len - len(src_tokens)))[:max_seq_len], dtype=torch.long, device=device).unsqueeze(0)

    for i in range(max_seq_len):
        tgt_vectors = torch.tensor((vocab[TGT_LANGUAGE](tgt_tokens) + [0] * (max_seq_len - len(tgt_tokens)))[:max_seq_len], dtype=torch.long, device=device).unsqueeze(0)
        output = transformer(src_vectors, tgt_vectors)
        idx = torch.argmax(nn.functional.softmax(output, dim=2)[0][i]).item()
        tgt_tokens.append(vocab[TGT_LANGUAGE].lookup_token(idx))

        if idx == EOS_IDX:
            break

    return " ".join(tgt_tokens).replace("<BOS>", "").replace("<EOS>", "").replace("<PAD>", "").strip()

In [36]:
translate("i love you")

'tôi yêu quý vị'

In [39]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
smoother = SmoothingFunction()
def compute_bleu_score(transformer, src, ref_tgt, tokenizer, vocab, max_seq_len, device):
    generated_translation = translate(src)

    ref_tokens = tokenizer[TGT_LANGUAGE](ref_tgt)
    bleu_score = sentence_bleu([ref_tokens], generated_translation.split())
    return bleu_score


# Calculate BLEU

In [40]:
def compute_bleu_for_dataset(transformer, lines_en_val, lines_vi_val, tokenizer, vocab, max_seq_len, device):
    total_bleu_score = 0.0
    num_examples = 500

    for i in range(num_examples):
        if i % 25 == 0:
            print(f'currently at sentence number {i} out of {num_examples}')
        src_sentence = lines_en_val[i]
        ref_sentence = lines_vi_val[i]
        ref_sentence = clean_text(ref_sentence)
#         print(f'src_sentence {i}: ', src_sentence)
#         print(f'ref_sentence {i}: ', ref_sentence)
        bleu_score = compute_bleu_score(transformer, src_sentence, ref_sentence, tokenizer, vocab, max_seq_len, device)
#         print(f'bleu_score {i}: ', bleu_score)
        total_bleu_score += bleu_score

    average_bleu_score = total_bleu_score * 100/ num_examples
#     print(total_bleu_score)
#     print(num_examples)
    return average_bleu_score

average_bleu = compute_bleu_for_dataset(transformer, lines_en_val, lines_vi_val, tokenizer, vocab, max_seq_len, device)
print(f"Average BLEU Score on Validation Dataset: {average_bleu}")

currently at sentence number 0 out of 500


Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


currently at sentence number 25 out of 500
currently at sentence number 50 out of 500
currently at sentence number 75 out of 500
currently at sentence number 100 out of 500
currently at sentence number 125 out of 500
currently at sentence number 150 out of 500
currently at sentence number 175 out of 500
currently at sentence number 200 out of 500
currently at sentence number 225 out of 500
currently at sentence number 250 out of 500
currently at sentence number 275 out of 500
currently at sentence number 300 out of 500
currently at sentence number 325 out of 500
currently at sentence number 350 out of 500
currently at sentence number 375 out of 500
currently at sentence number 400 out of 500
currently at sentence number 425 out of 500
currently at sentence number 450 out of 500
currently at sentence number 475 out of 500
Average BLEU Score on Validation Dataset: 31.697098993675464


# Average BLEU Score on Validation Dataset: 31.697098993675464

In [51]:
torch.save(transformer, 'transformer_translation.pth')

# Inference 

## config the path to model .pth file

In [None]:
# change this path
transformer = torch.load('/kaggle/working/transformer_translation.pth')
transformer.eval()  

# Fine-tune

## config the path to model .pth file

In [44]:
from transformers import AutoTokenizer
import pandas as pd
import transformers
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForSeq2SeqLM


from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
model_checkpoint = "Helsinki-NLP/opus-mt-en-vi"
model_finetune = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer_finetune = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

# change this path
model_finetune.load_state_dict(torch.load('/kaggle/input/fine_tune/pytorch/default/1/6_8_24_model_finetune_translation.pth'))
model_finetune.eval()

MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(53685, 512, padding_idx=53684)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(53685, 512, padding_idx=53684)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [48]:
def translate_finetune_model(input_text):
    # input_text = "My mom once told me that as long as we put our mind into our work, we will succeed"
    inputs = tokenizer_finetune(input_text, return_tensors="pt")
    outputs = model_finetune.generate(
        inputs["input_ids"],
        max_length=50,  # Adjust max_length as needed
        num_beams=5,    # Adjust num_beams for beam search
        early_stopping=True
    )
    translated_text = tokenizer_finetune.decode(outputs[0], skip_special_tokens=True)
    
    return translated_text

In [49]:
def translate_from_scratch_model(src):
    return translate(src)

In [None]:
!pip install gradio

In [None]:
import gradio as gr

# Example sentences used for translation
example_sentences = [
    "What is the weather like today?",
    "Hello everyone, this is our school, we are happy to have you here.",
    "How was your first day of school?",
]
translation_history = []

def translate_text(input_text):
    # Placeholder function to simulate translation
    translated_text_model_1 = translate_from_scratch_model(input_text)
    translated_text_model_2 = translate_finetune_model(input_text)

    # Save translation history
    translation_history.append({
        "original_text": input_text,
        "model_1_translation": translated_text_model_1,
        "model_2_translation": translated_text_model_2,
    })

    # Format history for display
    formatted_history = [
        [
            entry["original_text"],
            entry["model_1_translation"],
            entry["model_2_translation"]
        ] for entry in translation_history
    ]

    return translated_text_model_1, translated_text_model_2, formatted_history

def interface():
    with gr.Blocks() as demo:
        gr.Markdown("## Translation Interface")

        with gr.Row():
            with gr.Column():
                text_input = gr.Textbox(label="Enter text to translate")
                example_sentences_box = gr.Textbox(label="Example Sentences", value="\n".join(example_sentences), interactive=False)
                translate_button = gr.Button("Translate")
            with gr.Column():
                model_1_output = gr.Textbox(label="Transformer from scratch Translation", interactive=False)
                model_2_output = gr.Textbox(label="Finetune Translation", interactive=False)
                history_output = gr.Dataframe(headers=["History translation", "Transformer from scratch Translation", "Finetune Translation"], row_count=5)

        translate_button.click(translate_text, inputs=text_input, outputs=[model_1_output, model_2_output, history_output])

    return demo

if __name__ == "__main__":
    demo = interface()
    demo.launch()
