<a href="https://colab.research.google.com/github/ShahinGanbar/ENG-AZE_content_aware_translator/blob/main/ENG_AZE_context_aware_translator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
!nvidia-smi  # Check GPU
!pip install transformers datasets pandas tokenizers sentencepiece sacrebleu


Sun Jul 20 19:06:20 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   71C    P0             29W /   70W |    3046MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [13]:
!pip install huggingface_hub




In [19]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGrained).
The token `az_corpus` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `az_corpu

In [23]:
from google.colab import files
uploaded = files.upload()


Saving train.jsonl to train.jsonl


In [25]:
import json

filename = list(uploaded.keys())[0]  # get the uploaded filename
data = []

with open(filename, "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

print(data[0])  # print first example


{'id': 0, 'translation': {'en': 'Good morning, ladies and gentlemen!', 'aze': 'Sabahınız xeyir, xanımlar vә cәnablar!'}}


In [26]:
english_sentences = []
azerbaijani_sentences = []

for example in data:
    english_sentences.append(example['translation']['en'])
    azerbaijani_sentences.append(example['translation']['aze'])

# quick check
print(english_sentences[:3])
print(azerbaijani_sentences[:3])


['Good morning, ladies and gentlemen!', 'I give you my word.', 'Good morning.']
['Sabahınız xeyir, xanımlar vә cәnablar!', 'Sizә söz verirәm.', 'Gün aydın!']


In [27]:
import pandas as pd

df = pd.DataFrame({
    "en": english_sentences,
    "aze": azerbaijani_sentences
})

print(df.head())


                                    en                                     aze
0  Good morning, ladies and gentlemen!  Sabahınız xeyir, xanımlar vә cәnablar!
1                  I give you my word.                       Sizә söz verirәm.
2                        Good morning.                              Gün aydın!
3                        Which is new?                         Nə var, nə yox?
4                                Yeah.                                     Hə.


In [28]:
with open("train_en.txt", "w", encoding="utf-8") as f_en, open("train_az.txt", "w", encoding="utf-8") as f_az:
    for en_sent, az_sent in zip(english_sentences, azerbaijani_sentences):
        f_en.write(en_sent + "\n")
        f_az.write(az_sent + "\n")


In [30]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# Initialize a tokenizer with BPE model
tokenizer = Tokenizer(models.BPE())

# Setup trainer with special tokens
trainer = trainers.BpeTrainer(
    vocab_size=30_000,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
)

# Set whitespace pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Files to train tokenizer on
files = ["train_en.txt", "train_az.txt"]

# Train tokenizer
tokenizer.train(files, trainer)

# Save tokenizer
tokenizer.save("bpe_tokenizer.json")


In [31]:
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_file("bpe_tokenizer.json")

# Encode some text
encoded = tokenizer.encode("Good morning, ladies and gentlemen!")
print("Tokens:", encoded.tokens)
print("IDs:", encoded.ids)


Tokens: ['Good', 'morning', ',', 'ladies', 'and', 'gentlemen', '!']
IDs: [1930, 1545, 10, 10195, 213, 14165, 5]


In [32]:
inputs = [tokenizer.encode(text).ids for text in english_sentences]
targets = [tokenizer.encode(text).ids for text in azerbaijani_sentences]

print(inputs[0])   # token IDs of first English sentence
print(targets[0])  # token IDs of first Azerbaijani sentence


[1930, 1545, 10, 10195, 213, 14165, 5]
[5259, 3584, 10, 14539, 9184, 16555, 5]


In [33]:
from transformers import MarianTokenizer, MarianMTModel
from torch.utils.data import DataLoader, Dataset
import torch

model_name = "Helsinki-NLP/opus-mt-en-az"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

class TranslationDataset(Dataset):
    def __init__(self, src_texts, tgt_texts):
        self.src_texts = src_texts
        self.tgt_texts = tgt_texts

    def __len__(self):
        return len(self.src_texts)

    def __getitem__(self, idx):
        src = tokenizer(self.src_texts[idx], return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        tgt = tokenizer(self.tgt_texts[idx], return_tensors="pt", padding="max_length", truncation=True, max_length=128)
        return {
            "input_ids": src.input_ids.squeeze(),
            "attention_mask": src.attention_mask.squeeze(),
            "labels": tgt.input_ids.squeeze()
        }

dataset = TranslationDataset(english_sentences, azerbaijani_sentences)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# You can then train the model with your usual PyTorch training loop


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/451k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/470k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/226M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/226M [00:00<?, ?B/s]

In [38]:
from torch.optim import AdamW
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
epochs = 3

for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    total_loss = 0

    for batch in tqdm(dataloader):
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Ignore [PAD] tokens in loss calculation
        labels[labels == tokenizer.pad_token_id] = -100

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Average loss: {total_loss / len(dataloader):.4f}")



Epoch 1/3


100%|██████████| 323/323 [01:17<00:00,  4.15it/s]


Average loss: 2.8738

Epoch 2/3


100%|██████████| 323/323 [01:16<00:00,  4.21it/s]


Average loss: 1.8434

Epoch 3/3


100%|██████████| 323/323 [01:21<00:00,  3.95it/s]

Average loss: 1.3945





In [42]:
model.eval()

def translate(text):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128).to(device)
    translated = model.generate(**inputs, max_length=128)
    return tokenizer.decode(translated[0], skip_special_tokens=True)

print(translate("She turned it down"))


O , kömək etdi


In [41]:
import sacrebleu

# Example: Evaluate on first 100 sentences
preds = [translate(s) for s in english_sentences[:100]]
refs = [[t] for t in azerbaijani_sentences[:100]]

bleu = sacrebleu.corpus_bleu(preds, refs)
print("BLEU score:", bleu.score)


BLEU score: 13.134549472120788
