In [1]:
!pip install transformers torch
!pip install -U datasets huggingface_hub fsspec
!pip install datasets

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [2]:

!pip install transformers datasets evaluate sacrebleu



In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "facebook/nllb-200-distilled-600M"

tokenizer = AutoTokenizer.from_pretrained(model_name, src_lang="eng_Latn", tgt_lang="nso_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:




from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer

model2 = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
tokenizer2 = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="ns")

In [5]:
import huggingface_hub
huggingface_hub.login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

ACCESS TOKEN : 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

In [6]:
from datasets import load_dataset

# Load FLORES+ development splits
eng = load_dataset("openlanguagedata/flores_plus", "eng_Latn", split="dev")
nso_d = load_dataset("openlanguagedata/flores_plus", "nso_Latn", split="dev")

# Extract aligned English-Sepedi pairs (first N)
N = 5
src_texts = [eng[i]["text"] for i in range(N)]
ref_texts = [nso_d[i]["text"] for i in range(N)]

Resolving data files:   0%|          | 0/219 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/219 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/213 [00:00<?, ?it/s]

In [7]:
from sacrebleu import corpus_bleu, corpus_chrf
import difflib

# Translate with M2M100
m2m_pred_texts = []

# Set the source language for the tokenizer
tokenizer2.src_lang = "en"

# Use correct target language code for Sepedi
target_lang = "ns"

for text in src_texts:
    inputs = tokenizer2(text, return_tensors="pt")
    generated_tokens = model2.generate(
        **inputs,
        forced_bos_token_id=tokenizer2.get_lang_id(target_lang)
    )
    translation = tokenizer2.decode(generated_tokens[0], skip_special_tokens=True)
    m2m_pred_texts.append(translation)


# Compute BLEU and chrF
references = [[ref] for ref in ref_texts]
m2m_bleu = corpus_bleu(m2m_pred_texts, references)
m2m_chrf = corpus_chrf(m2m_pred_texts, references)



In [8]:
output_path = "m2m100_eval_results.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write("M2M100 Translation Evaluation (English → Sepedi)\n")
    f.write("=" * 50 + "\n\n")

    worst_examples = []

    for i, (src, ref, pred) in enumerate(zip(src_texts, ref_texts, m2m_pred_texts)):
        similarity = difflib.SequenceMatcher(None, ref, pred).ratio()

        if similarity < 0.5:
            worst_examples.append((i+1, src, ref, pred, similarity))

        f.write(f"Example {i+1}:\n")
        f.write(f"English         : {src}\n")
        f.write(f"Reference Sepedi: {ref}\n")
        f.write(f"Predicted Sepedi: {pred}\n")
        f.write(f"Similarity Score: {similarity:.2f}\n")
        f.write("-" * 50 + "\n")

    # Write metrics
    f.write("\nEvaluation Scores\n")
    f.write("=" * 50 + "\n")
    f.write(f"BLEU score: {m2m_bleu.score:.2f}\n")
    f.write(f"chrF score: {m2m_chrf.score:.2f}\n")

    # Error Analysis
    f.write("\nError Analysis\n")
    f.write("=" * 50 + "\n")

    f.write(f"\n Worst 5 Translations (Low Similarity < 0.5)\n")
    for ex in sorted(worst_examples, key=lambda x: x[-1])[:5]:
        idx, src, ref, pred, score = ex
        f.write(f"\nExample {idx} | Similarity: {score:.2f}\n")
        f.write(f"EN  : {src}\n")
        f.write(f"REF : {ref}\n")
        f.write(f"PRED: {pred}\n")

    # Sentence length stats
    lengths = [(i+1, src, ref, pred, len(src.split()), len(ref.split())) for i, (src, ref, pred) in enumerate(zip(src_texts, ref_texts, m2m_pred_texts))]

    longest = max(lengths, key=lambda x: x[4])
    shortest = min(lengths, key=lambda x: x[4])

    f.write("\n Longest English Sentence\n")
    f.write(f"Example {longest[0]} | Length: {longest[4]} words\nEN: {longest[1]}\n")

    f.write("\n Shortest English Sentence\n")
    f.write(f"Example {shortest[0]} | Length: {shortest[4]} words\nEN: {shortest[1]}\n")


In [9]:
pred_texts = []

for text in src_texts:
    inputs = tokenizer(text, return_tensors="pt")
    generated_tokens = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids("nso_Latn")
    )
    translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
    pred_texts.append(translation)


In [10]:
import evaluate
from sacrebleu import corpus_bleu, corpus_chrf

# Wrap reference texts in list-of-lists for SacreBLEU format
references = [[ref] for ref in ref_texts]

bleu = corpus_bleu(pred_texts, references)
chrf = corpus_chrf(pred_texts, references)

print(f"BLEU score: {bleu.score:.2f}")
print(f"chrF score: {chrf.score:.2f}")


BLEU score: 11.85
chrF score: 52.06


In [11]:
# Save translations and metrics to a text file
output_path = "nllb_eval_results.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write("NLLB Translation Evaluation (English → Sepedi)\n")
    f.write("=" * 50 + "\n\n")

    for i, (src, ref, pred) in enumerate(zip(src_texts, ref_texts, pred_texts)):
        f.write(f"Example {i+1}:\n")
        f.write(f"English        : {src}\n")
        f.write(f"Reference Sepedi: {ref}\n")
        f.write(f"Predicted Sepedi: {pred}\n")
        f.write("-" * 50 + "\n")

    # Save BLEU and chrF scores at the bottom
    f.write("\nEvaluation Scores\n")
    f.write("=" * 50 + "\n")
    f.write(f"BLEU score: {bleu.score:.2f}\n")
    f.write(f"chrF score: {chrf.score:.2f}\n")


In [12]:
import difflib

output_path = "nllb_eval_results.txt"

with open(output_path, "w", encoding="utf-8") as f:
    f.write("NLLB Translation Evaluation (English → Sepedi)\n")
    f.write("=" * 50 + "\n\n")

    worst_examples = []  # Store poor translations

    for i, (src, ref, pred) in enumerate(zip(src_texts, ref_texts, pred_texts)):
        # Calculate similarity ratio (0–1)
        similarity = difflib.SequenceMatcher(None, ref, pred).ratio()

        # Save worst examples for error analysis
        if similarity < 0.5:
            worst_examples.append((i+1, src, ref, pred, similarity))

        f.write(f"Example {i+1}:\n")
        f.write(f"English         : {src}\n")
        f.write(f"Reference Sepedi: {ref}\n")
        f.write(f"Predicted Sepedi: {pred}\n")
        f.write(f"Similarity Score: {similarity:.2f}\n")
        f.write("-" * 50 + "\n")

    # Evaluation scores
    f.write("\nEvaluation Scores\n")
    f.write("=" * 50 + "\n")
    f.write(f"BLEU score: {bleu.score:.2f}\n")
    f.write(f"chrF score: {chrf.score:.2f}\n")

    # Error Analysis
    f.write("\nError Analysis\n")
    f.write("=" * 50 + "\n")

    # Show top 5 lowest similarity examples
    f.write(f"\n Worst 5 Translations (Low Similarity < 0.5)\n")
    for ex in sorted(worst_examples, key=lambda x: x[-1])[:5]:
        idx, src, ref, pred, score = ex
        f.write(f"\nExample {idx} | Similarity: {score:.2f}\n")
        f.write(f"EN  : {src}\n")
        f.write(f"REF : {ref}\n")
        f.write(f"PRED: {pred}\n")

    # Sentence length diagnostics
    lengths = [(i+1, src, ref, pred, len(src.split()), len(ref.split())) for i, (src, ref, pred) in enumerate(zip(src_texts, ref_texts, pred_texts))]

    longest = max(lengths, key=lambda x: x[4])
    shortest = min(lengths, key=lambda x: x[4])

    f.write("\n Longest English Sentence\n")
    f.write(f"Example {longest[0]} | Length: {longest[4]} words\nEN: {longest[1]}\n")

    f.write("\n Shortest English Sentence\n")
    f.write(f"Example {shortest[0]} | Length: {shortest[4]} words\nEN: {shortest[1]}\n")
