#  Text Summarization Using MBart-large-5

In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="fr_XX")

def resumer_texte(texte):
    inputs = tokenizer(texte, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

### Model Setup and Summary Function

In [6]:
import torch

print("=" * 70)
print("mBART-50 SUMMARY MODEL EVALUATION")
print("=" * 70)

# Move model to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
model.eval()

# ============================================================================
# SUMMARY FUNCTION
# ============================================================================

def summarize(text, language="fr_XX", max_length=150):
    """Generate a summary of the given text."""
    tokenizer.src_lang = language

    inputs = tokenizer(
        text,
        return_tensors="pt",
        max_length=1024,
        truncation=True
    ).to(device)

    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            max_length=max_length,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            no_repeat_ngram_size=3,
            early_stopping=True,
            decoder_start_token_id=tokenizer.lang_code_to_id[language]
        )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


mBART-50 SUMMARY MODEL EVALUATION


### Test Data and Evaluation

In [7]:
# ============================================================================
# TEST DATA
# ============================================================================

tests = [
    {
        "language": "fr_XX",
        "name": "French",
        "text": """Le Bénin est un pays d'Afrique de l'Ouest situé entre le Togo et le Nigeria...""",
        "expected_summary": "Le Bénin est un pays d'Afrique de l'Ouest avec environ 13 millions d'habitants..."
    },
    {
        "language": "en_XX",
        "name": "English",
        "text": """Benin is a country in West Africa located between Togo and Nigeria...""",
        "expected_summary": "Benin is a West African country with 13 million people..."
    }
]

# ============================================================================
# EVALUATION
# ============================================================================

print("\n SUMMARY TESTS")
print("=" * 70)

results = []

for i, test in enumerate(tests, 1):
    print(f"\n{i}. Test in {test['name']} ({test['language']})")
    print("-" * 70)

    # Generate summary
    print("⏳ Generating summary...")
    summary = summarize(test['text'], language=test['language'], max_length=100)

    # Compute statistics
    original_words = len(test['text'].split())
    summary_words = len(summary.split())
    compression_rate = (summary_words / original_words) * 100

    # Basic quality check
    meaningful = len(summary) > 20
    no_repetition = "Le Bénin Le Bénin" not in summary and "Benin Benin" not in summary
    proper_length = 30 <= summary_words <= 120
    quality = "GOOD" if (meaningful and no_repetition and proper_length) else "ISSUE"

    # Store results
    results.append({
        "language": test['name'],
        "original_words": original_words,
        "summary_words": summary_words,
        "compression_rate": compression_rate,
        "quality": quality,
        "summary": summary
    })

    # Print summary
    print(f"Original: {original_words} words")
    print(f"Summary:  {summary_words} words ({compression_rate:.1f}%)")
    print(f"Quality:  {quality}")
    print(f"Generated summary:\n{summary}")



 SUMMARY TESTS

1. Test in French (fr_XX)
----------------------------------------------------------------------
⏳ Generating summary...
Original: 15 words
Summary:  22 words (146.7%)
Quality:  ISSUE
Generated summary:
Le Bénin est un pays d'Afrique de l'Ouest situé entre le Togo et le Nigeria... ... Lire la suite... ... le Bénin.com.au-delà...

2. Test in English (en_XX)
----------------------------------------------------------------------
⏳ Generating summary...
Original: 12 words
Summary:  17 words (141.7%)
Quality:  ISSUE
Generated summary:
Benin is a country in West Africa located between Togo and Nigeria... Read More... Read Less... Benin.com.au....Benin.gov.uk....


### Global Results and Saving

In [8]:
# ============================================================================
# GLOBAL SUMMARY
# ============================================================================

print("\n" + "=" * 70)
print("GLOBAL RESULTS")
print("=" * 70)

good_summaries = sum(1 for r in results if r['quality'] == "GOOD")
total = len(results)
score = (good_summaries / total) * 100

avg_compression = sum(r['compression_rate'] for r in results) / total

print(f"Global score: {good_summaries}/{total} ({score:.0f}%)")
print(f"Average compression rate: {avg_compression:.1f}%")

# Results per language
for r in results:
    print(f"{r['language']:10s}: {r['quality']} - {r['summary_words']} words")

# Save results to a text file
with open("evaluation_mbart_summary.txt", "w", encoding="utf-8") as f:
    f.write("mBART-50 SUMMARY MODEL EVALUATION\n")
    f.write("=" * 70 + "\n\n")
    f.write(f"Global score: {score:.0f}%\n")
    f.write(f"Average compression rate: {avg_compression:.1f}%\n\n")
    for i, r in enumerate(results, 1):
        f.write(f"\n{i}. {r['language']}\n")
        f.write(f"Quality: {r['quality']}\n")
        f.write(f"Original: {r['original_words']} words\n")
        f.write(f"Summary:  {r['summary_words']} words\n")
        f.write(f"Summary text: {r['summary']}\n")

print("Results saved in: evaluation_mbart_summary.txt")



GLOBAL RESULTS
Global score: 0/2 (0%)
Average compression rate: 144.2%
French    : ISSUE - 22 words
English   : ISSUE - 17 words
Results saved in: evaluation_mbart_summary.txt
