<a href="https://colab.research.google.com/github/Safae26/text-summarization/blob/main/notebooks/gpt2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==============================================
# FINE-TUNING GPT-2 SUR CNN/DAILYMAIL (5000 exemples)
# ==============================================

print("="*60)
print("FINE-TUNING GPT-2 (124M) - COMME L'ARTICLE")
print("="*60)

# ==============================================
# 1. INSTALLATIONS
# ==============================================

!pip install transformers datasets accelerate rouge-score -q

import torch
import numpy as np
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import gc
import os

# Memory cleanup: run the garbage collector, then (only when a CUDA device is
# visible) ask PyTorch to release its cached GPU memory.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # NOTE(review): this allocator option is written after torch has been
    # imported and after the CUDA calls above — confirm it is still picked up
    # when set this late.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

print("‚úÖ Biblioth√®ques install√©es")

# ==============================================
# 2. DATASET (5000 train, 1000 val, 1000 test)
# ==============================================

print("\n" + "=" * 60, "üìä CHARGEMENT DU DATASET", "=" * 60, sep="\n")

dataset = load_dataset("cnn_dailymail", "3.0.0")

# Keep only the leading rows of each split, using the same sizes as the article:
# 5000 for training, 1000 for validation, 1000 for testing.
train_dataset, val_dataset, test_dataset = (
    dataset[split_name].select(range(row_count))
    for split_name, row_count in (("train", 5000), ("validation", 1000), ("test", 1000))
)

# Report the size of each subset.
print(
    f"‚úÖ Dataset pr√™t:",
    f"  Training:   {len(train_dataset)} exemples",
    f"  Validation: {len(val_dataset)} exemples",
    f"  Test:       {len(test_dataset)} exemples",
    sep="\n",
)

# ==============================================
# 3. GPT-2 TOKENIZATION
# ==============================================

print("\n" + "=" * 60, "üî§ TOKENISATION GPT-2", "=" * 60, sep="\n")

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Padding is needed below (padding="max_length"); the end-of-text token is
# reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def preprocess_function(examples, max_length=512, max_article_chars=800):
    """Tokenise a batch of examples for causal-LM fine-tuning.

    Each example becomes one sequence of the form
    ``ARTICLE: <article> <blank line> SUMMARY: <summary><eos>``.

    Args:
        examples: Batch dict with parallel ``"article"`` and ``"highlights"``
            lists of strings (as passed by ``Dataset.map(batched=True)``).
        max_length: Token length every sequence is truncated/padded to.
        max_article_chars: Number of leading article characters kept. The
            default matches the prompt built in ``generate_summary_gpt2`` so
            training and inference see the same format, and it leaves room in
            the token window for the summary (a full article placed first
            would usually push the summary past the truncation point).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry: a copy of ``input_ids`` where padding positions are
        replaced by ``-100`` so they are ignored by the loss.
    """
    # The end-of-text marker is appended so the model learns where a summary
    # stops; the padding that follows is masked out of the loss below.
    eos = tokenizer.eos_token
    texts = [
        f"ARTICLE: {article[:max_article_chars]}\n\nSUMMARY: {highlight}{eos}"
        for article, highlight in zip(examples["article"], examples["highlights"])
    ]

    tokenized = tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors=None,
    )

    # The pad token is the eos token, so a plain copy of input_ids would train
    # the model on hundreds of padding positions per example. -100 is the
    # index ignored by the cross-entropy loss.
    tokenized["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

print("Tokenisation en cours...")

# Apply the same preprocessing to the training split, then the validation
# split; the raw text columns are dropped so only model inputs remain.
tokenized_train, tokenized_val = (
    split.map(
        preprocess_function,
        batched=True,
        batch_size=8,
        remove_columns=split.column_names,
        desc=progress_label,
    )
    for split, progress_label in (
        (train_dataset, "Tokenisation training"),
        (val_dataset, "Tokenisation validation"),
    )
)

print("‚úÖ Tokenisation termin√©e")

# ==============================================
# 4. GPT-2 MODEL (124M parameters)
# ==============================================

print("\n" + "=" * 60, "üß† CHARGEMENT DE GPT-2 (124M)", "=" * 60, sep="\n")

model = GPT2LMHeadModel.from_pretrained("gpt2")

# Tell the model which token id is used for padding.
model.config.pad_token_id = tokenizer.pad_token_id

# Total number of elements across all parameter tensors.
total_params = sum(map(torch.numel, model.parameters()))
print(
    f"‚úÖ GPT-2 charg√©",
    f"üìä Param√®tres: {total_params/1e6:.1f}M",
    f"üìä Device: {model.device}",
    sep="\n",
)

# ==============================================
# 5. FINE-TUNING CONFIGURATION
# ==============================================

print("\n" + "="*60)
print("‚öôÔ∏è  CONFIGURATION DU FINE-TUNING")
print("="*60)

# Hyperparameters for the Trainer. Evaluation and checkpointing share the same
# 500-step cadence, which load_best_model_at_end requires.
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-5000",
    overwrite_output_dir=True,
    num_train_epochs=5,  # same as the article
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # effective batch = 4 x 4 = 16 per device
    learning_rate=1e-5,  # low, as in the article
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir="./logs-gpt2",
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,  # keep only the two most recent checkpoints on disk
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # lower eval_loss is better
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments raise on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    report_to="none",
    dataloader_pin_memory=False,
)

print("‚úÖ Configuration d√©finie:")
print(f"  ‚Ä¢ Epochs: 5 (comme l'article)")
print(f"  ‚Ä¢ Batch size: 4")
print(f"  ‚Ä¢ Learning rate: 1e-5")

# ==============================================
# 6. GPT-2 FINE-TUNING
# ==============================================

print("\n" + "="*60)
print("üî• D√âBUT DU FINE-TUNING GPT-2")
print("="*60)
print("‚ö†Ô∏è  Cette √©tape prend 1-2 heures")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# Only an out-of-memory failure is worth retrying with a smaller batch; any
# other error (bad config, data problem, ...) is re-raised unchanged instead
# of silently triggering a second full training run.
oom_message = None
try:
    train_result = trainer.train()
except RuntimeError as e:
    if "out of memory" not in str(e).lower():
        raise
    # Just record the failure here: the retry happens after the handler exits
    # so the traceback no longer keeps the failed step's tensors alive.
    oom_message = str(e)

if oom_message is None:
    print(f"\n‚úÖ FINE-TUNING R√âUSSI !")
else:
    print(f"\n‚ùå ERREUR: {oom_message}")
    print("\nüîÑ Tentative avec batch_size=2...")

    # Drop the failed trainer and return cached GPU memory before retrying.
    del trainer
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Halve the per-device batch and double accumulation: the effective batch
    # size stays at 16.
    training_args.per_device_train_batch_size = 2
    training_args.per_device_eval_batch_size = 2
    training_args.gradient_accumulation_steps = 8

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        tokenizer=tokenizer,
    )

    train_result = trainer.train()
    print(f"\n‚úÖ FINE-TUNING R√âUSSI avec batch_size=2")

# Report the run's metrics whichever path succeeded.
print(f"‚è±Ô∏è  Temps: {train_result.metrics['train_runtime']/60:.1f} min")
print(f"üìâ Training loss: {train_result.metrics['train_loss']:.3f}")

# ==============================================
# 7. SAVING THE FINE-TUNED MODEL
# ==============================================

print("\n" + "=" * 60, "üíæ SAUVEGARDE DU MOD√àLE GPT-2 FINE-TUN√â", "=" * 60, sep="\n")

# Model weights and tokenizer files are written to the same directory.
model_save_path = "./gpt2_finetuned_5000"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"‚úÖ Mod√®le GPT-2 fine-tun√© sauvegard√© dans: {model_save_path}")

# ==============================================
# 8. √âVALUATION SUR TEST SET (1000 exemples)
# ==============================================

print("\n" + "="*60)
print("üìä √âVALUATION ROUGE SUR TEST SET")
print("="*60)

!pip install rouge-score -q
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)

# Fonction de g√©n√©ration pour GPT-2
def generate_summary_gpt2(text):
    """Generate a summary of ``text`` with the fine-tuned GPT-2 model.

    The prompt is ``ARTICLE: <first 800 characters of text>`` followed by a
    blank line and ``SUMMARY:``; the model continues it for 30 to 100 new
    tokens using beam-sampling.

    Args:
        text: The article to summarise.

    Returns:
        The generated continuation only (prompt excluded), stripped of
        surrounding whitespace.
    """
    prompt = f"ARTICLE: {text[:800]}\n\nSUMMARY:"

    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    prompt_length = inputs["input_ids"].shape[1]

    # Training leaves the model in train mode; switch to eval mode so dropout
    # does not perturb generation.
    model.eval()
    with torch.no_grad():
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=100,  # bounds the continuation, not prompt + continuation
            min_new_tokens=30,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            num_beams=2,
            early_stopping=True,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt tokens. Slicing them off is
    # more reliable than splitting the decoded text on "SUMMARY:", which goes
    # wrong when the article or the generation itself contains that marker.
    new_tokens = summary_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

# Evaluate on every example of the selected test subset. The count is taken
# from the dataset so that changing the subset size cannot desynchronise the
# loop bounds and the progress display.
num_eval_examples = len(test_dataset)
print(f"√âvaluation sur {num_eval_examples} exemples du test set...")

# Per-example F-measures, one list per ROUGE variant.
gpt2_rouge1 = []
gpt2_rouge2 = []
gpt2_rougeL = []
gpt2_rougeLsum = []

import time

# Generation samples tokens; a fixed seed makes the reported scores repeatable.
torch.manual_seed(42)
start_time = time.time()

for i, example in enumerate(test_dataset):
    generated = generate_summary_gpt2(example["article"])
    # RougeScorer.score takes (target, prediction) in that order.
    scores = scorer.score(example["highlights"], generated)

    gpt2_rouge1.append(scores['rouge1'].fmeasure)
    gpt2_rouge2.append(scores['rouge2'].fmeasure)
    gpt2_rougeL.append(scores['rougeL'].fmeasure)
    gpt2_rougeLsum.append(scores['rougeLsum'].fmeasure)

    # Progress line with the running ROUGE-1 mean every 100 examples.
    if (i + 1) % 100 == 0:
        progress = (i + 1) / num_eval_examples * 100
        current_rouge1 = np.mean(gpt2_rouge1) * 100
        print(f"  {i+1}/{num_eval_examples} ({progress:.0f}%) - ROUGE-1: {current_rouge1:.1f}%")

eval_time = time.time() - start_time

# ==============================================
# 9. ROUGE RESULTS (same metrics as the article)
# ==============================================

print("\n" + "=" * 60, "üìà R√âSULTATS ROUGE - GPT-2 FINE-TUN√â", "=" * 60, sep="\n")

# Mean F-measure of each ROUGE variant, expressed as a percentage.
gpt2_r1, gpt2_r2, gpt2_rL, gpt2_rLsum = (
    np.mean(per_example_scores) * 100
    for per_example_scores in (gpt2_rouge1, gpt2_rouge2, gpt2_rougeL, gpt2_rougeLsum)
)

print(
    f"\nüéØ TES R√âSULTATS GPT-2 (1000 exemples):",
    f"  ROUGE-1:    {gpt2_r1:.2f}%",
    f"  ROUGE-2:    {gpt2_r2:.2f}%",
    f"  ROUGE-L:    {gpt2_rL:.2f}%",
    f"  ROUGE-Lsum: {gpt2_rLsum:.2f}%",
    sep="\n",
)

# Spread of the per-example ROUGE-1 scores and wall-clock evaluation time.
print(
    f"\nüìä STATISTIQUES:",
    f"  √âcart-type ROUGE-1: {np.std(gpt2_rouge1)*100:.2f}%",
    f"  Temps d'√©valuation: {eval_time/60:.1f} min",
    sep="\n",
)

# ==============================================
# 10. COMPARISON WITH THE ARTICLE
# ==============================================

print("\n" + "=" * 60, "üìä COMPARAISON AVEC L'ARTICLE (Table 3)", "=" * 60, sep="\n")

# Two-row table: the article's reported scores, then the scores measured here.
print(f"\n{'Mod√®le':<25} {'ROUGE-1':<10} {'ROUGE-2':<10} {'ROUGE-L':<10} {'ROUGE-Lsum':<10}")
print("-" * 65)
print(f"{'Article GPT-2':<25} {24.83:<10.2f} {16.92:<10.2f} {22.14:<10.2f} {21.07:<10.2f}")
print(f"{'Ton GPT-2 (5000 ex)':<25} {gpt2_r1:<10.2f} {gpt2_r2:<10.2f} {gpt2_rL:<10.2f} {gpt2_rLsum:<10.2f}")
print("-" * 65)

# Signed ROUGE-1 gap against the article's value, in percentage points.
difference_rouge1 = gpt2_r1 - 24.83
print(f"\nüìà Diff√©rence ROUGE-1: {difference_rouge1:+.2f}%")

# Verdict: better than the article, within 5 points below it, or further below.
print(
    "‚úÖ Ton mod√®le performe MIEUX que l'article !"
    if difference_rouge1 > 0
    else "üëç Performance proche de l'article (normal avec moins de donn√©es)"
    if difference_rouge1 > -5
    else "‚ö†Ô∏è  Performance inf√©rieure (normal: 5000 vs 287K exemples dans l'article)"
)

# ==============================================
# 11. SAVING THE RESULTS
# ==============================================

print("\n" + "="*60)
print("üíæ SAUVEGARDE DES R√âSULTATS GPT-2")
print("="*60)

import json
from datetime import datetime

# Create the results directory.
results_dir = "./gpt2_finetuned_results"
os.makedirs(results_dir, exist_ok=True)

# The training section is read from the objects actually used for the run
# rather than repeated as literals: the batch size in particular changes when
# training fell back to the smaller batch, and the record must reflect that.
results = {
    "model": "GPT-2 (124M) fine-tuned",
    "training": {
        "examples": len(train_dataset),
        "validation": len(val_dataset),
        "epochs": training_args.num_train_epochs,
        "learning_rate": training_args.learning_rate,
        "batch_size": training_args.per_device_train_batch_size,
    },
    "evaluation": {
        # Number of examples that were really scored.
        "test_examples": len(gpt2_rouge1),
        "rouge1": float(gpt2_r1),
        "rouge2": float(gpt2_r2),
        "rougeL": float(gpt2_rL),
        "rougeLsum": float(gpt2_rLsum),
        "std_rouge1": float(np.std(gpt2_rouge1) * 100),
    },
    "comparison_with_article": {
        "article_rouge1": 24.83,
        "article_rouge2": 16.92,
        "article_rougeL": 22.14,
        "article_rougeLsum": 21.07,
        "difference_rouge1": float(difference_rouge1),
    },
    "date": datetime.now().isoformat(),
}

# Explicit encoding so the file does not depend on the platform default.
with open(os.path.join(results_dir, "results.json"), "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print(f"‚úÖ R√©sultats sauvegard√©s dans: {results_dir}/results.json")

# ==============================================
# 12. GPT-2 vs BART COMPARISON
# ==============================================

print("\n" + "=" * 60, "üìä COMPARAISON GPT-2 vs BART", "=" * 60, sep="\n")

# The comparison is optional: it only runs when a BART results file exists.
bart_results_path = "./bart_finetuned_results/results.json"
if not os.path.exists(bart_results_path):
    print("‚ö†Ô∏è  R√©sultats BART non trouv√©s pour comparaison")
else:
    with open(bart_results_path, 'r') as f:
        bart_results = json.load(f)

    bart_rouge1 = bart_results["evaluation"]["rouge1"]

    # Side-by-side ROUGE-1 values and their signed gap (BART minus GPT-2).
    print(
        f"\nüéØ COMPARAISON DES PERFORMANCES:",
        f"  ‚Ä¢ GPT-2 ROUGE-1: {gpt2_r1:.2f}%",
        f"  ‚Ä¢ BART ROUGE-1:  {bart_rouge1:.2f}%",
        f"  ‚Ä¢ Diff√©rence:    {bart_rouge1 - gpt2_r1:+.2f}%",
        sep="\n",
    )

    print(
        f"\n‚úÖ BART est sup√©rieur √† GPT-2 (confirm√©)"
        if bart_rouge1 > gpt2_r1
        else f"\n‚ö†Ô∏è  R√©sultat inattendu"
    )

# ==============================================
# 13. DOWNLOAD
# ==============================================

print("\n" + "="*60)
print("üì¶ PR√âPARATION DU T√âL√âCHARGEMENT")
print("="*60)

import shutil

# Staging directory holding everything that goes into the archive.
final_dir = "./gpt2_project_final"
os.makedirs(final_dir, exist_ok=True)

# Copy the saved model.
shutil.copytree(model_save_path, os.path.join(final_dir, "model"), dirs_exist_ok=True)
# Copy the results file.
shutil.copy(os.path.join(results_dir, "results.json"), os.path.join(final_dir, "results.json"))

# Build <zip_name>.zip from the staging directory.
zip_name = "gpt2_finetuned_project"
shutil.make_archive(zip_name, 'zip', final_dir)

# The browser download only exists inside Google Colab. Elsewhere the archive
# is already on disk, so report its location instead of crashing at the very
# end of a long run.
try:
    from google.colab import files
except ImportError:
    print(f"Hors Google Colab : archive disponible localement dans {os.path.abspath(zip_name + '.zip')}")
else:
    files.download(f"{zip_name}.zip")

print(f"\n‚úÖ PROJET GPT-2 TERMIN√â !")
print(f"üì¶ Fichier: {zip_name}.zip")
print(f"üìä ROUGE-1: {gpt2_r1:.2f}%")
print(f"üìà Comparaison article: {difference_rouge1:+.2f}%")

FINE-TUNING GPT-2 (124M) - COMME L'ARTICLE
✅ Bibliothèques installées

📊 CHARGEMENT DU DATASET
✅ Dataset prêt:
  Training:   5000 exemples
  Validation: 1000 exemples
  Test:       1000 exemples

🔤 TOKENISATION GPT-2
Tokenisation en cours...


Tokenisation training:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenisation validation:   0%|          | 0/1000 [00:00<?, ? examples/s]

✅ Tokenisation terminée

🧠 CHARGEMENT DE GPT-2 (124M)
✅ GPT-2 chargé
📊 Paramètres: 124.4M
📊 Device: cpu

⚙️  CONFIGURATION DU FINE-TUNING
✅ Configuration définie:
  • Epochs: 5 (comme l'article)
  • Batch size: 4
  • Learning rate: 1e-5

🔥 DÉBUT DU FINE-TUNING GPT-2
⚠️  Cette étape prend 1-2 heures


  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss,Validation Loss
500,2.7864,2.756299
1000,2.7535,2.757755
1500,2.7361,2.75827


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].



✅ FINE-TUNING RÉUSSI !
⏱️  Temps: 31.6 min
📉 Training loss: 2.788

💾 SAUVEGARDE DU MODÈLE GPT-2 FINE-TUNÉ
✅ Modèle GPT-2 fine-tuné sauvegardé dans: ./gpt2_finetuned_5000

📊 ÉVALUATION ROUGE SUR TEST SET
Évaluation sur 1000 exemples du test set...
  100/1000 (10%) - ROUGE-1: 23.3%
  200/1000 (20%) - ROUGE-1: 23.2%
  300/1000 (30%) - ROUGE-1: 23.1%
  400/1000 (40%) - ROUGE-1: 23.4%
  500/1000 (50%) - ROUGE-1: 23.1%
  600/1000 (60%) - ROUGE-1: 23.0%
  700/1000 (70%) - ROUGE-1: 23.2%
  800/1000 (80%) - ROUGE-1: 23.1%
  900/1000 (90%) - ROUGE-1: 23.0%
  1000/1000 (100%) - ROUGE-1: 22.9%

📈 RÉSULTATS ROUGE - GPT-2 FINE-TUNÉ

🎯 TES RÉSULTATS GPT-2 (1000 exemples):
  ROUGE-1:    22.92%
  ROUGE-2:    8.01%
  ROUGE-L:    17.65%
  ROUGE-Lsum: 19.84%

📊 STATISTIQUES:
  Écart-type ROUGE-1: 11.87%
  Temps d'évaluation: 19.7 min

📊 COMPARAISON AVEC L'ARTICLE (Table 3)

Modèle                    ROUGE-1    ROUGE-2    ROUGE-L    ROUGE-Lsum
-------------------

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ PROJET GPT-2 TERMINÉ !
📦 Fichier: gpt2_finetuned_project.zip
📊 ROUGE-1: 22.92%
📈 Comparaison article: -1.91%
