In [1]:
# Mount Google Drive; the model and data paths used below live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
%pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... done
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c922863668a89f59d7cf409711dbfbe1fd83c286b6dfb42168dbbf61dd7b3b71
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [3]:
# Directory of the checkpoint that is loaded with from_pretrained() below.
model_path = "/content/drive/MyDrive/legal-pegasus-model-Scopus"

# Dataset root; documents and reference summaries sit in separate sub-directories.
test_path = '/content/drive/MyDrive/SCOTU_data_txt_save'
test_path_txt = f"{test_path}/text_dev"
test_path_summary = f"{test_path}/summary_dev"

In [4]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load the tokenizer and the model from the same checkpoint directory.
tokenizer = PegasusTokenizer.from_pretrained(model_path)
model = PegasusForConditionalGeneration.from_pretrained(model_path)

In [5]:
import os

def _read_files_sorted(directory):
    """Return the contents of every regular file in `directory`, ordered by file name."""
    contents = []
    # os.listdir() returns entries in arbitrary order, so sort for a stable result.
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        # Skip sub-directories (e.g. notebook checkpoint folders) instead of failing in open().
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            contents.append(f.read())
    return contents


def load_test_data(txt_path, summary_path):
    """Load the test documents and their reference summaries.

    Both directories are read in sorted file-name order so that ``texts[i]``
    and ``summaries[i]`` line up deterministically.
    NOTE(review): this assumes a document and its summary sort to the same
    position (e.g. they share a file name) -- confirm against the dataset layout.

    Args:
        txt_path: directory containing one UTF-8 text file per document.
        summary_path: directory containing one UTF-8 text file per summary.

    Returns:
        A ``(texts, summaries)`` tuple of two lists of strings.

    Raises:
        ValueError: if the two directories do not hold the same number of files,
            because the documents could then not be paired with summaries.
    """
    texts = _read_files_sorted(txt_path)
    summaries = _read_files_sorted(summary_path)

    if len(texts) != len(summaries):
        raise ValueError(
            f"Found {len(texts)} text files in {txt_path!r} but "
            f"{len(summaries)} summary files in {summary_path!r}"
        )

    return texts, summaries

# Load the data. test_path_txt and test_path_summary are defined in the
# configuration cell; they are not redefined here so each path has one definition.
texts, summaries = load_test_data(test_path_txt, test_path_summary)

# Every document needs exactly one reference summary, otherwise the pairs used
# for evaluation would be misaligned.
assert len(texts) == len(summaries), (
    f"{len(texts)} texts but {len(summaries)} summaries were loaded"
)

# Inspect one example (uncomment to print)
# print("Texte original :", texts[0])
# print("Résumé attendu :", summaries[0])

In [6]:
import torch

def generate_summary(model, tokenizer, text, max_input_length=1024, max_output_length=256):
    """Generate a summary for ``text`` using beam search.

    Args:
        model: object exposing ``device`` and ``generate`` (the Pegasus model
            in this notebook).
        tokenizer: callable that encodes ``text`` and also provides ``decode``.
        text: the document to summarise.
        max_input_length: ``max_length`` given to the tokenizer; truncation is
            enabled, so longer inputs are cut.
        max_output_length: ``max_length`` given to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    encoding = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
        padding="longest",
    )
    # Keep the token ids on the same device as the model.
    input_ids = encoding.input_ids.to(model.device)

    generation_options = {
        "max_length": max_output_length,
        "num_beams": 5,
        "length_penalty": 2.0,
        "early_stopping": True,
    }
    generated = model.generate(input_ids, **generation_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Put the model in evaluation mode and move it to the GPU when one is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
model.to(device)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_no

In [None]:
from rouge_score import rouge_scorer
from tqdm import tqdm

def evaluate_model(model, tokenizer, texts, references, max_input_length=1024, max_output_length=256):
    """Evaluate the model on test data and return average ROUGE F-measures.

    A summary is generated for every text with ``generate_summary`` and scored
    against its reference using ROUGE-1, ROUGE-2 and ROUGE-L (with stemming).

    Args:
        model: model passed through to ``generate_summary``.
        tokenizer: tokenizer passed through to ``generate_summary``.
        texts: iterable of source documents.
        references: iterable of reference summaries, in the same order as ``texts``.
        max_input_length: forwarded to ``generate_summary``.
        max_output_length: forwarded to ``generate_summary``.

    Returns:
        Dict with keys ``'rouge1'``, ``'rouge2'`` and ``'rougeL'`` mapping to
        the mean F-measure over all examples.

    Raises:
        ValueError: if ``texts`` and ``references`` differ in length, or if
            there is nothing to evaluate (an average would be undefined).
    """
    # Materialise the inputs so they can be counted and validated before any
    # expensive generation starts.
    texts = list(texts)
    references = list(references)
    if len(texts) != len(references):
        raise ValueError(
            f"Got {len(texts)} texts but {len(references)} references; "
            "each text needs exactly one reference."
        )
    if not texts:
        raise ValueError("Cannot evaluate on an empty test set.")

    metric_names = ('rouge1', 'rouge2', 'rougeL')
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    scores = []
    # Generation is slow, so show progress over the examples.
    for text, reference in tqdm(zip(texts, references), total=len(texts), desc="Evaluating"):
        generated_summary = generate_summary(model, tokenizer, text, max_input_length, max_output_length)
        # The reference is passed first, the generated summary second.
        scores.append(scorer.score(reference, generated_summary))

    # Average the F-measure of each metric over all examples.
    return {
        name: sum(s[name].fmeasure for s in scores) / len(scores)
        for name in metric_names
    }

# Evaluate the model on the loaded test set and print the averaged scores.
results = evaluate_model(model, tokenizer, texts=texts, references=summaries)
print("Scores ROUGE :", results)

In [None]:
import pandas as pd

# Summarise every test document, then store source, reference and output side by side.
# NOTE(review): this generates every summary a second time; evaluate_model already
# produced one per text but only returns the averaged scores.
generated_summaries = [generate_summary(model, tokenizer, t) for t in texts]
df_results = pd.DataFrame(
    {"Text": texts, "Reference": summaries, "Generated": generated_summaries}
)
df_results.to_csv("/content/drive/MyDrive/results_legal-pegasus-SCOTUS.csv", index=False)

In [None]:
# example_generated_summary = generate_summary(model, tokenizer, texts[0])
# print("Résumé généré :", example_generated_summary)