In [1]:
import pandas as pd

In [2]:
# Location of the LexRank results CSV, relative to the notebook's working directory
LexRank_data_path = "../resultats/results_LexRank.csv"

# Read the CSV into a DataFrame using pandas' default parsing options
df = pd.read_csv(filepath_or_buffer=LexRank_data_path)

In [3]:
# Preview the first five rows (the default row count, spelled out)
df.head(n=5)

Unnamed: 0,Text,Reference,Generated
0,"OCTOBER TERM, 2002\n Syllabus\n EARLY, WA...",William Packer was convicted in a California s...,"On direct appeal, the State Court of Appeal re..."
1,"OCTOBER TERM, 2002\n Syllabus\n DOW CHEMI...",In 1984 Dow Chemical Co. negotiated a settleme...,"Syllabus DOW CHEMICAL CO.\nArgued February 26,..."
2,"OCTOBER TERM, 2002\n Syllabus\n SYNGENTA ...",Hurley Henson filed suit in Louisiana state co...,"Argued October 15, 2002-Decided November 5, 20..."
3,OPINION OF THE COURTRUMSFELD V. PADILLA542 U. ...,"Jose Padilla, an American citizen, was arreste...","Padilla’s motion was still pending when, on Ju..."
4,"OCTOBER TERM, 1993\n Syllabus\n CONSOLIDA...",Consolidated Rail Corporation (Conrail) employ...,The injury we contemplate when considering neg...


In [4]:
# Select the two columns used by the evaluation below: 'Generated' and 'Reference'
gen_txt, ref_txt = df["Generated"], df["Reference"]

In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Tokenizer and model are loaded from the same checkpoint identifier
checkpoint_name = "nsi319/legal-pegasus"

tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


In [6]:
import torch

def generate_summary(model, tokenizer, text, max_input_length=1024, max_output_length=256):
    """Generate a summary for the given text.

    Args:
        model: Object exposing a ``device`` attribute and a ``generate`` method.
        tokenizer: Callable used to encode ``text``; must also provide ``decode``.
        text: Input handed to the tokenizer.
        max_input_length: ``max_length`` used when tokenizing; the tokenizer is
            called with ``truncation=True``.
        max_output_length: ``max_length`` forwarded to ``model.generate``.

    Returns:
        The first generated sequence, decoded with ``skip_special_tokens=True``.
    """
    encoding = tokenizer(
        text,
        max_length=max_input_length,
        truncation=True,
        return_tensors="pt",
        padding="longest",
    )
    # Make sure the token ids live on the same device as the model
    input_ids = encoding.input_ids.to(model.device)

    # Generation settings, kept together so they are easy to read at a glance
    generation_options = {
        "max_length": max_output_length,
        "num_beams": 5,
        "length_penalty": 2.0,
        "early_stopping": True,
    }
    generated_sequences = model.generate(input_ids, **generation_options)

    # Only the first returned sequence is decoded
    first_sequence = generated_sequences[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

In [7]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

def evaluate_model(model, tokenizer, texts, references, max_input_length=1024, max_output_length=256):
    """Summarize each text and score the summaries against the references.

    Each text is summarized with ``generate_summary``; the summary is scored
    against its reference with ROUGE-1/2/L (F-measure) and BERTScore (F1).

    Args:
        model: Model forwarded to ``generate_summary``.
        tokenizer: Tokenizer forwarded to ``generate_summary``.
        texts: Iterable of input texts.
        references: Iterable of reference summaries, paired positionally with
            ``texts`` via ``zip`` (surplus items in the longer one are ignored).
        max_input_length: Forwarded to ``generate_summary``.
        max_output_length: Forwarded to ``generate_summary``.

    Returns:
        A tuple ``(avg_scores, summaries)``. ``avg_scores`` maps ``'rouge1'``,
        ``'rouge2'``, ``'rougeL'`` and ``'bert_score'`` to the mean score over
        all pairs; ``summaries`` lists the generated summaries in input order.
        With no pairs to evaluate, every average is NaN and ``summaries`` is
        empty (previously this raised ``ZeroDivisionError``).
    """
    rouge_metrics = ('rouge1', 'rouge2', 'rougeL')

    # Materialize the pairs so an empty input is detected before the scorers
    # (and whatever resources they load) are constructed.
    pairs = list(zip(texts, references))
    if not pairs:
        avg_scores = {metric: float('nan') for metric in rouge_metrics}
        avg_scores['bert_score'] = float('nan')
        return avg_scores, []

    rouge_scorer_instance = rouge_scorer.RougeScorer(list(rouge_metrics), use_stemmer=True)
    bert_scorer_instance = BERTScorer(lang="en", rescale_with_baseline=True)

    rouge_scores = {metric: [] for metric in rouge_metrics}
    bert_scores_f1 = []
    summaries = []

    for text, reference in pairs:
        # Generate the summary
        generated_summary = generate_summary(model, tokenizer, text, max_input_length, max_output_length)
        summaries.append(generated_summary)

        # ROUGE: keep the F-measure of each metric
        rouge_results = rouge_scorer_instance.score(reference, generated_summary)
        for metric in rouge_metrics:
            rouge_scores[metric].append(rouge_results[metric].fmeasure)

        # BERTScore: keep the mean of the returned F1 values for this pair
        _, _, f1 = bert_scorer_instance.score([generated_summary], [reference])
        bert_scores_f1.append(f1.mean().item())

    # Average every metric over the evaluated pairs
    avg_scores = {metric: sum(values) / len(values) for metric, values in rouge_scores.items()}
    avg_scores['bert_score'] = sum(bert_scores_f1) / len(bert_scores_f1)

    return avg_scores, summaries

# Evaluate the model on the selected text/reference pairs and print the averaged scores
results, summaries = evaluate_model(
    model=model,
    tokenizer=tokenizer,
    texts=gen_txt,
    references=ref_txt,
)
print("Scores ROUGE et BERT :", results)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Scores ROUGE et BERT : {'rouge1': 0.39637698047377723, 'rouge2': 0.13573539695952136, 'rougeL': 0.2187488646476557, 'bert_score': 0.02475793566321954}


In [8]:
import pandas as pd

# Put the input texts, the reference summaries and the new model summaries side by side
summary_df = pd.DataFrame({'Generated': gen_txt, 'Reference': ref_txt, 'Summary': summaries})

# Save first; index=False keeps the row index out of the CSV
summary_df.to_csv('../resultats/results_LexRank_summaries.csv', index=False)

# Preview goes last: a notebook cell only renders its final expression, so the
# original mid-cell head() call was never displayed
summary_df.head()