In [1]:
import time
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [2]:
# Number of leading rows scored per reference column in the per-section
# evaluation loop further down.
TEXTS_COUNT = 100

In [3]:
# NOTE(review): despite its name, this path points at the BM25 results file,
# not at LexRank output — consider renaming once all usages are known.
LexRank_data_path = '../evaluate models/output/results_BM25_dev.csv'
# CSV whose columns serve as per-section reference texts in the evaluation.
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'

# Load the extractive results and the per-section references.
df = pd.read_csv(LexRank_data_path)
df_target = pd.read_csv(target_path_csv)

In [4]:
# Preview the loaded results.
# NOTE(review): the preview shows an 'Unnamed: 0' column (a saved index read
# back as data) and 'Generated' values starting with "['", i.e. they look like
# stringified Python lists — confirm this is the intended summarizer input.
df.head()

Unnamed: 0,Text,Reference,Generated
0,"OCTOBER TERM, 1998\n Per Curiam \n MARYLA...",Acting on a tip from a confidential informant ...,['After receiving a tip from a reliable inform...
1,"OCTOBER TERM, 2001\n Syllabus\n STEWART, ...",Robert Smith was convicted of first-degree mur...,"['Respondent filed a federal habeas petition, ..."
2,"U.S. Supreme Court South Dakota v. Dole, 483 U...","In 1984, Congress enacted legislation ordering...","['See 23 U.S.C. § 158 (1982 ed., Supp. III). ..."
3,OPINION OF THE COURTSTONERIDGE INVESTMENT PART...,Stoneridge Investment Partners alleged that th...,['We consider the reach of the private right o...
4,"OCTOBER TERM, 1999\n Syllabus\n FISCHER ...","Jeffrey Fischer, while president and part owne...","[""Petitioner, while president and part owner o..."


In [5]:
# Text passed to the summarizer (the 'Generated' column of the loaded results).
gen_txt = df['Generated']
# Reference summaries used for the global evaluation.
ref_txt = df['Reference']

In [6]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Hugging Face checkpoint shared by the tokenizer and the seq2seq model
MODEL_NAME = "nsi319/legal-pegasus"

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the model, then move the model onto the chosen device
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model = model.to(device)

# Summarize each input text with the loaded model
def summarize_texts(texts):
    """Generate one abstractive summary per input text.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of strings to summarize. Each text is truncated to
            1024 tokens before being passed to the model.

    Returns:
        list[str]: Decoded summaries, in the same order as ``texts``.
    """
    summaries = []
    # One beam-search generation per text is slow, so show progress.
    for text in tqdm(texts, desc="Summarizing"):
        # Tokenize the input and move the tensors to the model's device
        inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt").to(device)

        # Generate the summary; the attention mask produced by the tokenizer
        # is passed explicitly instead of leaving generate() to infer it
        summary_ids = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=1024,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )

        # Decode the generated token ids back to text
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    return summaries

# Time the summarization pass only (model loading is not included);
# perf_counter is monotonic, so the elapsed value cannot be skewed by
# system clock adjustments during the long run
start_time = time.perf_counter()

# Summarize the texts in gen_txt
summarized_texts = summarize_texts(gen_txt)
end_time = time.perf_counter()
execution_time = end_time - start_time

# Attach the summaries to the DataFrame
df['Summary'] = summarized_texts

# Create the output folder first so a missing directory cannot make the save
# fail after the long-running generation has finished
results_path = Path('./output/results_Hybrid_BM25_Legal_dev.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_path, index=False)


  return torch.load(checkpoint_file, map_location="cpu")


In [7]:
# Summaries stored in the 'Summary' column; scored against the references in the evaluation cells.
summary_gen = df['Summary']

In [8]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

# ROUGE-1/2/L with stemming, and BERTScore configured for English
ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")
ROUGE_scores = []
BERT_scores = []
for reference, candidate in zip(ref_txt, summary_gen):
    # RougeScorer.score takes (target, prediction)
    ROUGE_scores.append(ROUGE_scorer.score(reference, candidate))
    # BERTScorer.score takes (candidates, references) and returns (P, R, F1).
    # F1 is symmetric, so fixing the argument order leaves the averaged value
    # unchanged while making P and R meaningful.
    BERT_scores.append(BERT_scorer.score([candidate], [reference]))

# Average scores over all summaries (ROUGE recall and BERTScore F1)
avg_scores = {
    'rouge1': sum(s['rouge1'].recall for s in ROUGE_scores) / len(ROUGE_scores),
    'rouge2': sum(s['rouge2'].recall for s in ROUGE_scores) / len(ROUGE_scores),
    'rougeL': sum(s['rougeL'].recall for s in ROUGE_scores) / len(ROUGE_scores),
    'bert_score': sum(s[2].mean().item() for s in BERT_scores) / len(BERT_scores),
}

print("Scores ROUGE :", avg_scores)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Scores ROUGE : {'rouge1': 0.32288991313283205, 'rouge2': 0.09198303422240128, 'rougeL': 0.20287965618741174, 'bert_score': 0.8188380318880081}


In [9]:
# Per-text score lists for every reference column. The keys are derived from
# df_target so the dict always matches the columns the loop iterates over.
metric_names = ('rouge1', 'rouge2', 'rougeL', 'bert_score')
scores = {
    column_name: {metric: [] for metric in metric_names}
    for column_name in df_target.columns
}

for column_name in df_target.columns:
    for i in range(TEXTS_COUNT):
        # Positional access on both sides keeps row i of the references
        # aligned with row i of the generated summaries
        ref = df_target[column_name].iloc[i]
        gen = summary_gen.iloc[i]

        # ROUGE scores (recall); RougeScorer.score takes (target, prediction)
        rouge_result = ROUGE_scorer.score(ref, gen)
        scores[column_name]['rouge1'].append(rouge_result['rouge1'].recall)
        scores[column_name]['rouge2'].append(rouge_result['rouge2'].recall)
        scores[column_name]['rougeL'].append(rouge_result['rougeL'].recall)

        # BERTScore F1; BERTScorer.score takes (candidates, references)
        _, _, bert_f1 = BERT_scorer.score([gen], [ref])
        scores[column_name]['bert_score'].append(bert_f1.mean().item())

# Mean of every metric for each reference column
avg_scores_target = {
    col: {
        metric: sum(values) / len(values)
        for metric, values in scores[col].items()
    }
    for col in df_target.columns
}

for col, metrics in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(metrics)



Scores moyens pour facts_of_the_case :
{'rouge1': 0.2883019004743229, 'rouge2': 0.04283148574839096, 'rougeL': 0.21268098952422407, 'bert_score': 0.7952366030216217}

Scores moyens pour question :
{'rouge1': 0.3217715891273004, 'rouge2': 0.03825229872340871, 'rougeL': 0.27954366136441877, 'bert_score': 0.786566653251648}

Scores moyens pour conclusion :
{'rouge1': 0.283553706160443, 'rouge2': 0.04208856968236058, 'rougeL': 0.21166240584729246, 'bert_score': 0.7908708065748214}


In [10]:
# Single-row frame holding the corpus-level averages, labelled 'global'
df_avg_scores = pd.DataFrame([avg_scores], index=['global'])

# Per-section averages, transposed so that each reference column becomes a row
df_avg_scores_target = pd.DataFrame(avg_scores_target).transpose()

# Stack both frames and record the measured summarization time on every row
df_score = pd.concat([df_avg_scores, df_avg_scores_target], axis=0).assign(
    **{'Execution time': execution_time}
)

print(df_score.head())

                     rouge1    rouge2    rougeL  bert_score  Execution time
global             0.322890  0.091983  0.202880    0.818838     7306.407738
facts_of_the_case  0.288302  0.042831  0.212681    0.795237     7306.407738
question           0.321772  0.038252  0.279544    0.786567     7306.407738
conclusion         0.283554  0.042089  0.211662    0.790871     7306.407738


In [11]:
# Persist the score table; the index is kept, so the row labels are written as the first CSV column.
df_score.to_csv("./output/scores_Hybrid_BM25_Legal_dev.csv")