In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Number of leading rows the per-target evaluation loop iterates over.
TEXTS_COUNT = 100

In [3]:
# LexRank results table (has the 'Text', 'Reference' and 'Generated' columns).
LexRank_data_path = '../resultats/results_LexRank.csv'
df = pd.read_csv(LexRank_data_path)

# Per-paragraph target table used by the per-target evaluation.
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'
df_target = pd.read_csv(target_path_csv)

In [4]:
# Preview the first rows of the LexRank results table.
df.head()

Unnamed: 0,Text,Reference,Generated
0,"OCTOBER TERM, 2002\n Syllabus\n EARLY, WA...",William Packer was convicted in a California s...,"On direct appeal, the State Court of Appeal re..."
1,"OCTOBER TERM, 2002\n Syllabus\n DOW CHEMI...",In 1984 Dow Chemical Co. negotiated a settleme...,"Syllabus DOW CHEMICAL CO.\nArgued February 26,..."
2,"OCTOBER TERM, 2002\n Syllabus\n SYNGENTA ...",Hurley Henson filed suit in Louisiana state co...,"Argued October 15, 2002-Decided November 5, 20..."
3,OPINION OF THE COURTRUMSFELD V. PADILLA542 U. ...,"Jose Padilla, an American citizen, was arreste...","Padilla’s motion was still pending when, on Ju..."
4,"OCTOBER TERM, 1993\n Syllabus\n CONSOLIDA...",Consolidated Rail Corporation (Conrail) employ...,The injury we contemplate when considering neg...


In [None]:
# gen_txt: texts fed to the summarizer; ref_txt: reference summaries used for scoring.
gen_txt, ref_txt = df['Generated'], df['Reference']

: 

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Pick the device: use CUDA when torch reports it as available, otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the seq2seq model from the same checkpoint,
# then move the model onto the selected device
checkpoint = "nsi319/legal-pegasus"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = model.to(device)

# Function that summarizes the texts
def summarize_texts(
    texts,
    max_input_length=1024,
    max_summary_length=1024,
    min_summary_length=40,
    num_beams=4,
    length_penalty=2.0,
):
    """Summarize each text with the module-level ``tokenizer`` and ``model``.

    Args:
        texts: Iterable of texts to summarize (for example a pandas Series).
        max_input_length: Inputs are truncated to this many tokens.
        max_summary_length: ``max_length`` passed to ``model.generate``.
        min_summary_length: ``min_length`` passed to ``model.generate``.
        num_beams: Beam width used for generation.
        length_penalty: Length penalty used for generation.

    Returns:
        A list of summary strings, one per input and in the same order.
        Entries that are not strings (e.g. NaN read from an empty CSV cell)
        or that contain only whitespace yield an empty summary.
    """
    summaries = []
    for text in texts:
        # Empty CSV cells are read back as float NaN, which the tokenizer
        # rejects; a blank string would make the model generate from nothing.
        # Emit an empty summary so the output stays aligned with the input.
        if not isinstance(text, str) or not text.strip():
            summaries.append("")
            continue

        # Prepare the model input (truncated to max_input_length tokens)
        inputs = tokenizer(
            text,
            max_length=max_input_length,
            truncation=True,
            return_tensors="pt",
        ).to(device)

        # Generate the summary; unpacking the encoding forwards the attention
        # mask together with the input ids
        summary_ids = model.generate(
            **inputs,
            max_length=max_summary_length,
            min_length=min_summary_length,
            length_penalty=length_penalty,
            num_beams=num_beams,
            early_stopping=True,
        )

        # Decode the generated summary
        summaries.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))

    return summaries

# Summarize the texts in gen_txt
summarized_texts = summarize_texts(gen_txt)

# Add the summaries to the DataFrame
df['Summary'] = summarized_texts

# Create the output directory first: to_csv does not create missing parent
# folders, and failing here would throw away the summaries just generated.
results_path = Path('./output/results_Hybrid_Lex_Legal.csv')
results_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_path, index=False)


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)


In [7]:
# Generated summaries; relies on the 'Summary' column that the summarization cell adds to df.
summary_gen = df['Summary']

In [None]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")

# Pair each reference with the generated summary at the same position.
references = list(ref_txt)
candidates = list(summary_gen)

# RougeScorer.score takes (target, prediction): the reference goes first.
ROUGE_scores = [ROUGE_scorer.score(ref, cand) for ref, cand in zip(references, candidates)]

# BERTScorer.score takes (candidates, references) -- the opposite order from
# ROUGE. Passing the references first swapped precision and recall in the
# stored results. A single batched call also avoids invoking the model once
# per pair.
bert_precision, bert_recall, bert_f1 = BERT_scorer.score(candidates, references)
# One (precision, recall, F1) triple per text.
BERT_scores = list(zip(bert_precision, bert_recall, bert_f1))

# Average scores (ROUGE F-measure and BERTScore F1)
avg_scores = {
    'rouge1': sum(s['rouge1'].fmeasure for s in ROUGE_scores) / len(ROUGE_scores),
    'rouge2': sum(s['rouge2'].fmeasure for s in ROUGE_scores) / len(ROUGE_scores),
    'rougeL': sum(s['rougeL'].fmeasure for s in ROUGE_scores) / len(ROUGE_scores),
    'bert_score': sum(s[2].mean().item() for s in BERT_scores) / len(BERT_scores),
}

print("Scores ROUGE :", avg_scores)

In [None]:
# Per-target score lists: one entry per scored text and metric.
scores = {
    'facts_of_the_case': {'rouge1': [], 'rouge2': [], 'rougeL': [], 'bert_score': []},
    'question': {'rouge1': [], 'rouge2': [], 'rougeL': [], 'bert_score': []},
    'conclusion': {'rouge1': [], 'rouge2': [], 'rougeL': [], 'bert_score': []}
}

# Never read past the end of either table when fewer than TEXTS_COUNT rows exist.
n_texts = min(TEXTS_COUNT, len(df_target), len(summary_gen))
# Positional slicing on both sides keeps targets and summaries aligned.
candidates = list(summary_gen.iloc[:n_texts])

# Iterate over the target columns declared in `scores` rather than over every
# column of df_target: any extra column in the CSV (for instance a saved index
# column) has no entry in `scores` and would raise a KeyError.
for column_name in scores:
    references = list(df_target[column_name].iloc[:n_texts])

    # ROUGE scores (recall): RougeScorer.score takes (target, prediction)
    for ref, gen in zip(references, candidates):
        rouge_score = ROUGE_scorer.score(ref, gen)
        scores[column_name]['rouge1'].append(rouge_score['rouge1'].recall)
        scores[column_name]['rouge2'].append(rouge_score['rouge2'].recall)
        scores[column_name]['rougeL'].append(rouge_score['rougeL'].recall)

    # BERT scores (F1): BERTScorer.score takes (candidates, references);
    # one batched call per column instead of one call per text
    _, _, bert_f1 = BERT_scorer.score(candidates, references)
    scores[column_name]['bert_score'].extend(f1.item() for f1 in bert_f1)

# Mean of every metric for each target column
avg_scores_target = {
    col: {
        metric: sum(values) / len(values)
        for metric, values in scores[col].items()
    }
    for col in scores
}

for col, metrics in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(metrics)


In [None]:
# Single-row frame holding the corpus-level averages, labelled 'global'.
df_avg_scores = pd.DataFrame([avg_scores], index=['global'])

# One row per target column, one column per metric.
per_target_frame = pd.DataFrame(avg_scores_target)
df_avg_scores_target = per_target_frame.transpose()

# Stack the global row on top of the per-target rows.
df_score = pd.concat([df_avg_scores, df_avg_scores_target])
print(df_score.head())

In [11]:
# Create the output directory if needed: to_csv does not create missing parent folders.
scores_path = Path("./output/scores_Hybrid_Lex_Legal_dev.csv")
scores_path.parent.mkdir(parents=True, exist_ok=True)
df_score.to_csv(scores_path)