In [8]:
!pip install pandas tqdm transformers torch





In [9]:
import os
import time

import pandas as pd
from tqdm import tqdm

In [10]:
# Number of leading rows that the per-section scoring loop iterates over.
TEXTS_COUNT = 100

In [11]:
# Input CSVs.
# NOTE(review): despite the "LexRank" in its name, this path points at the Luhn results file.
LexRank_data_path = '../evaluate models/output/results_Luhn_dev.csv'
# Target table; the per-section scoring cell uses its column names as section keys.
target_path_csv = '../SCOTUS_data/paragraph_target_df_dev.csv'

# df holds the Text / Reference / Generated columns shown by df.head() below.
df = pd.read_csv(LexRank_data_path)
df_target = pd.read_csv(target_path_csv)

In [12]:
# Preview the first rows of the loaded results table.
df.head()

Unnamed: 0,Text,Reference,Generated
0,"OCTOBER TERM, 2002\n Syllabus\n EARLY, WA...",Acting on a tip from a confidential informant ...,"On direct appeal, the State Court of Appeal re..."
1,"OCTOBER TERM, 2002\n Syllabus\n DOW CHEMI...",Robert Smith was convicted of first-degree mur...,Syllabus DOW CHEMICAL CO.\nET AL. v. STEPHEN...
2,"OCTOBER TERM, 2002\n Syllabus\n SYNGENTA ...","In 1984, Congress enacted legislation ordering...","Argued October 15, 2002-Decided November 5, 20..."
3,OPINION OF THE COURTRUMSFELD V. PADILLA542 U. ...,Stoneridge Investment Partners alleged that th...,"DONALD H. RUMSFELD, SECRETARY OF DEFENSE, PETI..."
4,"OCTOBER TERM, 1993\n Syllabus\n CONSOLIDA...",Consolidated Rail Corporation (Conrail) employ...,"Finally, the common law does not support the c..."


In [13]:
# 'Generated' is the text fed to the summarizer; 'Reference' is the text the
# resulting summaries are scored against.
gen_txt = df['Generated']
ref_txt = df['Reference']

In [14]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the seq2seq model from the same checkpoint,
# then move the model onto the selected device
checkpoint_name = "nsi319/legal-pegasus"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_name)
model = model.to(device)

# Summarize a collection of texts, one document at a time
def summarize_texts(texts):
    """Generate one abstractive summary per input text.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of documents. Each document is truncated to at most
            1024 tokens before generation.

    Returns:
        list of str: one decoded summary per input, in input order. Inputs
        that are not strings (e.g. NaN read from an empty CSV cell) or that
        are blank yield an empty string instead of aborting the whole run.
    """
    summaries = []
    for text in texts:
        # The tokenizer only accepts strings; keep positions aligned with the
        # input rows by emitting an empty summary for unusable entries.
        if not isinstance(text, str) or not text.strip():
            summaries.append("")
            continue

        # Tokenize (truncating long documents) and move tensors to the model's device
        inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt").to(device)

        # Beam-search generation; the attention mask is passed explicitly so
        # generate() does not have to infer it from the token ids.
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=1024,
            min_length=40,
            length_penalty=2.0,
            num_beams=4,
            early_stopping=True
        )

        # Decode the best beam back to plain text
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
        summaries.append(summary)

    return summaries

# Time the whole summarization pass with a monotonic clock, so system clock
# adjustments during the long run cannot distort the measurement.
start_time = time.perf_counter()

# Summarize the texts in gen_txt; tqdm reports progress for the long loop
summarized_texts = summarize_texts(tqdm(gen_txt, desc="Summarizing"))
end_time = time.perf_counter()
execution_time = end_time - start_time

# Attach the summaries to the DataFrame
df['Summary'] = summarized_texts

# Make sure the output folder exists before writing the results
os.makedirs('./output', exist_ok=True)
df.to_csv('./output/results_Hybrid_Luhn_Legal_dev.csv', index=False)


ImportError: cannot import name 'logging' from 'huggingface_hub' (c:\Users\Nicolas\anaconda3\Lib\site-packages\huggingface_hub\__init__.py)

In [8]:
# Summaries stored in df['Summary'] by the generation cell; that cell must have
# completed in this kernel for the column to exist.
summary_gen = df['Summary']

In [9]:
from rouge_score import rouge_scorer
from bert_score import BERTScorer

# Scorers shared by this cell and the per-section scoring cell.
ROUGE_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
BERT_scorer = BERTScorer(lang="en")
ROUGE_scores = []
BERT_scores = []
# Pair references and summaries positionally so the loop does not depend on
# the Series carrying a default 0..n-1 index.
for reference, candidate in zip(ref_txt, summary_gen):
    # RougeScorer.score expects (target, prediction)
    ROUGE_scores.append(ROUGE_scorer.score(reference, candidate))
    # BERTScorer.score expects (cands, refs) and returns (P, R, F1); passing the
    # candidate first keeps precision and recall in their documented positions.
    BERT_scores.append(BERT_scorer.score([candidate], [reference]))

if not ROUGE_scores:
    raise ValueError("No reference/summary pairs to score")

# Averages: ROUGE entries are mean recall, bert_score is mean F1 (tuple index 2)
avg_scores = {
    'rouge1': sum(s['rouge1'].recall for s in ROUGE_scores) / len(ROUGE_scores),
    'rouge2': sum(s['rouge2'].recall for s in ROUGE_scores) / len(ROUGE_scores),
    'rougeL': sum(s['rougeL'].recall for s in ROUGE_scores) / len(ROUGE_scores),
    'bert_score': sum(s[2].mean().item() for s in BERT_scores) / len(BERT_scores),
}

print("Scores ROUGE :", avg_scores)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Scores ROUGE : {'rouge1': 0.36536931894440317, 'rouge2': 0.1199662001046805, 'rougeL': 0.22142622385237762, 'bert_score': 0.8255935955047607}


In [10]:
# Per-section score lists: one list per metric for each section of the target table.
metric_names = ('rouge1', 'rouge2', 'rougeL', 'bert_score')
scores = {
    section: {metric: [] for metric in metric_names}
    for section in ('facts_of_the_case', 'question', 'conclusion')
}

# Never loop past the rows that actually exist in either table.
n_texts = min(TEXTS_COUNT, len(df_target), len(summary_gen))
if n_texts == 0:
    raise ValueError("No rows available to score against df_target")

# Iterate over the known sections rather than every column of df_target, so an
# extra column in the CSV (e.g. a saved index) cannot raise a KeyError.
for column_name in scores:
    for i in range(n_texts):
        # Positional access on both sides keeps row i of the targets aligned
        # with row i of the summaries regardless of the index labels.
        ref = df_target[column_name].iloc[i]
        gen = summary_gen.iloc[i]

        # ROUGE: score(target, prediction); only recall is kept
        rouge_result = ROUGE_scorer.score(ref, gen)
        for metric in ('rouge1', 'rouge2', 'rougeL'):
            scores[column_name][metric].append(rouge_result[metric].recall)

        # BERTScore: score(cands, refs) returns (P, R, F1); F1 is kept
        _, _, bert_f1 = BERT_scorer.score([gen], [ref])
        scores[column_name]['bert_score'].append(bert_f1.mean().item())

# Mean of every metric for every section
avg_scores_target = {
    col: {metric: sum(values) / len(values) for metric, values in per_metric.items()}
    for col, per_metric in scores.items()
}

for col, metrics in avg_scores_target.items():
    print(f"\nScores moyens pour {col} :")
    print(metrics)



Scores moyens pour facts_of_the_case :
{'rouge1': 0.306481537977766, 'rouge2': 0.04488918383936519, 'rougeL': 0.21236966189969575, 'bert_score': 0.7978615707159042}

Scores moyens pour question :
{'rouge1': 0.34774350403739335, 'rouge2': 0.05005823144621627, 'rougeL': 0.29334381862027237, 'bert_score': 0.7922924607992172}

Scores moyens pour conclusion :
{'rouge1': 0.31080339758491965, 'rouge2': 0.05310137414274551, 'rougeL': 0.21834438949550697, 'bert_score': 0.7979495960474015}


In [11]:
# One-row table for the whole-summary averages, labelled "global"
df_avg_scores = pd.DataFrame([avg_scores], index=['global'])

# Per-section averages, transposed so that every section becomes a row
df_avg_scores_target = pd.DataFrame(avg_scores_target).transpose()

# Stack both tables and record the generation time on every row
df_score = pd.concat([df_avg_scores, df_avg_scores_target])
df_score['Execution time'] = execution_time

print(df_score.head())

                     rouge1    rouge2    rougeL  bert_score  Execution time
global             0.365369  0.119966  0.221426    0.825594      2004.62402
facts_of_the_case  0.306482  0.044889  0.212370    0.797862      2004.62402
question           0.347744  0.050058  0.293344    0.792292      2004.62402
conclusion         0.310803  0.053101  0.218344    0.797950      2004.62402


In [12]:
# Persist the combined score table; the index is kept so the row labels
# (global and section names) are written as the first column.
df_score.to_csv("./output/scores_Hybrid_Luhn_Legal_dev.csv")