In [1]:
# %pip install bm25s
# %pip install spacy
# %pip install -U 'spacy[cuda12x]'
# %pip install rouge_score
# %pip install pysbd

In [None]:
import functions as fct
import bm25s
import numpy as np
import json

In [3]:
train_path = 'SCOTUS/train.json'
dev_path = 'SCOTUS/dev.json'

Summarization with legal-pegasus

In [None]:
text = """On March 5, 2021, the Securities and Exchange Commission charged AT&T, Inc. with repeatedly violating Regulation FD, and three of its Investor Relations executives with aiding and abetting AT&T's violations, by selectively disclosing material nonpublic information to research analysts. According to the SEC's complaint, AT&T learned in March 2016 that a steeper-than-expected decline in its first quarter smartphone sales would cause AT&T's revenue to fall short of analysts' estimates for the quarter. The complaint alleges that to avoid falling short of the consensus revenue estimate for the third consecutive quarter, AT&T Investor Relations executives Christopher Womack, Michael Black, and Kent Evans made private, one-on-one phone calls to analysts at approximately 20 separate firms. On these calls, the AT&T executives allegedly disclosed AT&T's internal smartphone sales data and the impact of that data on internal revenue metrics, despite the fact that internal documents specifically informed Investor Relations personnel that AT&T's revenue and sales of smartphones were types of information generally considered "material" to AT&T investors, and therefore prohibited from selective disclosure under Regulation FD. The complaint further alleges that as a result of what they were told on these calls, the analysts substantially reduced their revenue forecasts, leading to the overall consensus revenue estimate falling to just below the level that AT&T ultimately reported to the public on April 26, 2016. The SEC's complaint, filed in federal district court in Manhattan, charges AT&T with violations of the disclosure provisions of Section 13(a) of the Securities Exchange Act of 1934 and Regulation FD thereunder, and charges Womack, Evans and Black with aiding and abetting these violations. The complaint seeks permanent injunctive relief and civil monetary penalties against each defendant. The SEC's investigation was conducted by George N. Stepaniuk, Thomas Peirce, and David Zetlin-Jones of the SEC's New York Regional Office. The SEC's litigation will be conducted by Alexander M. Vasilescu, Victor Suthammanont, and Mr. Zetlin-Jones. The case is being supervised by Sanjay Wadhwa."""

summary = fct.summarize(text, "legal-pegasus")

print(summary)

# BM25

In [None]:
# Définition des chemins vers les fichiers JSON d'entraînement et de développement
train_path = 'SCOTUS/train.json'
dev_path = 'SCOTUS/dev.json'

# Ouverture du fichier d'entraînement
train = fct.open_file(train_path, "json")

# Récupération du document source du premier élément de l'ensemble d'entraînement
# document = train[0]["raw_source"]
document = train

# Récupération des éléments de la cible (faits, question, conclusion)
paragraph_target = (
    train[0]['raw_target']['facts_of_the_case'] +
    train[0]['raw_target']['question'] +
    train[0]['raw_target']['conclusion']
)

train_path = 'data_txt_save/train_0.txt'
train = fct.open_file(train_path, "txt")

document = train


# Segmentation des phrases du document source
sentences = fct.sent_segmentation(document, method='custom_spacy')

# Résumé des phrases en utilisant le modèle BERT
summary = fct.bb25LegalSum(sentences, "bert-base-uncased", 5)

# Évaluation de la qualité du résumé à l'aide de la métrique ROUGE
bb25_rouge = fct.rouge_evaluations(" ".join(summary), paragraph_target)

In [None]:
bb25_rouge

In [None]:
# Définition des chemins vers les fichiers JSON d'entraînement et de développement
train_path_json = 'SCOTUS/train.json'
dev_path_json = 'SCOTUS/dev.json'

# Ouverture du fichier d'entraînement JSON
train_json = fct.open_file(train_path_json, "json")

# Récupération du document source et des éléments de la cible (faits, question, conclusion)
document_json = train_json[0]["raw_source"]
paragraph_target_json = (
    train_json[0]['raw_target']['facts_of_the_case'] +
    train_json[0]['raw_target']['question'] +
    train_json[0]['raw_target']['conclusion']
)

# Définir le chemin du fichier texte à traiter
train_path_txt = 'data_txt_save/train_0.txt'

# Ouverture du fichier d'entraînement TXT
document_txt = fct.open_file(train_path_txt, "txt")

# Liste des méthodes de segmentation et des modèles à tester
methods = ['nltk', 'spacy', 'pySBD']
model = "bert-base-uncased"  # Modèle de résumé à utiliser
results = {}  # Dictionnaire pour stocker les résultats

# Fonction pour évaluer les modèles
def evaluate_models(document, paragraph_target, file_type):
    for method in methods:
        # Segmentation des phrases du document source
        sentences = fct.sent_segmentation(document, method=method)
        
        # Résumé des phrases
        summary = fct.bb25LegalSum(sentences, model, 5)
        
        # Évaluation de la qualité du résumé à l'aide de la métrique ROUGE
        bb25_rouge = fct.rouge_evaluations(" ".join(summary), paragraph_target)
        
        # Stockage des résultats dans le dictionnaire
        results[f"{file_type} - {method}"] = bb25_rouge

# Évaluation des modèles pour le fichier JSON
evaluate_models(document_json, paragraph_target_json, "JSON")

# Évaluation des modèles pour le fichier TXT
evaluate_models(document_txt, paragraph_target_json, "TXT")  # Utilise le même paragraphe cible

# Affichage des résultats de comparaison
for method, score in results.items():
    print(f"Méthode: {method}, Score ROUGE: {score}")

In [16]:
import pandas as pd

# Exemple de données de score ROUGE pour chaque méthode
results = {
    "JSON - nltk": {
        "Metric": ["rouge1", "rouge2", "rougeL"],
        "Precision": [0.449275, 0.109091, 0.184783],
        "Recall": [0.038871, 0.009407, 0.015987],
        "F1-Score": [0.071552, 0.017321, 0.029429],
    },
    "JSON - spacy": {
        "Metric": ["rouge1", "rouge2", "rougeL"],
        "Precision": [0.094203, 0.025455, 0.076087],
        "Recall": [0.292135, 0.079545, 0.235955],
        "F1-Score": [0.142466, 0.038567, 0.115068],
    },
    "JSON - pySBD": {
        "Metric": ["rouge1", "rouge2", "rougeL"],
        "Precision": [0.086957, 0.010909, 0.050725],
        "Recall": [0.279070, 0.035294, 0.162791],
        "F1-Score": [0.132597, 0.016667, 0.077348],
    },
    "TXT - nltk": {
        "Metric": ["rouge1", "rouge2", "rougeL"],
        "Precision": [0.257246, 0.080000, 0.126812],
        "Recall": [0.467105, 0.145695, 0.230263],
        "F1-Score": [0.331776, 0.103286, 0.163551],
    },
    "TXT - spacy": {
        "Metric": ["rouge1", "rouge2", "rougeL"],
        "Precision": [0.271739, 0.090909, 0.126812],
        "Recall": [0.563910, 0.189394, 0.263158],
        "F1-Score": [0.366748, 0.122850, 0.171149],
    },
    "TXT - pySBD": {
        "Metric": ["rouge1", "rouge2", "rougeL"],
        "Precision": [0.474638, 0.167273, 0.202899],
        "Recall": [0.587444, 0.207207, 0.251121],
        "F1-Score": [0.525050, 0.185111, 0.224449],
    },
}

# Créer un DataFrame pour chaque méthode et les concaténer
dfs = []
for key, value in results.items():
    df = pd.DataFrame(value)
    df['Method'] = key
    dfs.append(df)

# Concaténer tous les DataFrames
final_df = pd.concat(dfs)

# Réorganiser les colonnes pour une meilleure lisibilité
final_df = final_df[['Method', 'Metric', 'Precision', 'Recall', 'F1-Score']]

# Affichage des résultats
print(final_df)

# Optionnel: Enregistrer le résultat dans un fichier CSV
final_df.to_csv('comparaison_models.csv', index=False)


         Method  Metric  Precision    Recall  F1-Score
0   JSON - nltk  rouge1   0.449275  0.038871  0.071552
1   JSON - nltk  rouge2   0.109091  0.009407  0.017321
2   JSON - nltk  rougeL   0.184783  0.015987  0.029429
0  JSON - spacy  rouge1   0.094203  0.292135  0.142466
1  JSON - spacy  rouge2   0.025455  0.079545  0.038567
2  JSON - spacy  rougeL   0.076087  0.235955  0.115068
0  JSON - pySBD  rouge1   0.086957  0.279070  0.132597
1  JSON - pySBD  rouge2   0.010909  0.035294  0.016667
2  JSON - pySBD  rougeL   0.050725  0.162791  0.077348
0    TXT - nltk  rouge1   0.257246  0.467105  0.331776
1    TXT - nltk  rouge2   0.080000  0.145695  0.103286
2    TXT - nltk  rougeL   0.126812  0.230263  0.163551
0   TXT - spacy  rouge1   0.271739  0.563910  0.366748
1   TXT - spacy  rouge2   0.090909  0.189394  0.122850
2   TXT - spacy  rougeL   0.126812  0.263158  0.171149
0   TXT - pySBD  rouge1   0.474638  0.587444  0.525050
1   TXT - pySBD  rouge2   0.167273  0.207207  0.185111
2   TXT - 