In [1]:
# %pip install bm25s
# %pip install spacy
# %pip install -U 'spacy[cuda12x]'
# %pip install rouge_score

In [None]:
import functions as fct
import custom_spacy as csp
import spacy
import bm25s
import numpy as np

In [3]:
# Relative paths to the SCOTUS train and dev JSON files.
train_path, dev_path = 'SCOTUS/train.json', 'SCOTUS/dev.json'

# Summarization with legal-pegasus

In [None]:
text = """On March 5, 2021, the Securities and Exchange Commission charged AT&T, Inc. with repeatedly violating Regulation FD, and three of its Investor Relations executives with aiding and abetting AT&T's violations, by selectively disclosing material nonpublic information to research analysts. According to the SEC's complaint, AT&T learned in March 2016 that a steeper-than-expected decline in its first quarter smartphone sales would cause AT&T's revenue to fall short of analysts' estimates for the quarter. The complaint alleges that to avoid falling short of the consensus revenue estimate for the third consecutive quarter, AT&T Investor Relations executives Christopher Womack, Michael Black, and Kent Evans made private, one-on-one phone calls to analysts at approximately 20 separate firms. On these calls, the AT&T executives allegedly disclosed AT&T's internal smartphone sales data and the impact of that data on internal revenue metrics, despite the fact that internal documents specifically informed Investor Relations personnel that AT&T's revenue and sales of smartphones were types of information generally considered "material" to AT&T investors, and therefore prohibited from selective disclosure under Regulation FD. The complaint further alleges that as a result of what they were told on these calls, the analysts substantially reduced their revenue forecasts, leading to the overall consensus revenue estimate falling to just below the level that AT&T ultimately reported to the public on April 26, 2016. The SEC's complaint, filed in federal district court in Manhattan, charges AT&T with violations of the disclosure provisions of Section 13(a) of the Securities Exchange Act of 1934 and Regulation FD thereunder, and charges Womack, Evans and Black with aiding and abetting these violations. The complaint seeks permanent injunctive relief and civil monetary penalties against each defendant. The SEC's investigation was conducted by George N. 
Stepaniuk, Thomas Peirce, and David Zetlin-Jones of the SEC's New York Regional Office. The SEC's litigation will be conducted by Alexander M. Vasilescu, Victor Suthammanont, and Mr. Zetlin-Jones. The case is being supervised by Sanjay Wadhwa."""

# Summarize the sample text through the project helper in `functions`.
# NOTE(review): the second argument presumably selects the legal-pegasus
# summarization model — confirm against fct.summarize.
summary = fct.summarize(text, "legal-pegasus")

print(summary)

# BM25 with custom_spacy

In [None]:
# Custom spaCy pipeline adapted for sentence segmentation of legal text
nlp = csp.custom_spacy_model()

# Segment the source text; its sentences form the BM25 corpus
doc = nlp(text)
corpus = [sent.text for sent in doc.sents]

# Build the BM25 index over the source sentences
retriever = bm25s.BM25(corpus=corpus)
retriever.index(bm25s.tokenize(corpus))

# bm25s raises when k exceeds the number of indexed documents, so clamp the
# number of retrieved neighbours for very short texts (previously a fixed k=2).
top_k = min(2, len(corpus))

# Segment the summary with the same pipeline
doc_sum = nlp(summary)

# For each summary sentence, print the mean BM25 score of its top_k closest
# source sentences
for sent in doc_sum.sents:
    query = sent.text
    results, scores = retriever.retrieve(bm25s.tokenize(query), k=top_k)
    avg_score = np.mean(scores)
    print(sent.text + " ; " + str(avg_score))


In [None]:
# Show the source text followed by its summary, one per line
print(text, summary, sep="\n")

In [None]:
# Character counts of the source text and of the summary, space-separated
print(f"{len(text)} {len(summary)}")

In [None]:
# Evaluate the summary against the source text with the project's helper.
# NOTE(review): the (text, summary) argument order is assumed to match the
# helper's (reference, candidate) convention — confirm in fct.rouge_evaluations.
score = fct.rouge_evaluations(text, summary)
print(score)

# BERT

In [None]:
import nltk
from transformers import BertModel, BertTokenizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from nltk.tokenize import sent_tokenize
import numpy as np
import json

In [10]:
# Load the pretrained BERT model and its tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

In [11]:
# Example legal text (kept for reference)
# document = """
# The law guarantees the fundamental rights and freedoms of every citizen.
# The judge rendered his verdict based on the 2023 law.
# The rights of citizens must be respected in all legal proceedings.
# The 1958 constitution is the basis of the current legal system.
# Every individual has the right to a fair trial under applicable laws.
# """

# Read the training file (text mode is the default for open)
with open(train_path, encoding="utf-8") as train_file:
    train = json.load(train_file)

# Work on the "raw_source" field of the first record
document = train[0]["raw_source"]

In [12]:
# Make sure the Punkt sentence-tokenizer data is installed before first use:
# on a fresh environment sent_tokenize raises LookupError without it, and the
# notebook's only other download call sits in a later cell. Recent NLTK
# releases load the "punkt_tab" resource instead of "punkt", so request both;
# quiet=True suppresses the downloader's console output.
for _resource in ("punkt", "punkt_tab"):
    nltk.download(_resource, quiet=True)

# Split the document into sentences
sentences = sent_tokenize(document)

# Fonction pour obtenir les embeddings de phrases avec BERT
def get_sentence_embeddings(sentences, tokenizer, model):
    """Encode each sentence as a mean-pooled BERT embedding.

    Every sentence is tokenized and passed through the model on its own; the
    token vectors of the last hidden layer are then averaged into one vector.

    Args:
        sentences (List[str]): Sentences to encode.
        tokenizer (transformers.PreTrainedTokenizer): BERT tokenizer.
        model (transformers.PreTrainedModel): BERT model.

    Returns:
        np.ndarray: One embedding row per sentence, in input order.
    """
    import torch  # local import: the notebook does not import torch at top level

    embeddings = []
    # Inference only: without no_grad each forward pass records an autograd
    # graph, which costs memory and time for no benefit.
    with torch.no_grad():
        for sentence in sentences:
            inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
            outputs = model(**inputs)
            # Average over the token axis. Each call encodes a single sentence,
            # so there are no padding tokens to skew the mean.
            pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
            # .cpu() keeps the NumPy conversion valid if the model lives on a GPU.
            embeddings.append(pooled.detach().cpu().numpy())
    return np.array(embeddings)

# Compute the embeddings of the document's sentences
sentence_embeddings = get_sentence_embeddings(sentences, tokenizer, model)

# K-Means

In [None]:
# Group similar sentences with K-Means
n_clusters = 5  # desired number of clusters (can be tuned)

# fit() returns the estimator itself, so construction and fitting can be chained
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(sentence_embeddings)

# Cluster label assigned to each sentence
labels = kmeans.labels_

In [None]:
# Map each cluster index to the sentences assigned to it, printing the
# clusters (numbered from 1) as they are built.
cluster = {}
for cluster_idx in range(n_clusters):
    # Sentences whose K-Means label equals this cluster index, in document order
    members = [
        sentences[pos]
        for pos in range(len(sentences))
        if labels[pos] == cluster_idx
    ]
    cluster[cluster_idx] = members

    print(f"\nCluster {cluster_idx+1}:")
    for member in members:
        print(f"- {member}")

# Show the keys of the `cluster` dictionary
print(cluster.keys())

In [None]:
# Compute the silhouette score to assess the quality of the clustering
silhouette_avg = silhouette_score(sentence_embeddings, labels)
print(f"\nSilhouette Score: {silhouette_avg}")

# BM25 retrieval within each K-Means cluster (rank-bm25)

In [16]:
# Install the library if needed
# %pip install rank-bm25

In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize

# Download NLTK's 'punkt' resource for the NLTK tokenizers used in this notebook.
# NOTE(review): sent_tokenize is called in an earlier cell — make sure the
# resource is available before that cell runs on a fresh environment.
nltk.download('punkt')

In [18]:
# Example documents (legal texts)
# documents = [
#     "a cat is a feline and likes to purr",
#     "a dog is the human's best friend and loves to play",
#     "a bird is a beautiful animal that can fly",
#     "a fish is a creature that lives in water and swims"
# ]

In [19]:
# Tokenized sentences for each cluster. The loop variable is deliberately NOT
# named `sentences`: that would overwrite the notebook-level list of all
# document sentences and corrupt any re-run of the cells that pair it with
# the K-Means labels.
tokenized_clusters = {
    cluster_id: [word_tokenize(sentence.lower()) for sentence in cluster_sentences]
    for cluster_id, cluster_sentences in cluster.items()
}

# One BM25 model per cluster. Empty clusters are skipped because BM25Okapi
# divides by the corpus size and fails on an empty corpus.
bm25_models = {
    cluster_id: BM25Okapi(tokenized_docs)
    for cluster_id, tokenized_docs in tokenized_clusters.items()
    if tokenized_docs
}

In [20]:
# Example query, lowercased before tokenization like the indexed sentences
query = "law and legal rights"
tokenized_query = word_tokenize(query.lower())

In [None]:
# BM25 scores of the query against every sentence, cluster by cluster
for cluster_id in bm25_models:
    bm25 = bm25_models[cluster_id]
    # One score per sentence indexed in this cluster
    scores = bm25.get_scores(tokenized_query)
    print(f"Scores pour la requête dans le cluster {cluster_id}: {scores}")

In [None]:
for cluster_id, bm25 in bm25_models.items():
    # Rank against the original sentences rather than their token lists:
    # get_top_n returns items from the list it is given, so this prints each
    # sentence as written instead of a lowercased, space-joined reconstruction.
    # The loop variable also no longer reuses the name `doc`, which holds the
    # spaCy document in an earlier cell.
    # n=1 can be raised to show more sentences per cluster.
    top_sentences = bm25.get_top_n(tokenized_query, cluster[cluster_id], n=1)
    print(f"\nCluster {cluster_id}:")
    for top_sentence in top_sentences:
        print(top_sentence)