In [14]:
!pip install pandas transformers torch nltk spacy pysbd scikit-learn rank-bm25 rouge-score numpy json5 rouge bert_score



In [15]:
import pandas as pd
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import torch
import os
from nltk.tokenize import sent_tokenize
import spacy
import pysbd
from transformers import BertModel, BertTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from rouge_score import rouge_scorer
import pandas as pd
import numpy as np
import json
import re
from rouge import Rouge
import concurrent.futures


In [16]:
train_path = 'dataset_legal-pegasus/dataset/UK-Abs/train-data'
test_path = 'dataset_legal-pegasus/dataset/UK-Abs/test-data'

train_path_txt = train_path + '/judgement'
train_path_summary = train_path + '/summary'
test_path_txt = test_path + '/judgement'
test_path_summary = test_path + '/summary'

In [17]:
# Parcourir les fichiers dans le dossier de train et de test
train_files = os.listdir(train_path_txt)
test_files = os.listdir(test_path_txt)

# Charger le modèle Legal-Pegasus et le tokenizer
model_name = "nsi319/legal-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

In [18]:
def open_file(file_path, type):

    with open(file_path, 'r', encoding="utf-8") as f:
        if (type == "json"):
            return json.load(f)
        elif (type == "txt"):
            return f.read()

def sent_segmentation(document, method='nltk'):
    """Segmentation of the document as sentences using the specified method.

    Args:
        document (str): The document to segment.
        method (str): The method to use for segmentation ('nltk', 'spacy', 'custom_spacy' or 'pySBD').

    Returns:
        List[str]: A list of tokenized sentences.
    """
    if method == 'nltk':
        return sent_tokenize(document)
    elif method == 'spacy':
        nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        nlp.add_pipe("sentencizer")
        split_doc = split(document)
        sentences = []
        for chunk in split_doc:
            chunk = re.sub(r'\"', '', chunk)    # remove double quote because error
            doc = nlp(chunk)
            for sent in doc.sents:
                sentences.append(sent.text)
        return sentences
    elif method == 'custom_spacy':
        nlp = csp.custom_spacy_model()
        split_doc = split(document)
        sentences = []
        for chunk in split_doc:
            chunk = re.sub(r'\"', '', chunk)    # remove double quote because error
            doc = nlp(chunk)
            for sent in doc.sents:
                sentences.append(sent.text)
        return sentences
    elif method == 'pySBD':
        seg = pysbd.Segmenter(language="en", clean=False)
        return seg.segment(document)
    else:
        raise ValueError("Unsupported tokenization method. Choose 'nltk', 'spacy', or 'custom_spacy'.")

def summarize(text, model_name="legal-pegasus", min_length=150, max_length=250):
    """Return a summary"""

    if (model_name == "legal-pegasus"):
        tokenizer = AutoTokenizer.from_pretrained("nsi319/legal-pegasus")
        model = AutoModelForSeq2SeqLM.from_pretrained("nsi319/legal-pegasus")
        input_tokenized = tokenizer.encode(text, return_tensors='pt', max_length=1024, truncation=True)

        summary_ids = model.generate(input_tokenized,
                                    num_beams=9,
                                    no_repeat_ngram_size=3,
                                    length_penalty=2.0,
                                    min_length=min_length,
                                    max_length=max_length,
                                    early_stopping=True)

        return [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids][0]
    else:
        return "Model not available"

def bb25LegalSum(sentences, model_name="bert-base-uncased", n_clusters = 5):
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    sentence_embeddings = get_sentence_embeddings(sentences, tokenizer, model)

    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(sentence_embeddings)

    # Étiquettes des clusters
    labels = kmeans.labels_

    cluster = {}
    for i in range(n_clusters):
        #print(f"\nCluster {i+1}:")
        cluster[i] = []
        for j, sentence in enumerate(sentences):
            if labels[j] == i:
                #print(f"- {sentence}")
                cluster[i].append(sentence)

    silhouette_avg = silhouette_score(sentence_embeddings, labels)
    #print(f"\nSilhouette Score: {silhouette_avg}")

    tokenized_clusters = {}

    # Tokenisation des documents pour chaque cluster
    for i, sentences in cluster.items():
        tokenized_clusters[i] = [word_tokenize(sentence.lower()) for sentence in sentences]

    # Initialiser un modèle BM25 pour chaque cluster
    bm25_models = {}
    for i, tokenized_docs in tokenized_clusters.items():
        bm25_models[i] = BM25Okapi(tokenized_docs)

    query = "law and legal rights"
    tokenized_query = word_tokenize(query.lower())

    for cluster_id, bm25 in bm25_models.items():
        # Calcul des scores pour la requête dans chaque cluster
        scores = bm25.get_scores(tokenized_query)


    best_sentences = []

    for cluster_id, bm25 in bm25_models.items():
            # Récupérer les phrases les plus pertinentes pour la requête dans ce cluster
            top_docs = bm25.get_top_n(tokenized_query, tokenized_clusters[cluster_id], n=2)

            # Extraire les phrases pertinentes et les ajouter à la liste
            for doc in top_docs:
                sentence = ' '.join(doc)  # Convertir le tokenized doc en phrase
                best_sentences.append(sentence)

    return best_sentences



def get_sentence_embeddings(sentences, tokenizer, model):
    """Obtenir les embeddings de phrases avec BERT
    Args:
        sentences (List[str]): Liste des phrases à encoder
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer BERT
        model (transformers.PreTrainedModel): Modèle BERT
    Returns:
        np.array: Tableau des embeddings de phrases
    """
    embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
        outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy())  # Moyenne des embeddings
    return np.array(embeddings)

def split(text, max_length=3530):
    split_text = text.split('\n')
    result = []

    for chunk in split_text:
        while len(chunk) > max_length:
            sub_chunk = chunk[:max_length]
            last_period_position = sub_chunk.rfind('.')

            if last_period_position == -1:
                last_period_position = max_length

            if chunk[:last_period_position+1].strip():
                result.append(chunk[:last_period_position+1].strip())
            chunk = chunk[last_period_position+1:].strip()

        if chunk and chunk.strip():
            result.append(chunk.strip())

    return result


def evaluation(text, ref):
    rouges = rouge_evaluations(text, ref)

def rouge_evaluations(text, ref):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(text, ref)

    return rouge_to_df(scores)

def rouge_to_df(scores):

    data = {
        'Metric': [],
        'Precision': [],
        'Recall': [],
        'F1-Score': []
    }

    for metric, score in scores.items():
        data['Metric'].append(metric)
        data['Precision'].append(score.precision)
        data['Recall'].append(score.recall)
        data['F1-Score'].append(score.fmeasure)

    return pd.DataFrame(data)

In [19]:
# Fonction pour diviser le document en segments
def chunk_text(text, chunk_size=1024):
    inputs = tokenizer(text, return_tensors="pt", max_length=chunk_size, truncation=True, padding=True)
    chunks = []
    current_chunk = []
    current_length = 0

    for i, word in enumerate(text.split()):
        if current_length + len(word) <= chunk_size:
            current_chunk.append(word)
            current_length += len(word) + 1
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
            current_length = len(word) + 1

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [20]:
# reference_summary = open_file(train_path_txt + '/' + train_files[0], 'txt')
# print(len(reference_summary))

# # Générer le résumé
# chunks = chunk_text(reference_summary)
# print(len(chunks))

# generated_summary = [summarize(chunk, "legal-pegasus", max_length=1000) for chunk in chunks[:2]]
# # generated_summary = summarize(chunks[0], "legal-pegasus", max_length=1000)

# print("flag 2")

# # print("Résumé de référence : ", reference_summary)
# # print("Résumé généré : ", generated_summary)


: 

In [7]:
# Fonction pour traiter un chunk
def summarize_chunk(chunk):
    return summarize(chunk, "legal-pegasus", max_length=1000)

# Charger le fichier de référence
reference_summary = open_file(train_path_txt + '/' + train_files[0], 'txt')
print(f"Nombre de tokens dans le résumé de référence: {len(reference_summary)}")

# Diviser le texte en chunks
chunks = chunk_text(reference_summary)
print(f"Nombre de chunks: {len(chunks)}")

# Utiliser ThreadPoolExecutor pour le traitement parallèle
with concurrent.futures.ThreadPoolExecutor() as executor:
    generated_summary = list(executor.map(summarize_chunk, chunks))

print("Flag 2 - Traitement par lots terminé")

Nombre de tokens dans le résumé de référence: 279249
Nombre de chunks: 274


  return torch.load(checkpoint_file, map_location=map_location)


In [8]:
# Fonction pour calculer les scores ROUGE
def compute_rouge_scores(reference_summary, generated_summary):
    print("flag 3")
    rouge = Rouge()

    # Calcul des scores ROUGE-1, ROUGE-2 et ROUGE-L
    scores = rouge.get_scores(generated_summary, reference_summary)
    print("flag3-1")


    # S'assurer que des scores ont été calculés avant de procéder
    if len(scores) == 0:
        return {"error": "No scores could be computed. Check your input summaries."}

    # Calcul de la moyenne des scores
    avg_scores = {
        'rouge-1': {
            'precision': sum([score['rouge-1']['p'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
            'recall': sum([score['rouge-1']['r'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
            'f1-score': sum([score['rouge-1']['f'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
        },
        'rouge-2': {
            'precision': sum([score['rouge-2']['p'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
            'recall': sum([score['rouge-2']['r'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
            'f1-score': sum([score['rouge-2']['f'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
        },
        'rouge-l': {
            'precision': sum([score['rouge-l']['p'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
            'recall': sum([score['rouge-l']['r'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
            'f1-score': sum([score['rouge-l']['f'] for score in scores]) / len(scores) if len(scores) > 0 else 0,
        }
    }

    return avg_scores


In [9]:
generated_summary = ' '.join(generated_summary)

In [10]:
rouge_scores = compute_rouge_scores(reference_summary, generated_summary)

print("flag 4")

# Vérification et affichage des scores
if "error" in rouge_scores:
    print(rouge_scores["error"])
else:
    print("ROUGE-1: ", rouge_scores['rouge-1'])
    print("ROUGE-2: ", rouge_scores['rouge-2'])
    print("ROUGE-L: ", rouge_scores['rouge-l'])

flag 3
flag3-1
flag 4
ROUGE-1:  {'precision': 0.9477124183006536, 'recall': 0.027756508422664625, 'f1-score': 0.053933419569847665}
ROUGE-2:  {'precision': 0.8764478764478765, 'recall': 0.01004291465734637, 'f1-score': 0.019858279891465463}
ROUGE-L:  {'precision': 0.9477124183006536, 'recall': 0.027756508422664625, 'f1-score': 0.053933419569847665}


In [11]:
print(len(generated_summary))
print(len(reference_summary))

1685
279249


In [12]:
from bert_score import score

# Calculer le score BERTScore
P, R, F1 = score([generated_summary], [reference_summary], lang="en", verbose=True)

# Afficher les scores de Précision, Rappel et F1
print(f"Précision (P): {P.mean().item():.4f}")
print(f"Rappel (R): {R.mean().item():.4f}")
print(f"F1-Score: {F1.mean().item():.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.74 seconds, 0.58 sentences/sec
Précision (P): 0.9357
Rappel (R): 0.8927
F1-Score: 0.9137
