In [28]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [29]:
%pip install rouge-score bert_score



In [30]:
import os
import torch
from tqdm import tqdm
from rouge_score import rouge_scorer
from bert_score import BERTScorer

# Vérification de la disponibilité du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Test sur : {device}")

Test sur : cuda


In [31]:
model_path = "/content/drive/MyDrive/legal-pegasus-model-paper"

test_path = '/content/drive/MyDrive/dataset_legal-pegasus/dataset/UK-Abs/test-data'

test_path_txt = test_path + '/judgement'
test_path_summary = test_path + '/summary/full'

In [32]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Charger le modèle et le tokenizer
model_name = "nsi319/legal-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model.to(device)

# model = PegasusForConditionalGeneration.from_pretrained(model_path)
# tokenizer = PegasusTokenizer.from_pretrained(model_path)

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0-15): 16 x PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): ReLU()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_no

In [33]:
import os

def load_test_data(txt_path, summary_path):
    """Charge les fichiers texte et résumés pour le jeu de test."""
    texts = [open(os.path.join(txt_path, file_name), 'r', encoding='utf-8').read()
             for file_name in os.listdir(txt_path)]
    summaries = [open(os.path.join(summary_path, file_name), 'r', encoding='utf-8').read()
                 for file_name in os.listdir(summary_path)]
    return texts, summaries

In [34]:
def chunk_text(text, chunk_size=1024, tokenizer=None):
    """
    Divise le texte en segments avec un maximum de chunk_size tokens.
    """
    if tokenizer is None:
        raise ValueError("Un tokenizer doit être fourni.")

    # Tokenisation complète du texte
    tokenized_text = tokenizer(text, return_tensors="pt", truncation=False, padding=False).input_ids[0]

    # Découper le texte en segments
    chunks = []
    for i in range(0, len(tokenized_text), chunk_size):
        chunk = tokenized_text[i:i + chunk_size]  # Sélectionner un segment de tokens
        chunks.append(tokenizer.decode(chunk, skip_special_tokens=True))  # Décoder le segment en texte

    return chunks

In [35]:
def generate_summary(model, tokenizer, text, max_input_length=1024, max_output_length=256):
    """Génère un résumé pour un texte donné."""
    inputs = tokenizer(text, max_length=max_input_length, truncation=True, return_tensors="pt", padding="longest").input_ids
    inputs = inputs.to(model.device)

    output_ids = model.generate(inputs, max_length=max_output_length, num_beams=5, length_penalty=2.0, early_stopping=True)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [36]:
def generate_full_summary(model, tokenizer, text, max_input_length=1024, max_output_length=256):
    """Génère un résumé complet pour un texte en traitant chaque segment."""
    chunks = chunk_text(text, chunk_size=max_input_length, tokenizer=tokenizer)
    chunk_summaries = [generate_summary(model, tokenizer, chunk, max_input_length, max_output_length) for chunk in chunks]

    return ' '.join(chunk_summaries)

In [37]:
def evaluate_model(model, tokenizer, texts, references, max_input_length=1024, max_output_length=256):
    """Évalue les performances du modèle sur les données de test."""
    rouge_scorer_instance = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    bert_scorer_instance = BERTScorer(lang="en", rescale_with_baseline=True)

    rouge_scores, bert_scores, summaries = [], [], []

    for text, reference in zip(texts, references):
        generated_summary = generate_full_summary(model, tokenizer, text, max_input_length, max_output_length)

        summaries.append(generated_summary)

        # Calcul des scores ROUGE
        rouge_scores.append(rouge_scorer_instance.score(reference, generated_summary))

        # Calcul des scores BERT
        bert_scores.append(bert_scorer_instance.score([generated_summary], [reference]))

        break

    avg_rouge = {
        'rouge1': sum(s['rouge1'].fmeasure for s in rouge_scores) / len(rouge_scores),
        'rouge2': sum(s['rouge2'].fmeasure for s in rouge_scores) / len(rouge_scores),
        'rougeL': sum(s['rougeL'].fmeasure for s in rouge_scores) / len(rouge_scores)
    }

    avg_bert_score = sum(s[2].mean().item() for s in bert_scores) / len(bert_scores)

    return {**avg_rouge, 'bert_score': avg_bert_score}, summaries

In [38]:
# Charger les données
test_path_txt = test_path + '/judgement'
test_path_summary = test_path + '/summary/full'
texts, summaries = load_test_data(test_path_txt, test_path_summary)

In [39]:
# Évaluer le modèle
results, summaries_gen = evaluate_model(model, tokenizer, texts, summaries)
print("Scores ROUGE et BERT :", results)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (3354 > 1024). Running this sequence through the model will result in indexing errors


Scores ROUGE et BERT : {'rouge1': 0.3962362780972295, 'rouge2': 0.0826792255363684, 'rougeL': 0.1975953998954522, 'bert_score': -0.12597428262233734}


In [40]:
# import pandas as pd

# df_results = pd.DataFrame({"Text": texts, "Reference": summaries, "Generated": [generate_summary(model, tokenizer, t) for t in texts]})
# df_results.to_csv("/content/drive/MyDrive/results_legal-pegasus.csv", index=False)

In [41]:
summaries_gen[0]

'The question at issue is what connection must a foreign company have with the United Kingdom to entitle an English court to wind it up, if its centre of main interests (or COMI) is in another member state of the European Union. The answer depends on the meaning of two words, economic activity, in EU Regulation 1346/2000 on Insolvency Proceedings. The English court has jurisdiction under its domestic law to wind up a foreign company. However, in the case of companies whose COMI is in another member state of the EU, the exercise of this power is constrained by the Regulation. The effect of those proceedings shall be restricted to the assets of the debtor situated in the territory of the latter Member State. The question arises whether Olympic had an establishment in the United Kingdom on 20 July 2010 so as to justify the presentation of a winding up petition on that date. Olympic is the principal employer in the pension scheme and the only employer currently participating in it. Under t

In [42]:
texts[0]

"The question at issue on this appeal is what connection must a foreign company have with the United Kingdom to entitle an English court to wind it up, if its centre of main interests (or COMI) is in another member state of the European Union.\nThe answer depends on the meaning of two words, economic activity, in EU Regulation 1346/2000 on Insolvency Proceedings.\nThe legal framework\nUnder section 221 of the Insolvency Act 1986, the English court has jurisdiction under its domestic law to wind up a foreign company.\nHowever, in the case of companies whose COMI is in another member state of the EU, the exercise of this power is constrained by the Regulation.\nArticle 3 of the Regulation provides as follows: Article 3 International jurisdiction 1.\nThe courts of the Member State within the territory of which the centre of a debtor's main interests is situated shall have jurisdiction to open insolvency proceedings.\nIn the case of a company or legal person, the place of the registered of