In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%pip install transformers datasets torch



In [3]:
train_path = '/content/drive/MyDrive/dataset_legal-pegasus/dataset/UK-Abs/train-data'
test_path = '/content/drive/MyDrive/dataset_legal-pegasus/dataset/UK-Abs/test-data'

train_path_txt = train_path + '/judgement'
train_path_summary = train_path + '/summary'
test_path_txt = test_path + '/judgement'
test_path_summary = test_path + '/summary/full'

In [4]:
import torch

# Vérification de la disponibilité du GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Entraînement sur : {device}")

Entraînement sur : cuda


In [5]:
import os
from datasets import Dataset

def load_data(txt_path, summary_path):
    """Charge les fichiers texte et résumés pour créer un Dataset."""
    texts = []
    summaries = []

    # Charger les fichiers de texte
    for file_name in os.listdir(txt_path):
        with open(os.path.join(txt_path, file_name), 'r', encoding='utf-8') as f:
            texts.append(f.read())

    # Charger les fichiers de résumés
    for file_name in os.listdir(summary_path):
        with open(os.path.join(summary_path, file_name), 'r', encoding='utf-8') as f:
            summaries.append(f.read())

    # Créer un Dataset Hugging Face
    data = {"text": texts, "summary": summaries}
    return Dataset.from_dict(data)

# Charger les données
train_dataset = load_data(train_path_txt, train_path_summary)
test_dataset = load_data(test_path_txt, test_path_summary)

# Vérifier les exemples
print(train_dataset[0])


{'text': 'Mr David Price, the Solicitor Advocate acting for the defendants, who are the appellants, at one point described the claim as a storm in a teacup.\nHe was correct, but the storm is considerable.\nIt involves consideration of one of the most difficult areas of the law of defamation, the defence of fair comment.\nOver 40 years ago Diplock LJ in Slim v Daily Telegraph Ltd [1968] 2 QB 157, 171 referred to the artificial and archaic character of the tort of libel.\nSome 20 years on Parker LJ in Brent Walker Group plc v Time Out Ltd [1991] 2 QB 33, 46 commented on the absurdity of the tangled web of the law of defamation.\nLittle has occurred in the last twenty years to unravel the tangle, and this is particularly true of the defence of fair comment.\nStatutory reform has been proposed in the form of Lord Lester of Herne Hills private members Defamation Bill and the Ministry of Justice has undertaken to publish a draft Defamation Bill early next year.\nBoth Mr Price, and Mr Caldeco

In [6]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Charger le modèle et le tokenizer
model_name = "nsi319/legal-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model.to(device)
model.gradient_checkpointing_enable()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [7]:
def preprocess_function(examples):
    """Tokenisation des entrées et des cibles."""
    inputs = tokenizer(examples["text"], max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(examples["summary"], max_length=256, truncation=True, padding="max_length")
    inputs["labels"] = labels["input_ids"]
    return inputs

# Préparer les données
train_tokenized = train_dataset.map(preprocess_function, batched=True)
test_tokenized = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/693 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [8]:
from transformers import TrainingArguments, Trainer

# Arguments d'entraînement
training_args = TrainingArguments(
    output_dir="./legal-pegasus",  # Dossier de sortie
    eval_strategy="epoch", # Évaluer après chaque époque
    per_device_train_batch_size=1,  # Batch size pour l'entraînement
    per_device_eval_batch_size=1,  # Batch size pour l'évaluation
    learning_rate=5e-5,
    num_train_epochs=3,  # Nombre d'époques
    save_strategy="epoch",  # Sauvegarder le modèle après chaque époque
    save_total_limit=2,  # Limiter le nombre de checkpoints
    logging_dir="./logs",  # Dossier pour les logs
    logging_steps=100,
    weight_decay=0.01,
    run_name="Legal-Pegasus-Training",  # Nom du run pour Wandb
    report_to=[],  # Désactiver Wandb si nécessaire
)

# Configuration du Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    tokenizer=tokenizer,
)

torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)

# Lancer l'entraînement
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,3.2002,3.040736
2,3.0225,2.985985
3,2.9451,2.978451




TrainOutput(global_step=2079, training_loss=3.1147259321850385, metrics={'train_runtime': 2901.5651, 'train_samples_per_second': 0.717, 'train_steps_per_second': 0.717, 'total_flos': 6007196517138432.0, 'train_loss': 3.1147259321850385, 'epoch': 3.0})

In [10]:
model.save_pretrained("./legal-pegasus-model")
tokenizer.save_pretrained("./legal-pegasus-model")

('./legal-pegasus-model/tokenizer_config.json',
 './legal-pegasus-model/special_tokens_map.json',
 './legal-pegasus-model/spiece.model',
 './legal-pegasus-model/added_tokens.json')