In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the libraries used by this notebook into the kernel's environment.
# NOTE(review): versions are unpinned, so a fresh run may pull releases whose
# APIs differ from the ones this notebook was written against - consider
# pinning (e.g. `transformers==X.Y.Z`) once a working combination is known.
%pip install transformers datasets torch



In [None]:
# Root folders of the train and evaluation splits (currently the same Drive folder).
train_path = '/content/drive/MyDrive/SCOTU_data_txt_save'
test_path = '/content/drive/MyDrive/SCOTU_data_txt_save'

# Each split keeps its documents and its summaries in two sibling sub-folders.
train_path_txt, train_path_summary = (
    f'{train_path}/{sub_dir}' for sub_dir in ('text', 'summary')
)
test_path_txt, test_path_summary = (
    f'{test_path}/{sub_dir}' for sub_dir in ('text_dev', 'summary_dev')
)

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Entraînement sur : {device}")

Entraînement sur : cuda


In [None]:
import os
from datasets import Dataset

def load_data(txt_path, summary_path, limit=3):
    """Build a Hugging Face Dataset pairing source texts with their summaries.

    Each directory is listed, reduced to regular files, sorted by file name and
    truncated to ``limit`` entries; the i-th text is then paired with the i-th
    summary. Sorting matters because ``os.listdir`` returns names in arbitrary
    order, which would otherwise make both the pairing and the selected subset
    non-deterministic.

    NOTE(review): pairing by sorted position assumes that a document and its
    summary sort to the same rank in their folders (e.g. they share a file
    name) - confirm against the data layout.

    Args:
        txt_path: Directory holding the source documents (UTF-8 text files).
        summary_path: Directory holding the summaries (UTF-8 text files).
        limit: Maximum number of files read from each directory. Defaults to
            3, the value that used to be hard-coded; pass ``None`` to read
            every file.

    Returns:
        A ``Dataset`` with two columns, ``"text"`` and ``"summary"``.

    Raises:
        ValueError: If the two directories do not yield the same number of
            files, since the columns could not be paired.
    """

    def _read_dir(dir_path):
        """Return the contents of the first ``limit`` files of ``dir_path``, in name order."""
        file_names = sorted(
            name for name in os.listdir(dir_path)
            # Skip sub-directories such as `.ipynb_checkpoints`, which cannot be opened as text.
            if os.path.isfile(os.path.join(dir_path, name))
        )[:limit]
        contents = []
        for file_name in file_names:
            with open(os.path.join(dir_path, file_name), 'r', encoding='utf-8') as f:
                contents.append(f.read())
        return contents

    texts = _read_dir(txt_path)
    summaries = _read_dir(summary_path)

    if len(texts) != len(summaries):
        raise ValueError(
            f"Found {len(texts)} text files in {txt_path!r} but "
            f"{len(summaries)} summary files in {summary_path!r}"
        )

    # Assemble the two aligned columns into a Dataset.
    data = {"text": texts, "summary": summaries}
    return Dataset.from_dict(data)

# Build the training and evaluation datasets from their respective folders.
train_dataset = load_data(txt_path=train_path_txt, summary_path=train_path_summary)
test_dataset = load_data(txt_path=test_path_txt, summary_path=test_path_summary)

# Sanity check: show the first training example.
print(train_dataset[0])


{'text': 'OPINION OF THE COURTOREGON V. GUZEK546 U. S. ____ (2006)SUPREME COURT OF THE UNITED STATESNO. 04-928 \n  OREGON, PETITIONER  v.  RANDY LEE GUZEK\n  on writ of certiorari to the supreme court of oregon\n  [February 22, 2006]\n  \xa0\xa0\xa0Justice Breyer delivered the opinion of the Court.\n  \xa0\xa0\xa0Respondent Randy Lee Guzek was found guilty of capital murder and sentenced to death. On appeal, the Oregon Supreme Court affirmed the conviction but vacated the sentence and ordered a new sentencing proceeding. The question before the Court is whether the State may limit the innocence-related evidence he can introduce at that proceeding to the evidence he introduced at his original trial. We hold that the limitation does not violate the Constitution.\n  I\n  \xa0\xa0\xa0Oregon tried Guzek for the offense of capital murder. The evidence showed that Guzek and two associates decided to burglarize the Houser family home, that they entered the house, that an associate killed Rod H

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Checkpoint used for both the tokenizer and the model weights.
model_name = "nsi319/legal-pegasus"
tokenizer = PegasusTokenizer.from_pretrained(model_name)

# Load the weights, move them to the selected device, then turn on
# gradient checkpointing on the model.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)
model.gradient_checkpointing_enable()

In [None]:
def preprocess_function(examples):
    """Tokenize documents and their reference summaries for seq2seq training.

    Texts are truncated/padded to 1024 tokens and summaries to 256 tokens.
    Because the summaries are padded to a fixed length, the padding positions
    in the labels are replaced with -100 so that they are ignored by the loss
    instead of training the model to predict pad tokens.

    Args:
        examples: Mapping with ``"text"`` and ``"summary"`` entries; either a
            batch (lists of strings, as passed by ``map(batched=True)``) or a
            single example (plain strings).

    Returns:
        The tokenizer output for the texts, with an added ``"labels"`` entry
        holding the masked summary token ids.
    """
    inputs = tokenizer(examples["text"], max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(examples["summary"], max_length=256, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        """Replace pad token ids with -100 in one sequence of label ids."""
        return [token_id if token_id != pad_id else -100 for token_id in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], list):
        # Batched call: one list of token ids per example.
        inputs["labels"] = [_mask_padding(ids) for ids in label_ids]
    else:
        # Single-example call: a flat list of token ids.
        inputs["labels"] = _mask_padding(label_ids)
    return inputs

# Tokenize both splits, processing the examples in batches.
train_tokenized = train_dataset.map(function=preprocess_function, batched=True)
test_tokenized = test_dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyperparameters: evaluate and checkpoint once per epoch,
# keep at most two checkpoints, and disable external experiment trackers.
training_args = TrainingArguments(
    output_dir="./legal-pegasus",
    eval_strategy="epoch",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=5e-5,
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    weight_decay=0.01,
    run_name="Legal-Pegasus-Training",
    report_to=[],
)

# Trainer setup. The tokenizer is passed as `processing_class`: the former
# `tokenizer=` keyword is deprecated in recent transformers releases.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=test_tokenized,
    processing_class=tokenizer,
)

# Release cached GPU memory and report allocator statistics before training.
# Guarded so the notebook still runs when it fell back to the CPU; the summary
# is printed because `memory_summary` only returns a string.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print(torch.cuda.memory_summary(device=None, abbreviated=False))

# Launch training
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,3.012164
2,No log,2.956009
3,No log,2.936817




TrainOutput(global_step=9, training_loss=2.9261622958713107, metrics={'train_runtime': 256.0499, 'train_samples_per_second': 0.035, 'train_steps_per_second': 0.035, 'total_flos': 26005179727872.0, 'train_loss': 2.9261622958713107, 'epoch': 3.0})

In [None]:
# Write the fine-tuned model and its tokenizer files to the same Drive folder.
save_dir = "/content/drive/MyDrive/legal-pegasus-model-Scopus"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./legal-pegasus-model-Scopus/tokenizer_config.json',
 './legal-pegasus-model-Scopus/special_tokens_map.json',
 './legal-pegasus-model-Scopus/spiece.model',
 './legal-pegasus-model-Scopus/added_tokens.json')