In [1]:
%pip install transformers

^C
Note: you may need to restart the kernel to use updated packages.




In [None]:
# Both splits live under the same base folder; only the sub-directory names differ.
train_path = test_path = '/content/drive/MyDrive/SCOTU_data_txt_save'

# Source documents and reference summaries for each split.
train_path_txt = f'{train_path}/text'
train_path_summary = f'{train_path}/summary'
test_path_txt = f'{test_path}/text_dev'
test_path_summary = f'{test_path}/summary_dev'

In [None]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Entraînement sur : {device}")

Entraînement sur : cuda


In [None]:
import os
from datasets import Dataset

def load_data(txt_path, summary_path):
    """Build a Hugging Face Dataset pairing each document with its summary.

    Files from both directories are read in sorted file-name order so that the
    i-th text lines up with the i-th summary. ``os.listdir`` on its own returns
    entries in arbitrary order, which can silently pair a text with the wrong
    summary.

    NOTE(review): this assumes a document and its summary sort to the same
    position in their directories (e.g. identical file names) — confirm
    against the data layout.

    Args:
        txt_path: Directory containing one UTF-8 file per source document.
        summary_path: Directory containing one UTF-8 file per summary.

    Returns:
        A ``Dataset`` with two columns, ``"text"`` and ``"summary"``.

    Raises:
        ValueError: If the two directories hold a different number of files.
    """
    def _read_all(directory):
        # Sorting gives a deterministic order that is the same for both directories.
        contents = []
        for file_name in sorted(os.listdir(directory)):
            with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
                contents.append(f.read())
        return contents

    texts = _read_all(txt_path)
    summaries = _read_all(summary_path)

    # Fail early with an explicit message instead of a low-level column-length error.
    if len(texts) != len(summaries):
        raise ValueError(
            f"Found {len(texts)} text files in {txt_path!r} but "
            f"{len(summaries)} summary files in {summary_path!r}"
        )

    return Dataset.from_dict({"text": texts, "summary": summaries})

# Build the training and evaluation splits from their text/summary folders
train_dataset = load_data(txt_path=train_path_txt, summary_path=train_path_summary)
test_dataset = load_data(txt_path=test_path_txt, summary_path=test_path_summary)

# Sanity check: show the first training example
print(train_dataset[0])

In [None]:
from transformers import BartTokenizer

# Load the tokenizer for the BART checkpoint used in this notebook
tokenizer = BartTokenizer.from_pretrained(
    pretrained_model_name_or_path="facebook/bart-large"
)

def preprocess_data(examples):
    """Tokenize a batch of documents and summaries for BART fine-tuning.

    Args:
        examples: Batch mapping with a ``"text"`` list and a ``"summary"``
            list, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the texts (truncated/padded to 1024 tokens)
        with an added ``"labels"`` entry: the summary token ids
        (truncated/padded to 256) where every padding id is replaced by -100.
    """
    # Plain Python lists are enough here: the dataset is switched to torch
    # tensors later via its output format.
    inputs = tokenizer(
        examples["text"], max_length=1024, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        examples["summary"], max_length=256, truncation=True, padding="max_length"
    )

    # -100 is the label value ignored by the loss; leaving the pad id in place
    # would train the model to predict padding.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return inputs

# Tokenize each split in batches, then expose only the model inputs as torch
# tensors (the remaining columns stay stored but are hidden from indexing).
train_dataset = train_dataset.map(preprocess_data, batched=True).with_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)
test_dataset = test_dataset.map(preprocess_data, batched=True).with_format(
    type="torch", columns=["input_ids", "attention_mask", "labels"]
)

In [None]:
from transformers import BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments

# Load the pretrained BART checkpoint and move it onto the selected device
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large").to(device)

# Training hyper-parameters
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=torch.cuda.is_available(),  # half precision only when a GPU is present
    # Evaluate once per epoch, generating summaries during evaluation
    evaluation_strategy="epoch",
    predict_with_generate=True,
)

# Wire model, arguments and data together
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Start fine-tuning
trainer.train()

In [None]:
from datasets import load_metric

# ROUGE scorer used by evaluate_model below.
# NOTE(review): load_metric is deprecated in recent datasets releases — confirm the installed version still ships it.
metric = load_metric(path="rouge")

def evaluate_model(model, tokenizer, dataset):
    """Generate a summary for every example and score the results with ROUGE.

    Relies on the notebook-level ``device`` and ``metric`` objects.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the source text and decode the
            generated ids.
        dataset: Iterable of examples with ``"text"`` and ``"summary"``
            fields. If it offers ``with_format`` (a Hugging Face ``Dataset``),
            a format-free copy is used so those raw columns are readable.

    Returns:
        The value returned by ``metric.compute`` for the collected
        predictions and references.
    """
    # A dataset restricted with set_format(columns=[...]) no longer yields the
    # raw "text"/"summary" columns; resetting the format on a copy restores
    # them without altering the caller's dataset.
    if hasattr(dataset, "with_format"):
        dataset = dataset.with_format(None)

    # Switch off dropout so generation is deterministic.
    model.eval()

    predictions = []
    references = []

    for example in dataset:
        inputs = tokenizer(example["text"], return_tensors="pt", truncation=True, max_length=1024).to(device)
        summary_ids = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=256,
            num_beams=4,
            early_stopping=True,
        )
        predictions.append(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
        references.append(example["summary"])

    return metric.compute(predictions=predictions, references=references)

# Score the fine-tuned model on the held-out split and show the ROUGE results
results = evaluate_model(model=model, tokenizer=tokenizer, dataset=test_dataset)
print(results)