In [1]:
import torch
import transformers
import gc
from transformers import AutoTokenizer, MT5ForSequenceClassification, BitsAndBytesConfig, Trainer, TrainingArguments, T5ForSequenceClassification
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType

from trl import SFTTrainer

In [3]:
torch.cuda.empty_cache()

model_id = "google/flan-t5-xxl"
#model_id = "google/mt5-xl"

model = T5ForSequenceClassification.from_pretrained(model_id, num_labels=3, device_map="auto")
#model = MT5ForSequenceClassification.from_pretrained(model_id, num_labels=3, device_map="auto")

tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-xxl and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from peft import prepare_model_for_kbit_training

torch.cuda.empty_cache()

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
lora_config = LoraConfig(
    r=1024,
    lora_alpha=4096,
    lora_dropout=0.2,
    bias="none",
    task_type=TaskType.SEQ_CLS
)

model = get_peft_model(model, lora_config)

In [4]:
# Inclure la couche de classification avec LoRA
for param in model.parameters():
    param.requires_grad = False

for name, param in model.named_parameters():
    if "classification_head" in name or "lora" in name:
        param.requires_grad = True

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 16793603 || all params: 11020529667 || trainable%: 0.15238471749944069


In [6]:
# data = load_dataset("ankitkupadhyay/XNLI")["train"]
data = load_dataset("Gameselo/monolingual-wideNLI")

data = data.shuffle(seed=1234)  # Shuffle dataset here
data = data.rename_column("label", "labels")

In [None]:
# IF USING XNLI

data = data.train_test_split(test_size=0.1)
train_data = data["train"]
test_data = data["test"]

In [7]:
# IF USING MONOLINGUAL-WIDENLI

train_data = data["train"]
test_data = data["dev"]

In [8]:
# Fonction pour sampler N exemples par classe
def sample_per_class(dataset, num_samples=128):
    # Obtenir un index unique pour chaque classe
    class_indices = {label: [] for label in set(dataset['labels'])}

    # Accumuler les indices pour chaque classe
    for index, label in enumerate(dataset['labels']):
        class_indices[label].append(index)

    # Sélectionner aléatoirement num_samples indices pour chaque classe
    import random
    sampled_indices = [index for label, indices in class_indices.items()
                       for index in random.sample(indices, num_samples)]

    # Créer un nouveau dataset à partir des indices échantillonnés
    sampled_dataset = dataset.select(sampled_indices)
    return sampled_dataset

# Appliquer la fonction
train_data = sample_per_class(train_data)

In [9]:
def preprocess_function(examples):
    task_description = "Determine if the hypothesis is true based on the premise."
    inputs = [f"Task: {task_description} Premise: {premise} Hypothesis: {hypothesis} Label (Entailment, Neutral, Contradiction):" for premise, hypothesis in zip(examples['premise'], examples['hypothesis'])]
    # Tokeniser les inputs en batch
    labels = examples['labels']
    model_inputs = tokenizer(inputs, padding="max_length", truncation=True, max_length=1000)
    model_inputs["labels"] = labels
    return model_inputs

In [10]:
# Apply the tokenization and preparation function
train_data = train_data.map(preprocess_function, batched=True)
test_data = test_data.map(preprocess_function, batched=True)

Map:   0%|          | 0/384 [00:00<?, ? examples/s]

In [11]:
import evaluate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np
from datasets import load_dataset, load_metric

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # eval_pred is the tuple of predictions and labels returned by the model

    logits = np.array(logits[0])
    
    predictions = np.argmax(logits, axis=-1)
    
    # Calculate metrics, assuming 'average' as 'weighted' for handling multiclass classification
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    
    # Return a dictionary with the computed metrics
    return {
        "precision": precision, 
        "recall": recall, 
        "f1-score": f1, 
        "accuracy": accuracy
    }

In [12]:
#new code using SFTTrainer
torch.cuda.empty_cache()
gc.collect()

# Configuration des arguments de l'entraînement
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    warmup_ratio=0.08,
    learning_rate=1e-4,
    output_dir="outputs",
    optim="paged_adamw_8bit",
    save_strategy="no"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    args=training_args,
    max_seq_length=1024,
    data_collator=transformers.DataCollatorWithPadding(tokenizer),
    formatting_func=preprocess_function,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [13]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
#trainer.save_model('./best_model_mt5')

Step,Training Loss


TrainOutput(global_step=144, training_loss=0.8650012016296387, metrics={'train_runtime': 2611.2034, 'train_samples_per_second': 0.441, 'train_steps_per_second': 0.055, 'total_flos': 7.5264279204096e+16, 'train_loss': 0.8650012016296387, 'epoch': 3.0})

## Inference

In [18]:
model.config.use_cache = True

In [19]:
# Step 1: Load ANLI test rounds
anli_r1 = load_dataset("anli", split="test_r1[:10%]")
anli_r2 = load_dataset("anli", split="test_r2[:10%]")
anli_r3 = load_dataset("anli", split="test_r3[:10%]")

In [20]:
# Function to process the ANLI dataset rounds
def process_anli_data(dataset):
    # Tokenize the data
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.map(preprocess_function, batched=True, remove_columns=[col for col in dataset.column_names if col not in ['premise', 'hypothesis', 'labels']])
    return dataset

# Step 2: Process the data
anli_r1 = process_anli_data(anli_r1)
anli_r2 = process_anli_data(anli_r2)
anli_r3 = process_anli_data(anli_r3)

In [21]:
# Step 3: Define predict function with metrics
torch.cuda.empty_cache()
gc.collect()

def predict_and_evaluate(dataset):
    with torch.inference_mode():
        predictions = trainer.evaluate(dataset)
    print("Precision:", predictions['eval_precision'])
    print("Recall:", predictions['eval_recall'])
    print("F1-score:", predictions['eval_f1-score'])
    print("Accuracy:", predictions['eval_accuracy'])

# Step 4: Run predictions and compute metrics for each ANLI round
print("Evaluating ANLI R1")
predict_and_evaluate(anli_r1)

print("Evaluating ANLI R2")
predict_and_evaluate(anli_r2)

print("Evaluating ANLI R3")
predict_and_evaluate(anli_r3)

Evaluating ANLI R1
Precision: 0.7469565217391305
Recall: 0.75
F1-score: 0.7428057713651498
Accuracy: 0.75
Evaluating ANLI R2
Precision: 0.6152727272727273
Recall: 0.62
F1-score: 0.6156503496503496
Accuracy: 0.62
Evaluating ANLI R3
Precision: 0.5387996031746031
Recall: 0.525
F1-score: 0.5208517539977771
Accuracy: 0.525


In [24]:
test_vitaminc = load_dataset("Gameselo/monolingual-wideNLI", split="test_vitaminc[:2%]")
test_vitaminc = process_anli_data(test_vitaminc)

torch.cuda.empty_cache()
gc.collect()

# Step 4: Run predictions and compute metrics for each ANLI round
print("Evaluating VitaminC")
predict_and_evaluate(test_vitaminc)

Evaluating VitaminC


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.14 GiB. GPU 

In [None]:
trainer.save_model('./best_model_mt5')