In [88]:
import torch
import transformers
import gc
from transformers import AutoTokenizer, MT5ForSequenceClassification, BitsAndBytesConfig, Trainer, TrainingArguments, T5ForSequenceClassification
from datasets import load_dataset, concatenate_datasets
from peft import LoraConfig, get_peft_model, TaskType

from trl import SFTTrainer

In [89]:
# Release cached GPU memory and run the garbage collector before loading the model.
torch.cuda.empty_cache()
gc.collect()

# Checkpoint to fine-tune; the commented-out ids are alternative checkpoints that are currently disabled.
model_id = "google/flan-t5-xl"
#model_id = "philschmid/flan-t5-xxl-sharded-fp16"
#model_id = "google/mt5-xl"

# Mapping between class ids and label names for the 3-class head.
# NOTE(review): presumably matches the integer labels of the NLI datasets loaded later — confirm.
id2label = {0: "entailment", 1: "neutral", 2: "contradiction"}
label2id = {"entailment": 0, "neutral": 1, "contradiction": 2}

# Load the checkpoint with a sequence-classification head. The classification head
# weights are not in the checkpoint and are newly initialized (see the warning printed below).
model = T5ForSequenceClassification.from_pretrained(model_id, num_labels=3, id2label=id2label, label2id=label2id, device_map="auto")
#model = MT5ForSequenceClassification.from_pretrained(model_id, num_labels=3, device_map="auto", return_dict=True)

# Tokenizer belonging to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-xl and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [90]:
from peft import prepare_model_for_kbit_training

# Release cached GPU memory before modifying the model.
torch.cuda.empty_cache()
gc.collect()

# Run PEFT's k-bit training preparation on the loaded model and rebind `model` to the result.
# NOTE(review): the from_pretrained call in this notebook passes no quantization config,
# so confirm that this preparation step is intended for a non-quantized model.
model = prepare_model_for_kbit_training(model)

In [91]:
# Define the LoRA configuration for sequence classification.
lora_config = LoraConfig(
    r=256,
    lora_alpha=256,
    # Only modules named "q" and "v" are targeted by the adapter.
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    # The classification head is newly initialized when the checkpoint is loaded,
    # so it is listed here in addition to the adapter weights.
    modules_to_save=['classification_head']
)

# Wrap the prepared model with the LoRA configuration above.
peft_model = get_peft_model(model, lora_config)

def print_trainable_parameters(model):
    """
    Print a one-line summary of the model's parameter counts.

    The line reports the number of parameters with ``requires_grad`` set,
    the total number of parameters, and the trainable share as a percentage.

    Args:
        model: Object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter must provide
            ``numel()`` and ``requires_grad``.
    """
    # Collect (size, is_trainable) once so both totals come from the same pass.
    sizes = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
    all_param = sum(size for size, _ in sizes)
    trainable_params = sum(size for size, is_trainable in sizes if is_trainable)

    summary = f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    print(summary)

# Report how many parameters of the LoRA-wrapped model are trainable.
print_trainable_parameters(peft_model)

trainable params: 155197443 || all params: 2943358982 || trainable%: 5.272800360034371


In [92]:
# Alternative corpora, currently disabled:
# data = load_dataset("ankitkupadhyay/XNLI")["train"]
# data = load_dataset("Gameselo/monolingual-wideNLI")

# Load the three ANLI training rounds.
anli_r1, anli_r2, anli_r3 = (
    load_dataset('anli', split=split_name)
    for split_name in ('train_r1', 'train_r2', 'train_r3')
)

# Concatenate the rounds into one dataset and shuffle it with a fixed seed.
data = concatenate_datasets([anli_r1, anli_r2, anli_r3]).shuffle(seed=1234)

In [None]:
# Option A (originally labelled "IF USING XNLI"): hold out 10% of `data` as the test set.
#
# The split result is stored under its own name instead of overwriting `data`.
# Rebinding `data` to the returned DatasetDict made this cell fail when re-run
# (a DatasetDict has no `train_test_split`) and left any later cell that reads
# `data` with a DatasetDict instead of the full training Dataset.
# The split is seeded with the same seed as the shuffle so it is reproducible.
data_split = data.train_test_split(test_size=0.1, seed=1234)
train_data = data_split["train"]
test_data = data_split["test"]

In [None]:
# Option B (originally labelled "IF USING MONOLINGUAL-WIDENLI"):
# train on everything in `data` and evaluate on the three ANLI dev rounds.
# NOTE(review): `data` is expected to be the concatenated training Dataset here — confirm
# that no earlier cell has rebound it to something else.

anli_r1, anli_r2, anli_r3 = (
    load_dataset('anli', split=split_name)
    for split_name in ('dev_r1', 'dev_r2', 'dev_r3')
)

train_data = data
test_data = concatenate_datasets([anli_r1, anli_r2, anli_r3])

In [None]:
# Sample up to N examples per class.
def sample_per_class(dataset, num_samples=256, seed=None):
    """Return a class-balanced subset with up to ``num_samples`` rows per label.

    Args:
        dataset: Object exposing the label column through ``dataset['label']``
            and a ``select(indices)`` method.
        num_samples: Maximum number of rows drawn for each distinct label.
            A class with fewer rows contributes all of its rows instead of
            making ``random.sample`` raise ``ValueError``.
        seed: Optional seed for a private random generator, making the draw
            reproducible. With ``None`` (the default) the module-level
            ``random`` state is used.

    Returns:
        Whatever ``dataset.select`` returns for the sampled row indices.
        Indices are grouped by label, in order of each label's first
        appearance in the dataset.
    """
    import random

    rng = random if seed is None else random.Random(seed)

    # Read the label column once and group row indices by label.
    class_indices = {}
    for index, label in enumerate(dataset['label']):
        class_indices.setdefault(label, []).append(index)

    # Draw without replacement, capping the request at the class size.
    sampled_indices = [
        index
        for indices in class_indices.values()
        for index in rng.sample(indices, min(num_samples, len(indices)))
    ]

    # Build the new dataset from the sampled indices.
    return dataset.select(sampled_indices)

# Apply the sampling function: replace the training set with a per-class sample
# (the function's default is 256 rows per label).
# NOTE: this rebinds `train_data`, so re-running the cell samples from the previous sample.
train_data = sample_per_class(train_data)

In [None]:
def preprocess_function(examples):
    """Turn a batch of premise/hypothesis pairs into tokenized prompts.

    Args:
        examples: Batch mapping with ``'premise'``, ``'hypothesis'`` and
            ``'label'`` columns (parallel sequences).

    Returns:
        The output of the module-level ``tokenizer`` for the prompts, padded
        and truncated to 1000 tokens, with the batch labels stored under
        ``"label"``.
    """
    task_description = "Determine if the hypothesis is true based on the premise."

    # One prompt per (premise, hypothesis) pair.
    prompts = []
    for premise, hypothesis in zip(examples['premise'], examples['hypothesis']):
        prompts.append(f"Task: {task_description} Premise: {premise} Hypothesis: {hypothesis} Label (entailment, neutral, contradiction):")

    batch_labels = examples['label']

    # Tokenize the prompts as a batch.
    encoded = tokenizer(
        prompts,
        padding="max_length",
        truncation=True,
        max_length=1000,
        add_special_tokens=True,
    )
    encoded["label"] = batch_labels
    return encoded

In [None]:
# Apply the tokenization and preparation function to both splits in batched mode.
# NOTE: both names are rebound to the mapped datasets, so this cell is not idempotent.
train_data = train_data.map(preprocess_function, batched=True)
test_data = test_data.map(preprocess_function, batched=True)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute overall and per-class classification metrics.

    Args:
        eval_pred: Pair ``(outputs, labels)``. ``outputs`` is either the
            logits array itself or a tuple/list whose first element is the
            logits array; ``labels`` holds the integer class ids.

    Returns:
        dict with the weighted ``"precision"``, ``"recall"`` and
        ``"f1-score"``, the overall ``"accuracy"``, and one
        ``"accuracy_class_<i>"`` entry for each class id 0, 1 and 2.
        A class that does not occur in ``labels`` is reported as NaN.
    """
    outputs, labels = eval_pred

    # When the model returns several arrays the logits come first. A bare array
    # is used as-is: indexing it with [0] would silently select only its first row.
    logits = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Softmax is monotonic, so the argmax of the logits equals the argmax of the probabilities.
    predictions = np.argmax(logits, axis=-1)

    # Weighted averages for the multiclass case. zero_division=0 keeps the default
    # score of 0 for classes that are never predicted, without emitting a warning.
    precision = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
    accuracy = accuracy_score(labels, predictions)

    metrics = {
        "precision": precision,
        "recall": recall,
        "f1-score": f1,
        "accuracy": accuracy
    }

    # Accuracy restricted to the examples of each class.
    for cls in range(3):
        class_mask = labels == cls
        if class_mask.any():
            class_accuracy = accuracy_score(labels[class_mask], predictions[class_mask])
        else:
            # No example of this class: there is nothing to score.
            class_accuracy = float("nan")
        metrics[f"accuracy_class_{cls}"] = class_accuracy

    return metrics

In [None]:
# Training setup using the standard Trainer.
#
# SFTTrainer was replaced for three reasons:
#   * the task is sequence classification on an already tokenized dataset, and
#     `preprocess_function` returns tokenized features rather than the text that
#     SFTTrainer's `formatting_func` is meant to produce;
#   * passing the base `model` together with `peft_config` applies the LoRA
#     configuration a second time, although `peft_model` already carries it;
#   * `max_seq_length` is not needed because the inputs are tokenized beforehand.
torch.cuda.empty_cache()
gc.collect()

# Training arguments.
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.08,
    weight_decay=0.01, # if not, not stable
    learning_rate=1e-4,
    output_dir="outputs",
    optim='adafactor', # designed for T5
    evaluation_strategy='no',
    save_strategy="no",
    fp16=False,
    num_train_epochs=3,
    logging_steps=10,
    # No evaluation or checkpointing happens during training, so there is no
    # "best" checkpoint to reload at the end.
    load_best_model_at_end=False,
)

trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_data,
    eval_dataset=test_data,
    args=training_args,
    data_collator=transformers.DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=10),
    compute_metrics=compute_metrics,
)

In [None]:
# Turn off the model's cache flag for the duration of training, then start training.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

## Inference

In [None]:
# Re-enable the cache flag that was switched off for training.
model.config.use_cache = True

In [None]:
# Step 1: Load the first 10% of each ANLI test round
anli_r1, anli_r2, anli_r3 = (
    load_dataset("anli", split=split_name)
    for split_name in ("test_r1[:10%]", "test_r2[:10%]", "test_r3[:10%]")
)

In [None]:
# Tokenize an evaluation dataset, keeping only the columns the pipeline uses
def process_anli_data(dataset):
    """Tokenize ``dataset`` with ``preprocess_function`` in batched mode.

    Every original column other than ``premise``, ``hypothesis`` and
    ``label`` is removed from the result.

    Args:
        dataset: Object exposing ``column_names`` and a ``map`` method.

    Returns:
        The mapped dataset returned by ``dataset.map``.
    """
    kept_columns = ['premise', 'hypothesis', 'label']
    dropped_columns = [name for name in dataset.column_names if name not in kept_columns]
    return dataset.map(preprocess_function, batched=True, remove_columns=dropped_columns)

# Step 2: Tokenize each test round (the names are rebound to the processed datasets)
anli_r1, anli_r2, anli_r3 = (
    process_anli_data(round_data) for round_data in (anli_r1, anli_r2, anli_r3)
)

In [None]:
# Step 3: Define predict function with metrics
# Release cached GPU memory and run the garbage collector before evaluation.
torch.cuda.empty_cache()
gc.collect()

def predict_and_evaluate(dataset):
    """Evaluate ``dataset`` with the module-level ``trainer`` and print the metrics.

    Args:
        dataset: Dataset passed to ``trainer.evaluate``.

    Prints the overall precision, recall, F1-score and accuracy, followed by
    the accuracy for classes 0, 1 and 2 (printed as entailment, neutral and
    contradiction). Returns nothing.
    """
    with torch.inference_mode():
        predictions = trainer.evaluate(dataset)

    # (printed title, key in the evaluation result) in display order.
    report_lines = (
        ("Precision:", 'eval_precision'),
        ("Recall:", 'eval_recall'),
        ("F1-score:", 'eval_f1-score'),
        ("Accuracy:", 'eval_accuracy'),
        ("Accuracy entailment:", 'eval_accuracy_class_0'),
        ("Accuracy neutral:", 'eval_accuracy_class_1'),
        ("Accuracy contradiction:", 'eval_accuracy_class_2'),
    )
    for title, metric_key in report_lines:
        print(title, predictions[metric_key])

# Step 4: Run predictions and print the metrics for each processed ANLI test round
print("Evaluating ANLI R1")
predict_and_evaluate(anli_r1)

print("Evaluating ANLI R2")
predict_and_evaluate(anli_r2)

print("Evaluating ANLI R3")
predict_and_evaluate(anli_r3)

In [None]:
# Load the first 2% of the VitaminC test split and tokenize it with the same
# helper used for the ANLI rounds.
# NOTE(review): the helper keeps 'premise', 'hypothesis' and 'label' — presumably this
# dataset uses the same column names; confirm.
test_vitaminc = load_dataset("Gameselo/monolingual-wideNLI", split="test_vitaminc[:2%]")
test_vitaminc = process_anli_data(test_vitaminc)

# Release cached GPU memory before evaluating.
torch.cuda.empty_cache()
gc.collect()

# Run predictions and print the metrics for the VitaminC sample
print("Evaluating VitaminC")
predict_and_evaluate(test_vitaminc)

In [None]:
# Save the trainer's model to a local directory.
# NOTE(review): the directory name mentions "mt5" while `model_id` is set to
# "google/flan-t5-xl" — confirm the intended output name.
trainer.save_model('./best_model_mt5')