In [93]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [94]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [106]:
torch.cuda.empty_cache()

model_id = "google/mt5-xxl"
#model_id = "google/mt5-xl"

model = AutoModelForSequenceClassification.from_pretrained(model_id, quantization_config=bnb_config, num_labels=3, device_map={"":torch.cuda.current_device()})
tokenizer = AutoTokenizer.from_pretrained(model_id)

Some weights of MT5ForSequenceClassification were not initialized from the model checkpoint at google/mt5-xxl and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [107]:
from datasets import load_dataset

data = load_dataset("ankitkupadhyay/XNLI")["train"]

data = data.shuffle(seed=1234)  # Shuffle dataset here
data = data.rename_column("label", "labels")

# Explore the data
df = data.to_pandas()
df.head(10)

Unnamed: 0,premise,hypothesis,labels
0,'Блискам ли? ',Тогава бях много емоционален.,1
1,Matumizi zaidi!,Matumizi hayana manufaa katika hali hii.,2
2,Най-очарователната структура на Калтън Хил е Н...,Националният паметник е най-голямата структура...,0
3,Vigezo vya News ' vinaweza kutoa mzozo kama huu.,Vigezo vipya vinaweza kuleta mzozo.,0
4,"Helen Gurley Brown, sobre los muchos aspectos ...",Helen Gurley Brown cree que el acoso sexual es...,2
5,เขาเขียนถึงฉันในอีเมล ฉันเชื่อว่าหลักฐานของ To...,อีเมลได้รวมรายละเอียดของหลักฐาน,1
6,Các nhân vật chính nhận được nhiều sự điều trị...,Thật kỳ lạ khi cả nhân vật chính và phụ đều đư...,0
7,一楼是法兰西历史博物馆，珍藏着她一生中唯一已知的圣女贞德肖像和路易十六保存的日记等珍品。,圣女贞德的画像在五楼。,2
8,اس معاملے میں، اس کا بوڑھا دھوکہ دینے والا اس ...,Ginsburg نے اس کی نمائندگی کے باوجود اسے دھوکہ...,1
9,Ngài James gõ bàn một cách khá sốt ruột.,Ngài James đã mất kiên nhẫn.,0


In [108]:
data = data.train_test_split(test_size=0.1)
train_data = data["train"]
test_data = data["test"]

In [109]:
# Fonction pour sampler 8 exemples par classe
def sample_per_class(dataset, num_samples=8):
    # Obtenir un index unique pour chaque classe
    class_indices = {label: [] for label in set(dataset['labels'])}

    # Accumuler les indices pour chaque classe
    for index, label in enumerate(dataset['labels']):
        class_indices[label].append(index)

    # Sélectionner aléatoirement num_samples indices pour chaque classe
    import random
    sampled_indices = [index for label, indices in class_indices.items()
                       for index in random.sample(indices, num_samples)]

    # Créer un nouveau dataset à partir des indices échantillonnés
    sampled_dataset = dataset.select(sampled_indices)
    return sampled_dataset

# Appliquer la fonction
train_data = sample_per_class(train_data)

In [110]:
def tokenize_and_prepare_data(examples):
    # Utiliser un format de prompt spécifique pour mT5
    task_description = "Determine if the hypothesis is true based on the premise."
    text = f"Task: {task_description} Premise: {examples['premise']} Hypothesis: {examples['hypothesis']}"
    result = tokenizer(text,
                       padding="max_length",
                       truncation=True,   
                       max_length=1000,
                       return_overflowing_tokens=True,
                       add_special_tokens=True)
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

# Apply the tokenization and preparation function
train_data = train_data.map(tokenize_and_prepare_data, batched=True)
test_data = test_data.map(tokenize_and_prepare_data, batched=True)

In [111]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [112]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [113]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=24,
    lora_dropout=0.15,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

peft_model = get_peft_model(model, lora_config)
print_trainable_parameters(peft_model)

trainable params: 9437184 || all params: 7485255683 || trainable%: 0.1260769758531173


In [114]:
import evaluate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]  # Assuming the first element of the tuple is the logits array

    # If logits are still in list form or have variable lengths, ensure they are properly converted to a uniform numpy array
    if isinstance(logits, list):
        max_len = max(len(l) for l in logits)
        logits = np.array([np.pad(l, (0, max_len - len(l))) for l in logits])

    predictions = np.argmax(logits, axis=-1)
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)

    return {
        "precision": precision, 
        "recall": recall, 
        "f1-score": f1, 
        "accuracy": accuracy
    }

In [115]:
#new code using SFTTrainer
import transformers
import gc

from trl import SFTTrainer

torch.cuda.empty_cache()
gc.collect()

# Configuration des arguments de l'entraînement
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    warmup_ratio=0.08,
    learning_rate=1e-4,
    output_dir="outputs",
    optim="paged_adamw_8bit",
    save_strategy="no"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=lora_config,
    args=training_args,
    max_seq_length=1024,
    data_collator=transformers.DataCollatorWithPadding(tokenizer),
    formatting_func=tokenize_and_prepare_data,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [116]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



Step,Training Loss


TrainOutput(global_step=3, training_loss=0.24180551369984946, metrics={'train_runtime': 44.7175, 'train_samples_per_second': 0.134, 'train_steps_per_second': 0.067, 'total_flos': 392341192812000.0, 'train_loss': 0.24180551369984946, 'epoch': 3.0})

## Inference

In [117]:
model.config.use_cache = True

In [133]:
# Step 1: Load ANLI test rounds
anli_r1 = load_dataset("anli", split="train_r1[:5%]")
anli_r2 = load_dataset("anli", split="train_r2[:5%]")
anli_r3 = load_dataset("anli", split="train_r3[:3%]")

In [134]:
# Function to process the ANLI dataset rounds
def process_anli_data(dataset):
    # Tokenize the data
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.map(tokenize_and_prepare_data, batched=True, remove_columns=[col for col in dataset.column_names if col not in ['premise', 'hypothesis', 'labels']])
    return dataset

# Step 2: Process the data
anli_r1 = process_anli_data(anli_r1)
anli_r2 = process_anli_data(anli_r2)
anli_r3 = process_anli_data(anli_r3)

Map:   0%|          | 0/3014 [00:00<?, ? examples/s]

In [135]:
# Step 3: Define predict function with metrics
torch.cuda.empty_cache()
gc.collect()

def predict_and_evaluate(dataset):
    with torch.inference_mode():
        predictions = trainer.evaluate(dataset)
    print("Precision:", predictions['eval_precision'])
    print("Recall:", predictions['eval_recall'])
    print("F1-score:", predictions['eval_f1-score'])
    print("Accuracy:", predictions['eval_accuracy'])

# Step 4: Run predictions and compute metrics for each ANLI round
print("Evaluating ANLI R1")
#predict_and_evaluate(anli_r1)

print("Evaluating ANLI R2")
#predict_and_evaluate(anli_r2)

print("Evaluating ANLI R3")
predict_and_evaluate(anli_r3)

Evaluating ANLI R1
Evaluating ANLI R2
Evaluating ANLI R3




Precision: 0.2387838400666389
Recall: 0.3942857142857143
F1-score: 0.2336518936518937
Accuracy: 0.3942857142857143


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
