In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, MT5ForConditionalGeneration, MT5Tokenizer
import psutil
from transformers import TrainerCallback
import time

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
model_id = "lightblue/suzume-llama-3-8B-multilingual"
#model_id = "meta-llama/Meta-Llama-3-8B"

model = AutoModelForSequenceClassification.from_pretrained(model_id,
                                                           quantization_config=bnb_config,
                                                           num_labels=3,
                                                           device_map={'':torch.cuda.current_device()},
                                                           #device_map="auto"
                                                          )
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at lightblue/suzume-llama-3-8B-multilingual and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from datasets import load_dataset, Dataset

# data = load_dataset("ankitkupadhyay/XNLI")["train"]
data = load_dataset("Gameselo/monolingual-wideNLI")

data = data.shuffle(seed=1234)  # Shuffle dataset here
data = data.rename_column("label", "labels")

In [6]:
# TRADUCTION SI XNLI

# Charger le modèle MT5 et le tokenizer
model_name = "google/mt5-base"
tokenizer_trad = MT5Tokenizer.from_pretrained(model_name)
model_trad = MT5ForConditionalGeneration.from_pretrained(model_name)

# Fonction pour traduire une phrase en anglais
def translate_to_english(sentence):
    # Préparer l'entrée pour le modèle
    input_text = f"translate to English: {sentence}"
    inputs = tokenizer_trad.encode(input_text, return_tensors="pt")

    # Générer la traduction
    with torch.inference_mode():
        outputs = model_trad.generate(inputs)
        
    translated_sentence = tokenizer_trad.decode(outputs[0], skip_special_tokens=True)
    
    return translated_sentence

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# IF USING XNLI

data = data.train_test_split(test_size=0.1)
train_data = data["train"]
test_data = data["test"]

In [5]:
# IF USING MONOLINGUAL-WIDENLI

train_data = data["train"]
test_data = data["dev"]

In [6]:
# Fonction pour sampler 8 exemples par classe
def sample_per_class(dataset, num_samples=16):
    # Obtenir un index unique pour chaque classe
    class_indices = {label: [] for label in set(dataset['labels'])}

    # Accumuler les indices pour chaque classe
    for index, label in enumerate(dataset['labels']):
        class_indices[label].append(index)

    # Sélectionner aléatoirement num_samples indices pour chaque classe
    import random
    sampled_indices = [index for label, indices in class_indices.items()
                       for index in random.sample(indices, num_samples)]

    # Créer un nouveau dataset à partir des indices échantillonnés
    sampled_dataset = dataset.select(sampled_indices)
    return sampled_dataset

# Appliquer la fonction
train_data = sample_per_class(train_data)

In [9]:
# IF WE WANT TO TRANSLATE

# Traduire les phrases de prémisse et d'hypothèse
translated_data = {
    "premise": [],
    "hypothesis": [],
    "labels": []
}

for example in train_data:
    premise = example["premise"]
    hypothesis = example["hypothesis"]

    translated_premise = translate_to_english(premise)
    translated_hypothesis = translate_to_english(hypothesis)

    translated_data["premise"].append(translated_premise)
    translated_data["hypothesis"].append(translated_hypothesis)
    translated_data["labels"].append(example["labels"])

# Créer un dataset de Hugging Face à partir des données traduites
train_data = Dataset.from_dict(translated_data)



In [7]:
def tokenize_and_prepare_data(examples):
    # Tokeniser chaque texte dans le batch
    text = f"Is this true? {examples['premise']} implies {examples['hypothesis']}"
    result = tokenizer(text,truncation=True,   
                       max_length=1000,
                       return_overflowing_tokens=True)
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

model.config.pad_token_id = model.config.eos_token_id

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Apply the tokenization and preparation function
train_data = train_data.map(tokenize_and_prepare_data, batched=True)
test_data = test_data.map(tokenize_and_prepare_data, batched=True)

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

In [8]:
from peft import prepare_model_for_kbit_training

torch.cuda.empty_cache()

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=1024,
    lora_alpha=4096,
    lora_dropout=0.2,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

peft_model = get_peft_model(model, lora_config)
print_trainable_parameters(peft_model)

trainable params: 436219904 || all params: 4451495936 || trainable%: 9.799400252670477


In [11]:
import evaluate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)
    
    # Calculate metrics, assuming 'average' as 'weighted' for handling multiclass classification
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    
    # Return a dictionary with the computed metrics
    return {
        "precision": precision, 
        "recall": recall, 
        "f1-score": f1, 
        "accuracy": accuracy
    }

In [12]:
class PerformanceCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, **kwargs):
        self.start_time = time.time()
        self.start_memory = psutil.Process().memory_info().rss  # Mémoire en octets

    def on_evaluate_end(self, args, state, control, **kwargs):
        elapsed_time = time.time() - self.start_time
        end_memory = psutil.Process().memory_info().rss
        memory_used = end_memory - self.start_memory
        print(f"Inference time: {elapsed_time:.2f} seconds")
        print(f"Memory used: {memory_used / (1024 ** 2):.2f} Mo")  # Convertir en Mo

In [13]:
#new code using SFTTrainer
import transformers

from trl import SFTTrainer

torch.cuda.empty_cache()

# Configuration des arguments de l'entraînement
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,
    warmup_ratio=0.08,
    learning_rate=1e-4,
    output_dir="outputs",
    optim="paged_adamw_8bit",
    save_strategy="no",
    # evaluation_strategy="epoch",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=lora_config,
    args=training_args,
    max_seq_length=1024,
    data_collator=transformers.DataCollatorWithPadding(tokenizer),
    formatting_func=tokenize_and_prepare_data,
    compute_metrics=compute_metrics,
    callbacks=[PerformanceCallback()]
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
trainer.save_model('./best_model_llama')



Step,Training Loss


## Inference

In [None]:
model.config.use_cache = True

In [59]:
# Step 1: Load ANLI test rounds
anli_r1 = load_dataset("anli", split="test_r1")
anli_r2 = load_dataset("anli", split="test_r2")
anli_r3 = load_dataset("anli", split="test_r3")

In [60]:
# Function to process the ANLI dataset rounds
def process_anli_data(dataset):
    # Tokenize the data
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.map(tokenize_and_prepare_data, batched=True, remove_columns=[col for col in dataset.column_names if col not in ['premise', 'hypothesis', 'labels']])
    return dataset

In [61]:
# Step 2: Process the data
anli_r1 = process_anli_data(anli_r1)
anli_r2 = process_anli_data(anli_r2)
anli_r3 = process_anli_data(anli_r3)

In [62]:
# Step 3: Define predict function with metrics
def predict_and_evaluate(dataset):
    with torch.inference_mode():
        predictions = trainer.evaluate(dataset)
    print("Precision:", predictions['eval_precision'])
    print("Recall:", predictions['eval_recall'])
    print("F1-score:", predictions['eval_f1-score'])
    print("Accuracy:", predictions['eval_accuracy'])

In [63]:
torch.cuda.empty_cache()

# Step 4: Run predictions and compute metrics for each ANLI round
print("Evaluating ANLI R1")
predict_and_evaluate(anli_r1)

print("Evaluating ANLI R2")
predict_and_evaluate(anli_r2)

print("Evaluating ANLI R3")
predict_and_evaluate(anli_r3)

Evaluating ANLI R1


Precision: 1.0
Recall: 1.0
F1-score: 1.0
Accuracy: 1.0
Evaluating ANLI R2




Precision: 1.0
Recall: 1.0
F1-score: 1.0
Accuracy: 1.0
Evaluating ANLI R3




Precision: 1.0
Recall: 1.0
F1-score: 1.0
Accuracy: 1.0


In [64]:
test_vitaminc = load_dataset("Gameselo/monolingual-wideNLI", split="test_vitaminc")
test_vitaminc = process_anli_data(test_vitaminc)

# Step 3: Define predict function with metrics
torch.cuda.empty_cache()

# Step 4: Run predictions and compute metrics for each ANLI round
print("Evaluating VitaminC")
predict_and_evaluate(test_vitaminc)

Evaluating VitaminC




Precision: 0.29020811654526535
Recall: 0.5387096774193548
F1-score: 0.37720971123283964
Accuracy: 0.5387096774193548


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
