In [47]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, MT5ForConditionalGeneration, MT5Tokenizer
import psutil
from transformers import TrainerCallback
import time

# Quantization settings passed to `from_pretrained` when the classifier is
# loaded: 4-bit NF4 weights with double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [48]:
model_id = "lightblue/suzume-llama-3-8B-multilingual"
#model_id = "meta-llama/Meta-Llama-3-8B"

# Load the checkpoint as a 3-way sequence classifier, quantized with
# `bnb_config`. The classification head is not part of the checkpoint and is
# newly initialized, so it has to be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    num_labels=3,
    device_map="auto",
)

# `load_in_4bit` is a model-loading option and has no meaning for a tokenizer,
# so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

# Reuse EOS as the padding token; assigning `pad_token` also updates
# `pad_token_id`, so no separate id assignment is needed.
tokenizer.pad_token = tokenizer.eos_token

# Take the pad id from the tokenizer that actually produces the padding, so the
# model and the tokenizer cannot disagree about which id marks padding.
model.config.pad_token_id = tokenizer.pad_token_id

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at lightblue/suzume-llama-3-8B-multilingual and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [49]:
from datasets import load_dataset, Dataset

# Alternative corpus: data = load_dataset("ankitkupadhyay/XNLI")["train"]

# Load the NLI corpus, shuffle it with a fixed seed, and rename the target
# column to "labels", the name used by the rest of the notebook.
data = (
    load_dataset("Gameselo/monolingual-wideNLI")
    .shuffle(seed=1234)
    .rename_column("label", "labels")
)

In [6]:
# TRANSLATION (ONLY WHEN USING XNLI)

# Load the mT5 model and its tokenizer used by `translate_to_english`
model_name = "google/mt5-base"
tokenizer_trad = MT5Tokenizer.from_pretrained(model_name)
model_trad = MT5ForConditionalGeneration.from_pretrained(model_name)

def translate_to_english(sentence, max_new_tokens=256):
    """Translate ``sentence`` into English with the module-level mT5 model.

    Args:
        sentence: Source text. It is prefixed with ``"translate to English: "``
            before being encoded.
        max_new_tokens: Upper bound on the number of generated tokens. Without
            an explicit bound, ``generate`` falls back to the library's default
            generation length (20 tokens in current transformers releases --
            TODO confirm for the installed version), which cuts off longer
            sentences.

    Returns:
        The first generated sequence decoded to a string, with special tokens
        stripped.

    NOTE(review): the checkpoint loaded into ``model_trad`` is the plain
    pretrained ``google/mt5-base``; its model card says it needs fine-tuning
    before it is usable on a downstream task -- confirm the outputs are real
    translations.
    """
    # Build the prompt and encode it as a PyTorch tensor
    input_text = f"translate to English: {sentence}"
    inputs = tokenizer_trad.encode(input_text, return_tensors="pt")
    # Keep the inputs on the same device as the model in case it was moved.
    inputs = inputs.to(model_trad.device)

    # Generate without tracking gradients
    with torch.inference_mode():
        outputs = model_trad.generate(inputs, max_new_tokens=max_new_tokens)

    return tokenizer_trad.decode(outputs[0], skip_special_tokens=True)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:
# IF USING XNLI

# Hold out 10% of the examples as the test split. The fixed seed makes the
# split reproducible across kernel restarts.
# NOTE(review): this rebinds `data` to the split result, so the cell is meant
# to be run once per load of the XNLI data -- confirm before re-running.
data = data.train_test_split(test_size=0.1, seed=1234)
train_data = data["train"]
test_data = data["test"]

In [50]:
# IF USING MONOLINGUAL-WIDENLI

# Train on the "train" split and evaluate on the "dev" split of the corpus.
train_data, test_data = data["train"], data["dev"]

In [51]:
def sample_per_class(dataset, num_samples=1024, seed=None):
    """Draw a class-balanced random subset of ``dataset``.

    Args:
        dataset: Object exposing its label column as ``dataset['labels']`` and
            a ``select(indices)`` method.
        num_samples: Number of examples to keep for every distinct label.
        seed: Optional seed for a private random generator, making the draw
            reproducible. When ``None`` the global ``random`` module state is
            used, as before.

    Returns:
        The result of ``dataset.select(...)`` on the sampled row indices,
        grouped by class in order of first appearance of each label.

    Raises:
        ValueError: If any class has fewer than ``num_samples`` examples.
    """
    import random

    # Read the label column once instead of once per use.
    labels = dataset['labels']

    # Collect the row indices of each class, in first-appearance order so the
    # class order does not depend on set/hash ordering.
    class_indices = {}
    for index, label in enumerate(labels):
        class_indices.setdefault(label, []).append(index)

    # Fail early with the offending classes instead of random.sample's
    # generic "sample larger than population" error.
    too_small = {
        label: len(indices)
        for label, indices in class_indices.items()
        if len(indices) < num_samples
    }
    if too_small:
        raise ValueError(
            f"Cannot draw num_samples={num_samples} examples per class; "
            f"classes with too few examples (label: count): {too_small}"
        )

    rng = random if seed is None else random.Random(seed)

    # Sample without replacement inside each class
    sampled_indices = [
        index
        for indices in class_indices.values()
        for index in rng.sample(indices, num_samples)
    ]

    return dataset.select(sampled_indices)

# Apply the sampler to the training split (default: 1024 examples per class)
train_data = sample_per_class(train_data)

In [44]:
# IF WE WANT TO TRANSLATE

# Translate every premise and hypothesis to English; labels are carried over
# unchanged.
translated_data = {"premise": [], "hypothesis": [], "labels": []}

for example in train_data:
    for text_field in ("premise", "hypothesis"):
        translated_data[text_field].append(translate_to_english(example[text_field]))
    translated_data["labels"].append(example["labels"])

# Build a Hugging Face dataset from the translated columns
train_data = Dataset.from_dict(translated_data)

NameError: name 'translate_to_english' is not defined

In [52]:
def tokenize_and_prepare_data(examples):
    """Tokenize a batch of premise/hypothesis pairs for the classifier.

    Each pair is rendered as ``"Is this true? {premise} implies {hypothesis}"``
    and tokenized with truncation at 1000 tokens. Overflowing tokens are
    returned as well, so the tokenizer output may contain more rows than the
    input batch; every column of ``examples`` is re-aligned to the tokenizer
    output through the overflow-to-sample mapping.

    Args:
        examples: Batch mapping column names to lists. Must contain
            ``'premise'``, ``'hypothesis'`` and ``'labels'``.

    Returns:
        The tokenizer output, extended with every column of ``examples``
        repeated once per produced row.
    """
    prompts = []
    for premise, hypothesis in zip(examples['premise'], examples['hypothesis']):
        prompts.append(f"Is this true? {premise} implies {hypothesis}")

    encoded = tokenizer(
        prompts,
        add_special_tokens=True,
        truncation=True,
        max_length=1000,
        return_overflowing_tokens=True,
    )
    encoded["labels"] = examples["labels"]

    # Entry i names the input example that produced output row i.
    row_to_example = encoded.pop("overflow_to_sample_mapping")
    for column, values in examples.items():
        encoded[column] = [values[source] for source in row_to_example]

    return encoded

# Tokenize both splits; the function is applied to batches of examples
train_data = train_data.map(tokenize_and_prepare_data, batched=True)
test_data = test_data.map(tokenize_and_prepare_data, batched=True)

In [53]:
from peft import prepare_model_for_kbit_training

# Release unused cached GPU memory before preparing the model
torch.cuda.empty_cache()

#model.gradient_checkpointing_enable()
# Run PEFT's preparation helper on the quantized model before adding adapters
model = prepare_model_for_kbit_training(model)

In [54]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for the sequence-classification task. The `score`
# head (newly initialized when the model was loaded) is listed in
# `modules_to_save`.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=32,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    modules_to_save=['score'],
)

In [55]:
import evaluate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute aggregate and per-class classification metrics.

    Args:
        eval_pred: ``(logits, labels)`` pair. The class dimension of ``logits``
            is its last axis; ``labels`` holds integer class ids.

    Returns:
        Dict with weighted ``"precision"``, ``"recall"`` and ``"f1-score"``,
        overall ``"accuracy"``, and one ``"accuracy_class_{k}"`` entry per
        class: the fraction of examples whose true label is ``k`` that were
        predicted as ``k`` (``nan`` when the class is absent from ``labels``).
    """
    logits, labels = eval_pred

    # NOTE(review): some models return extra outputs next to the logits; this
    # assumes the logits come first in that case -- confirm against the trainer.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Softmax is monotonic, so the argmax of the raw logits is the same class
    # as the argmax of the probabilities; no torch round-trip is needed.
    predictions = np.argmax(logits, axis=-1)

    # Weighted averages for the multiclass setting. `zero_division=0` keeps the
    # value scikit-learn already uses for never-predicted classes (0) without
    # emitting a warning on every evaluation.
    metrics = {
        "precision": precision_score(labels, predictions, average='weighted', zero_division=0),
        "recall": recall_score(labels, predictions, average='weighted', zero_division=0),
        "f1-score": f1_score(labels, predictions, average='weighted', zero_division=0),
        "accuracy": accuracy_score(labels, predictions),
    }

    # Per-class accuracy, with the class count taken from the logits instead of
    # a hard-coded 3.
    for cls in range(logits.shape[-1]):
        class_mask = (labels == cls)
        if class_mask.any():
            class_accuracy = accuracy_score(labels[class_mask], predictions[class_mask])
        else:
            class_accuracy = float('nan')
        metrics[f"accuracy_class_{cls}"] = class_accuracy

    return metrics

In [56]:
#new code using SFTTrainer
import transformers
import gc

from trl import SFTTrainer

torch.cuda.empty_cache()
gc.collect()

# Training configuration
training_args = transformers.TrainingArguments(
    output_dir="outputs",
    save_strategy="no",
    logging_steps=100,
    # evaluation_strategy="steps" is intentionally left disabled
    # Batching: 8 examples per device, gradients accumulated over 2 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # Optimization
    num_train_epochs=5,
    learning_rate=1e-4,
    lr_scheduler_type="constant_with_warmup",
    warmup_ratio=0.08,
    weight_decay=0.01,
    optim="paged_adamw_8bit",
    fp16=False,
)

# Build the trainer: LoRA settings are passed via `peft_config`, batches are
# padded by DataCollatorWithPadding, and `compute_metrics` is used at
# evaluation time.
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=lora_config,
    args=training_args,
    max_seq_length=1024,
    data_collator=transformers.DataCollatorWithPadding(tokenizer),
    # NOTE(review): this function returns tokenized batches, not text, and both
    # datasets were already mapped through it -- confirm how the installed trl
    # version uses `formatting_func` on pre-tokenized data.
    formatting_func=tokenize_and_prepare_data,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [57]:
# Turn the generation cache off for training, then run the training loop.
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



Step,Training Loss
100,2.1034
200,0.8107
300,0.54
400,0.433
500,0.3046
600,0.2509
700,0.1659
800,0.2023
900,0.1149


TrainOutput(global_step=960, training_loss=0.5213503157099088, metrics={'train_runtime': 6041.5517, 'train_samples_per_second': 2.542, 'train_steps_per_second': 0.159, 'total_flos': 6.788101538512896e+16, 'train_loss': 0.5213503157099088, 'epoch': 5.0})

## Inference

In [58]:
# Re-enable the cache that was switched off for training
model.config.use_cache = True

In [59]:
# Step 1: Load the three ANLI test rounds
anli_r1, anli_r2, anli_r3 = (
    load_dataset("anli", split=split_name)
    for split_name in ("test_r1", "test_r2", "test_r3")
)

In [60]:
def process_anli_data(dataset):
    """Rename the target column and tokenize an evaluation split.

    Args:
        dataset: Dataset with ``premise``, ``hypothesis`` and ``label`` columns.

    Returns:
        The dataset after renaming ``label`` to ``labels`` and mapping
        ``tokenize_and_prepare_data`` over it in batches. Every column other
        than ``premise``, ``hypothesis`` and ``labels`` is passed to ``map`` as
        ``remove_columns``.
    """
    kept_columns = ('premise', 'hypothesis', 'labels')
    renamed = dataset.rename_column("label", "labels")
    dropped_columns = [name for name in renamed.column_names if name not in kept_columns]
    return renamed.map(
        tokenize_and_prepare_data,
        batched=True,
        remove_columns=dropped_columns,
    )

In [61]:
# Step 2: Process the data of every ANLI round
anli_r1, anli_r2, anli_r3 = (
    process_anli_data(anli_round)
    for anli_round in (anli_r1, anli_r2, anli_r3)
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [62]:
# Step 3: Define predict function with metrics
def predict_and_evaluate(dataset):
    """Evaluate the trainer on ``dataset`` and print the resulting metrics.

    Args:
        dataset: Tokenized evaluation split passed to ``trainer.evaluate``.

    Prints precision, recall, F1, overall accuracy and the accuracy of classes
    0, 1 and 2 (reported as entailment, neutral and contradiction). Returns
    nothing.
    """
    with torch.inference_mode():
        predictions = trainer.evaluate(dataset)

    # (caption, metric key) pairs, printed in this order
    report = (
        ("Precision:", 'eval_precision'),
        ("Recall:", 'eval_recall'),
        ("F1-score:", 'eval_f1-score'),
        ("Accuracy:", 'eval_accuracy'),
        ("Accuracy entailment:", 'eval_accuracy_class_0'),
        ("Accuracy neutral:", 'eval_accuracy_class_1'),
        ("Accuracy contradiction:", 'eval_accuracy_class_2'),
    )
    for caption, metric_key in report:
        print(caption, predictions[metric_key])

In [63]:
torch.cuda.empty_cache()

# Step 4: Run predictions and compute metrics for each ANLI round
for banner, anli_round in (
    ("Evaluating ANLI R1", anli_r1),
    ("Evaluating ANLI R2", anli_r2),
    ("Evaluating ANLI R3", anli_r3),
):
    print(banner)
    predict_and_evaluate(anli_round)

Evaluating ANLI R1


Precision: 0.6349865659483211
Recall: 0.623
F1-score: 0.6172665298566073
Accuracy: 0.623
Accuracy entailment: 0.6616766467065869
Accuracy neutral: 0.45645645645645644
Accuracy contradiction: 0.7507507507507507
Evaluating ANLI R2
Precision: 0.5146624039542324
Recall: 0.505
F1-score: 0.496012104240878
Accuracy: 0.505
Accuracy entailment: 0.5299401197604791
Accuracy neutral: 0.32732732732732733
Accuracy contradiction: 0.6576576576576577
Evaluating ANLI R3
Precision: 0.4766586999370505
Recall: 0.47583333333333333
F1-score: 0.47615858979722475
Accuracy: 0.47583333333333333
Accuracy entailment: 0.5174129353233831
Accuracy neutral: 0.417910447761194
Accuracy contradiction: 0.49242424242424243


In [64]:
# Load the "test_vitaminc" split and preprocess it with the same helper used
# for the ANLI rounds.
test_vitaminc = load_dataset("Gameselo/monolingual-wideNLI", split="test_vitaminc")
test_vitaminc = process_anli_data(test_vitaminc)

# Release unused cached GPU memory before evaluating
torch.cuda.empty_cache()

# Run predictions and compute metrics on the VitaminC split
print("Evaluating VitaminC")
predict_and_evaluate(test_vitaminc)

Evaluating VitaminC
Precision: 0.7499514761462612
Recall: 0.7425724637681159
F1-score: 0.7434276393294816
Accuracy: 0.7425724637681159
Accuracy entailment: 0.749907646841522
Accuracy neutral: 0.5273224043715847
Accuracy contradiction: 0.8087457952907257


In [None]:
# Save the trained model through the trainer to ./best_model_llama
trainer.save_model('./best_model_llama')