In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
model_id = "lightblue/suzume-llama-3-8B-multilingual"

model = AutoModelForSequenceClassification.from_pretrained(model_id, quantization_config=bnb_config, num_labels=3, device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at lightblue/suzume-llama-3-8B-multilingual and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from datasets import load_dataset

data = load_dataset("ankitkupadhyay/XNLI")["train"]

data = data.shuffle(seed=1234)  # Shuffle dataset here
data = data.rename_column("label", "labels")

# Explore the data
df = data.to_pandas()
df.head(10)

Unnamed: 0,premise,hypothesis,labels
0,'Блискам ли? ',Тогава бях много емоционален.,1
1,Matumizi zaidi!,Matumizi hayana manufaa katika hali hii.,2
2,Най-очарователната структура на Калтън Хил е Н...,Националният паметник е най-голямата структура...,0
3,Vigezo vya News ' vinaweza kutoa mzozo kama huu.,Vigezo vipya vinaweza kuleta mzozo.,0
4,"Helen Gurley Brown, sobre los muchos aspectos ...",Helen Gurley Brown cree que el acoso sexual es...,2
5,เขาเขียนถึงฉันในอีเมล ฉันเชื่อว่าหลักฐานของ To...,อีเมลได้รวมรายละเอียดของหลักฐาน,1
6,Các nhân vật chính nhận được nhiều sự điều trị...,Thật kỳ lạ khi cả nhân vật chính và phụ đều đư...,0
7,一楼是法兰西历史博物馆，珍藏着她一生中唯一已知的圣女贞德肖像和路易十六保存的日记等珍品。,圣女贞德的画像在五楼。,2
8,اس معاملے میں، اس کا بوڑھا دھوکہ دینے والا اس ...,Ginsburg نے اس کی نمائندگی کے باوجود اسے دھوکہ...,1
9,Ngài James gõ bàn một cách khá sốt ruột.,Ngài James đã mất kiên nhẫn.,0


In [5]:
data = data.train_test_split(test_size=0.1)
train_data = data["train"]
test_data = data["test"]

In [6]:
# Fonction pour sampler 8 exemples par classe
def sample_per_class(dataset, num_samples=8):
    # Obtenir un index unique pour chaque classe
    class_indices = {label: [] for label in set(dataset['labels'])}

    # Accumuler les indices pour chaque classe
    for index, label in enumerate(dataset['labels']):
        class_indices[label].append(index)

    # Sélectionner aléatoirement num_samples indices pour chaque classe
    import random
    sampled_indices = [index for label, indices in class_indices.items()
                       for index in random.sample(indices, num_samples)]

    # Créer un nouveau dataset à partir des indices échantillonnés
    sampled_dataset = dataset.select(sampled_indices)
    return sampled_dataset

# Appliquer la fonction
train_data = sample_per_class(train_data)

In [7]:
def tokenize_and_prepare_data(examples):
    # Tokeniser chaque texte dans le batch
    text = f"Is this true? {examples['premise']} implies {examples['hypothesis']}"
    result = tokenizer(text,truncation=True,   
                       max_length=1000,
                       return_overflowing_tokens=True)
    sample_map = result.pop("overflow_to_sample_mapping")
    for key, values in examples.items():
        result[key] = [values[i] for i in sample_map]
    return result

model.config.pad_token_id = model.config.eos_token_id

tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Apply the tokenization and preparation function
train_data = train_data.map(tokenize_and_prepare_data, batched=True)
test_data = test_data.map(tokenize_and_prepare_data, batched=True)

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

Map:   0%|          | 0/589053 [00:00<?, ? examples/s]

In [8]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [9]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=8,
    lora_alpha=24,
    lora_dropout=0.15,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)

peft_model = get_peft_model(model, lora_config)
print_trainable_parameters(peft_model)

trainable params: 3420160 || all params: 4018696192 || trainable%: 0.08510620949173757


In [14]:
import evaluate
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred  # eval_pred is the tuple of predictions and labels returned by the model
    predictions = np.argmax(logits, axis=-1)
    
    # Calculate metrics, assuming 'average' as 'weighted' for handling multiclass classification
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')
    accuracy = accuracy_score(labels, predictions)
    
    # Return a dictionary with the computed metrics
    return {
        "precision": precision, 
        "recall": recall, 
        "f1-score": f1, 
        "accuracy": accuracy
    }

In [18]:
#new code using SFTTrainer
import transformers

from trl import SFTTrainer

torch.cuda.empty_cache()

# Configuration des arguments de l'entraînement
training_args = transformers.TrainingArguments(
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_ratio=0.08,
    learning_rate=1e-4,
    output_dir="outputs",
    optim="paged_adamw_8bit",
    save_strategy="no",
    evaluation_strategy="epoch",
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=lora_config,
    args=training_args,
    max_seq_length=1024,
    data_collator=transformers.DataCollatorWithPadding(tokenizer),
    formatting_func=tokenize_and_prepare_data,
    compute_metrics=compute_metrics
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [19]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()



Step,Training Loss


TrainOutput(global_step=3, training_loss=0.9407986005147299, metrics={'train_runtime': 24.6741, 'train_samples_per_second': 0.243, 'train_steps_per_second': 0.122, 'total_flos': 251388739584000.0, 'train_loss': 0.9407986005147299, 'epoch': 3.0})

## Inference

In [20]:
model.config.use_cache = True

In [21]:
# Step 1: Load ANLI test rounds
anli_r1 = load_dataset("anli", split="train_r1[:10%]")
anli_r2 = load_dataset("anli", split="train_r2[:10%]")
anli_r3 = load_dataset("anli", split="train_r3[:10%]")

In [22]:
# Function to process the ANLI dataset rounds
def process_anli_data(dataset):
    # Tokenize the data
    dataset = dataset.rename_column("label", "labels")
    dataset = dataset.map(tokenize_and_prepare_data, batched=True, remove_columns=[col for col in dataset.column_names if col not in ['premise', 'hypothesis', 'labels']])
    return dataset

# Step 2: Process the data
anli_r1 = process_anli_data(anli_r1)
anli_r2 = process_anli_data(anli_r2)
anli_r3 = process_anli_data(anli_r3)

Map:   0%|          | 0/1695 [00:00<?, ? examples/s]

Map:   0%|          | 0/4546 [00:00<?, ? examples/s]

Map:   0%|          | 0/10046 [00:00<?, ? examples/s]

In [25]:
# Step 3: Define predict function with metrics
def predict_and_evaluate(dataset):
    predictions = trainer.evaluate(dataset)
    print("Precision:", predictions['eval_precision'])
    print("Recall:", predictions['eval_recall'])
    print("F1-score:", predictions['eval_f1-score'])
    print("Accuracy:", predictions['eval_accuracy'])

# Step 4: Run predictions and compute metrics for each ANLI round
print("Evaluating ANLI R1")
predict_and_evaluate(anli_r1)

print("Evaluating ANLI R2")
predict_and_evaluate(anli_r2)

print("Evaluating ANLI R3")
predict_and_evaluate(anli_r3)

Evaluating ANLI R1


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.5415686495386853
Recall: 0.4658385093167702
F1-score: 0.49771965276113617
Accuracy: 0.4658385093167702
Evaluating ANLI R2


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Precision: 0.6721870174130677
Recall: 0.44075829383886256
F1-score: 0.5057540165720672
Accuracy: 0.44075829383886256
Evaluating ANLI R3




Precision: 0.36966582017785765
Recall: 0.35599078341013823
F1-score: 0.36202093284097037
Accuracy: 0.35599078341013823
