# Exemple de réglage fin avec la stratégie LoRA sur données IMDB
Adapté de https://github.com/ShawhinT/YouTube-Blog/blob/main/LLMs/fine-tuning/ft-example.ipynb

In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
import peft
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


### Création du dataset avec 1000 exemples de IMDB

In [2]:
# Charger les données d'évaluation de films de imdb
dataset_imdb = load_dataset("imdb")
N = 1000 # taille de la sous-collection
np.random.seed(22)
indices_aleatoires = np.random.randint(24999, size=N)

# Extraire 1000 exemples
x_ent = dataset_imdb['train'][indices_aleatoires]['text']
y_ent = dataset_imdb['train'][indices_aleatoires]['label']

x_test = dataset_imdb['test'][indices_aleatoires]['text']
y_test = dataset_imdb['test'][indices_aleatoires]['label']

# Dataset pour la sous-collection
dataset = DatasetDict({'train':Dataset.from_dict({'label':y_ent,'text':x_ent}),
                      'test':Dataset.from_dict({'label':y_test,'text':x_test})})# load dataset

Downloading readme: 100%|██████████| 7.81k/7.81k [00:00<?, ?B/s]
Downloading data: 100%|██████████| 21.0M/21.0M [00:00<00:00, 40.7MB/s]
Downloading data: 100%|██████████| 20.5M/20.5M [00:00<00:00, 46.1MB/s]
Downloading data: 100%|██████████| 42.0M/42.0M [00:00<00:00, 53.7MB/s]
Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 173026.35 examples/s]
Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 242630.45 examples/s]
Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 261843.86 examples/s]


In [3]:
# pourcentage d'évaluations positives
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.506

In [26]:
dataset["train"][:5]

{'label': [0, 1, 1, 1, 1],
 'text': ["I purchased this video quite cheaply ex-rental, thinking that the cover looked quite nice. And it was nice, but the movie is trash. I can handle B-grade, I sometimes even enjoy a good B romp (ie. 'Surf Nazis Must Die' is a classic example of how entertaining the genre can be), but this was just bland bland bland. Incredibly dull scenes were broken up too sparsely by good wholesome cheap porn and entertaining dream horror sequences. This movie has very little to offer.",
  'What is contained on this disk is a first rate show by a first rate band. This disc is NOT for the faint of heart...the music is incredibly intense, and VERY cool. What you will learn when you watch this movie is just why the Who was so huge for so long. It is true that their records were great, but their shows were the top of the heap. In 1969 when this concert was shot, the screaming teenie boppers that threw jelly beans at the Beatles were gone and bands (and audiences) had se

### Création du modèle

In [4]:
nom_modele = 'distilbert-base-uncased'
# model_checkpoint = 'roberta-base' # you can alternatively use roberta-base but this model is bigger thus training will take longer

# Traduction des étiquettes de classe
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative":0, "Positive":1}
modele = AutoModelForSequenceClassification.from_pretrained(
    nom_modele, num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Vectorisation des textes

In [5]:
tokenizer = AutoTokenizer.from_pretrained(nom_modele)
def tokenize(donnees_texte):
    jetons = tokenizer(donnees_texte['text'], truncation=True)
    return jetons

ds_jetons = dataset.map(tokenize, batched=True)
ds_jetons

Map: 100%|██████████| 1000/1000 [00:00<00:00, 2447.91 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 2271.01 examples/s]


DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [6]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Métrique d'évaluation

In [10]:
accuracy = evaluate.load("accuracy")
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Test avant entrainement

In [12]:
liste_exemples = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]
print("Prédictions avant entrainement:")
for texte in liste_exemples:
    inputs = tokenizer.encode(texte, return_tensors="pt")
    # compute logits
    logits = modele(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)
    print(texte + " - " + id2label[predictions.tolist()])

Prédictions avant entrainement:
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


### Entrainement avec LoRA

In [13]:
peft_config = LoraConfig(task_type="SEQ_CLS",r=4,lora_alpha=32,
              lora_dropout=0.01,target_modules = ['q_lin'])
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [17]:
modele = get_peft_model(modele, peft_config)
modele.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [19]:
training_args = TrainingArguments(
    output_dir= nom_modele + "-lora-classification",
    learning_rate=1e-3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=modele,
    args=training_args,
    train_dataset=ds_jetons["train"],
    eval_dataset=ds_jetons["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.707886,{'accuracy': 0.494}
2,0.736600,0.695795,{'accuracy': 0.494}
3,0.736600,0.697977,{'accuracy': 0.494}
4,0.727300,0.699477,{'accuracy': 0.494}
5,0.727300,0.700064,{'accuracy': 0.494}
6,0.700800,0.693188,{'accuracy': 0.494}
7,0.700800,0.69382,{'accuracy': 0.494}
8,0.695300,0.693261,{'accuracy': 0.506}
9,0.695300,0.693252,{'accuracy': 0.506}
10,0.694300,0.693115,{'accuracy': 0.506}


TrainOutput(global_step=2500, training_loss=0.7108740356445312, metrics={'train_runtime': 12830.4685, 'train_samples_per_second': 0.779, 'train_steps_per_second': 0.195, 'total_flos': 1111722294204960.0, 'train_loss': 0.7108740356445312, 'epoch': 10.0})

### Prédictions après entrainement

In [21]:
print("Prédictions après entrainement:")
for texte in liste_exemples:
    inputs = tokenizer.encode(texte, return_tensors="pt")
    logits = modele(inputs).logits
    predictions = torch.argmax(logits)
    print(texte + " - " + id2label[predictions.tolist()])

Prédictions après entrainement:
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


In [None]:
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])


### Optional: push model to hub

In [None]:
# option 1: notebook login
from huggingface_hub import notebook_login
notebook_login() # ensure token gives write access

# # option 2: key login
# from huggingface_hub import login
# write_key = 'hf_' # paste token here
# login(write_key)

In [None]:
hf_name = 'shawhin' # your hf username or org name
model_id = hf_name + "/" + model_checkpoint + "-lora-text-classification" # you can name the model whatever you want

In [None]:
model.push_to_hub(model_id) # save model

In [None]:
trainer.push_to_hub(model_id) # save trainer

### Optional: load peft model

In [None]:
# how to load peft model from hub for inference
config = PeftConfig.from_pretrained(model_id)
inference_model = AutoModelForSequenceClassification.from_pretrained(
    config.base_model_name_or_path, num_labels=2, id2label=id2label, label2id=label2id
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)
model = PeftModel.from_pretrained(inference_model, model_id)