<a href="https://colab.research.google.com/github/RobertGodin/CodePython/blob/master/Notebooks/LLM/ExempleReglageFinDistilBertTweetLORA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Installation des bibliothèques
%pip install numpy
%pip install pandas
%pip install transformers
%pip install datasets
%pip install peft
%pip install evaluate
%pip install torch
%pip install huggingface_hub
%pip install accelerate



In [10]:
# Load the tweet sentiment dataset
import pandas as pd
from datasets import load_dataset

nom_dataset = "mteb/tweet_sentiment_extraction"
dataset_tweet = load_dataset(nom_dataset)
print(dataset_tweet)

# Preview the first rows of the training split as a DataFrame
df_ent = pd.DataFrame(dataset_tweet["train"])
df_ent.head(3)

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})


Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative


In [11]:
# Load the DistilBERT model and tokenizer.
# NOTE: the former `AdamW` import was dropped: it was never used in this notebook
# and recent `transformers` releases no longer export it (ImportError on import).
from transformers import AutoModelForSequenceClassification, AutoTokenizer
nom_modele = 'distilbert-base-uncased'
# Build a sequence-classification model on top of DistilBERT:
# a new output head is added with num_labels classes (its weights are newly initialized)
modele = AutoModelForSequenceClassification.from_pretrained(nom_modele, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(nom_modele)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Tokenize the texts for training
def tokenize_text(data):
  """Tokenize the ``"text"`` field of a batch with the notebook-level ``tokenizer``.

  Sequences are truncated but deliberately NOT padded here: padding every tweet
  to the model's maximum length makes each training step far more expensive than
  needed. Padding is left to the data collator, which pads each mini-batch to
  the length of its longest sequence.

  Args:
    data: a batch (mapping) containing a ``"text"`` entry.

  Returns:
    Whatever the tokenizer returns for that batch of texts.
  """
  return tokenizer(data["text"], truncation=True)
# Tokenize every split, batch by batch
dataset_vectorise = dataset_tweet.map(tokenize_text, batched=True)
print(f"Dataset vectorisé: {dataset_vectorise}")

# Keep a small, reproducibly shuffled sample of 500 rows per split
dataset_ent = (
    dataset_vectorise["train"]
    .shuffle(seed=42)
    .select(range(500))
)
dataset_test = (
    dataset_vectorise["test"]
    .shuffle(seed=42)
    .select(range(500))
)

# Preview the tokenized training sample
df_ent = pd.DataFrame(dataset_ent)
df_ent.head(3)

Dataset vectorisé: DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
        num_rows: 3534
    })
})


Unnamed: 0,id,text,label,label_text,input_ids,attention_mask
0,fd8e3ede29,what happen to the maids of yours? Quit once ...,1,neutral,"[101, 2054, 4148, 2000, 1996, 29229, 1997, 673...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,9206b5221d,Listening to the commentary track on Holiday I...,2,positive,"[101, 5962, 2000, 1996, 8570, 2650, 2006, 6209...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,d6b51b539f,"chatting with some of my old classmates, helly...",1,neutral,"[101, 22331, 2007, 2070, 1997, 2026, 2214, 198...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [13]:
# A few example predictions before fine-tuning
print(
    "Predictions avant entrainement:",
    "--------------------------------------------------",
    sep="\n",
)
# The prediction helper builds CPU tensors, so keep the model on the CPU
modele.to("cpu")
import torch
def predictions(liste_exemples):
  """Print the predicted sentiment for each text in ``liste_exemples``.

  Relies on the notebook-level ``tokenizer`` and ``modele``. The model is put in
  evaluation mode and gradients are disabled, so dropout cannot randomize the
  output (the model may have been left in training mode, e.g. after fine-tuning)
  and no autograd graph is built.

  Args:
    liste_exemples: iterable of strings to classify.
  """
  classe_etiquettes = {0: "negative", 1: "neutral", 2: "positive"}
  modele.eval()
  with torch.no_grad():
    for texte in liste_exemples:
      entrees = tokenizer.encode(texte, return_tensors="pt")
      logits = modele(entrees).logits
      # argmax over the class dimension; one text per call, hence .item()
      classe_predite = torch.argmax(logits, dim=-1).item()
      sentiment = classe_etiquettes[classe_predite]
      print("Analyse de sentiment pour '",texte,"' est :", sentiment)
# Sample sentences, classified both before and after fine-tuning
liste_exemples = [
    "This is a very good comment.",
    "I don't recommend this restaurant.",
    "This is better that most !",
    "Don't ever go there !",
    "This is a good and bad thing.",
]
predictions(liste_exemples)

Predictions avant entrainement:
--------------------------------------------------
Analyse de sentiment pour ' This is a very good comment. ' est : negative
Analyse de sentiment pour ' I don't recommend this restaurant. ' est : negative
Analyse de sentiment pour ' This is better that most ! ' est : negative
Analyse de sentiment pour ' Don't ever go there ! ' est : negative
Analyse de sentiment pour ' This is a good and bad thing. ' est : negative


In [14]:
# Wrap the classifier with LoRA adapters so that only a small fraction of the
# parameters is trained.
import peft
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
peft_config = LoraConfig(task_type="SEQ_CLS",        # sequence classification
                        r=4,                         # rank of the low-rank update matrices
                        lora_alpha=32,               # scaling factor applied to the LoRA update
                        lora_dropout=0.01,           # dropout applied on the LoRA branch
                        target_modules = ['q_lin'])  # adapt only the modules named q_lin
# Guard against wrapping twice: re-running this cell on an already wrapped
# model would otherwise apply PEFT to it a second time.
if not isinstance(modele, PeftModel):
    modele = get_peft_model(modele, peft_config)
modele.print_trainable_parameters()

trainable params: 629,763 || all params: 67,585,542 || trainable%: 0.9318


In [15]:
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
# Training hyper-parameters.
# `eval_strategy` replaces the deprecated `evaluation_strategy` keyword, which
# recent `transformers` releases reject with a TypeError.
arguments_training = TrainingArguments(
    output_dir='dossier_bert_ft_tweet', # Output directory for results
    eval_strategy="epoch", # Evaluate at the end of every epoch
    num_train_epochs=10, # Library default is 3
    learning_rate=2e-5, # Learning rate
    per_device_train_batch_size=4, # Mini-batch size
    per_device_eval_batch_size=4,
    weight_decay=0.01 # Weight decay (regularization)
)
import evaluate
import numpy as np
# Accuracy metric used by the evaluation callback
exactitude = evaluate.load("accuracy")
def calcul_metriques(p):
    """Compute the accuracy for one evaluation pass.

    Args:
        p: a ``(logits, labels)`` pair.

    Returns:
        The result of ``exactitude.compute`` for the arg-max class of each row
        of the logits, compared against the reference labels.
    """
    scores, references = p
    classes_predites = np.asarray(scores).argmax(axis=1)
    return exactitude.compute(predictions=classes_predites, references=references)

# Collator that pads the examples of each mini-batch to a common length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Assemble the Trainer from the model, hyper-parameters, data and metric callback
entraineur = Trainer(
    model=modele,
    args=arguments_training,
    data_collator=data_collator,
    train_dataset=dataset_ent,
    eval_dataset=dataset_test,
    compute_metrics=calcul_metriques,
)

# Run the fine-tuning loop
entraineur.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Predictions for the same examples after fine-tuning
print(
    "Predictions après entrainement:",
    "--------------------------------------------------",
    sep="\n",
)
# The prediction helper builds CPU tensors, so move the model back to the CPU first
modele.to("cpu")
predictions(liste_exemples)

Analyse de sentiment pour 'This is a very nice tweet' est:  neutral


In [None]:
# Display the module tree of the LoRA-wrapped model (bare expression -> notebook repr)
modele

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): MultiHeadSelfAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=76

In [None]:
# `analyse_sentiment` is not defined anywhere in this notebook (NameError on a
# fresh kernel); use the `predictions` helper, which prints the sentiment itself.
predictions(['This is a very nice tweet'])

Analyse de sentiment pour 'This is a very nice tweet' est:  neutral
