<a href="https://colab.research.google.com/github/RobertGodin/CodePython/blob/master/ExempleReglageFinDistilBertTweet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install the libraries used by this notebook (one %pip call per package)
%pip install numpy
%pip install pandas
%pip install transformers
%pip install datasets
%pip install evaluate
# %pip install --upgrade tensorflow==2.16.2
%pip install torch
%pip install huggingface_hub
%pip install accelerate



In [1]:
# Load the tweet sentiment data
nom_dataset = "mteb/tweet_sentiment_extraction"
import pandas as pd
from datasets import load_dataset
# Load the dataset identified by nom_dataset and print its structure
dataset_tweet = load_dataset(nom_dataset)
print(dataset_tweet)
# Build a DataFrame from the "train" split and preview its first three rows
df_ent = pd.DataFrame(dataset_tweet["train"])
df_ent.head(3)

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 27481
    })
    test: Dataset({
        features: ['id', 'text', 'label', 'label_text'],
        num_rows: 3534
    })
})


Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative


In [2]:
# Load the DistilBERT model and its tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# `AdamW` used to be exported by transformers but has been removed from recent
# releases; fall back to the PyTorch implementation so this cell keeps working
# with any installed version and the name `AdamW` stays available.
try:
  from transformers import AdamW
except ImportError:
  from torch.optim import AdamW
nom_modele = 'distilbert-base-uncased'
# Create a classification model from distilbert:
# a new output layer is added with a number of classes equal to num_labels
modele = AutoModelForSequenceClassification.from_pretrained(nom_modele, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(nom_modele)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Tokenization of the texts for training
def tokenize_text(data):
  """Tokenize the "text" field of ``data`` with the module-level ``tokenizer``.

  The tokenizer is called with padding="max_length" and truncation=True,
  and its result is returned unchanged.
  """
  textes = data["text"]
  return tokenizer(textes, padding="max_length", truncation=True)
# Apply tokenize_text to the whole dataset, in batches
dataset_vectorise = dataset_tweet.map(tokenize_text, batched=True)
# Shuffle with a fixed seed and keep the first 500 examples of each split
dataset_ent = dataset_vectorise["train"].shuffle(seed=42).select(range(500))  # small subset of 500 examples
dataset_test = dataset_vectorise["test"].shuffle(seed=42).select(range(500))
# df_ent is (re)bound to a DataFrame of the tokenized training subset, then previewed
df_ent=pd.DataFrame(dataset_ent)
df_ent.head(3)

Unnamed: 0,id,text,label,label_text,input_ids,attention_mask
0,fd8e3ede29,what happen to the maids of yours? Quit once ...,1,neutral,"[101, 2054, 4148, 2000, 1996, 29229, 1997, 673...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,9206b5221d,Listening to the commentary track on Holiday I...,2,positive,"[101, 5962, 2000, 1996, 8570, 2650, 2006, 6209...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,d6b51b539f,"chatting with some of my old classmates, helly...",1,neutral,"[101, 22331, 2007, 2070, 1997, 2026, 2214, 198...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [5]:
# A few example predictions before training
print("Predictions avant entrainement:")
print("--------------------------------------------------")
# Run these quick checks on the CPU
modele.to("cpu")
import torch
def predictions(liste_exemples):
  """Print the predicted sentiment for every text in ``liste_exemples``.

  Uses the module-level ``modele`` and ``tokenizer``. Each text is encoded
  on its own (batch of one), passed through the model, and the class with
  the highest logit is mapped to "negative", "neutral" or "positive".

  Args:
    liste_exemples: iterable of strings to classify.

  Returns:
    None. One line per text is printed.
  """
  classe_etiquettes = {0: "negative", 1: "neutral", 2: "positive"}
  # Inference mode: disables dropout so the output is deterministic.
  # The model is left in eval mode afterwards.
  modele.eval()
  # No gradients are needed for prediction; skipping them saves memory and time.
  with torch.no_grad():
    for texte in liste_exemples:
      # truncation=True keeps over-long texts within the model's input limit
      entrees = tokenizer.encode(texte, return_tensors="pt", truncation=True)
      # Put the inputs on whatever device the model currently lives on
      entrees = entrees.to(modele.device)
      logits = modele(entrees).logits
      # Take the argmax over the class dimension of the single-row batch
      indice_classe = torch.argmax(logits, dim=-1).item()
      sentiment = classe_etiquettes[indice_classe]
      print("Analyse de sentiment pour '",texte,"' est :", sentiment)
# Sample sentences passed to predictions() to print a sentiment for each one
liste_exemples = ["This is a very good comment.", "I don't recommend this restaurant.", 
                   "This is better that most !", "Don't ever go there !", "This is a good and bad thing."]
predictions(liste_exemples)

Predictions avant entrainement:
--------------------------------------------------
Analyse de sentiment pour ' This is a very good comment. ' est : neutral
Analyse de sentiment pour ' I don't recommend this restaurant. ' est : neutral
Analyse de sentiment pour ' This is better that most ! ' est : neutral
Analyse de sentiment pour ' Don't ever go there ! ' est : neutral
Analyse de sentiment pour ' This is a good and bad thing. ' est : neutral


In [None]:
import inspect
from transformers import Trainer, TrainingArguments
# The keyword that selects the evaluation schedule was renamed from
# `evaluation_strategy` to `eval_strategy` in newer transformers releases, and
# the old spelling is no longer accepted there. Pick whichever name the
# installed version exposes so the cell runs on both old and new versions.
_parametres_ta = inspect.signature(TrainingArguments.__init__).parameters
_cle_strategie_eval = "eval_strategy" if "eval_strategy" in _parametres_ta else "evaluation_strategy"
arguments_training = TrainingArguments(
    output_dir='dossier_bert_ft_tweet', # Directory for the results
    num_train_epochs=10, # Default is 3
    learning_rate=2e-5, # Learning rate
    per_device_train_batch_size=4, # Mini-batch size
    per_device_eval_batch_size=4,
    weight_decay=0.01, # Weight decay, for regularization
    **{_cle_strategie_eval: "epoch"}, # Evaluate at the end of every epoch
)
import evaluate
import numpy as np
# Accuracy metric, used by the metric callback given to the Trainer
exactitude = evaluate.load("accuracy")
def calcul_metriques(p):
    """Compute accuracy from a (logits, labels) pair.

    ``p`` is unpacked into two values: the logits and the reference labels.
    The predicted class is the argmax of the logits along axis 1, and the
    result of ``exactitude.compute`` is returned as-is.
    """
    scores, references = p
    classes_predites = np.argmax(scores, axis=1)
    return exactitude.compute(predictions=classes_predites, references=references)

# Build the Trainer from the model, the training arguments, the training and
# evaluation subsets, and the metric function
entraineur = Trainer(
    model=modele,
    args=arguments_training,
    train_dataset=dataset_ent,
    eval_dataset=dataset_test,
    compute_metrics=calcul_metriques
)
# Run the fine-tuning
entraineur.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Predictions after training, for the same examples
print("Predictions après entrainement:")
print("--------------------------------------------------")
# Move the model to the CPU before calling the prediction helper
modele.to("cpu")
predictions(liste_exemples)

The sentiment for 'how on earth can I analyse this?' is:  negative
