### Installation des packages nécessaires au projet

In [1]:
!pip install transformers==4.41.1
!pip install sentencepiece
!pip install accelerate
!pip install torch
!pip install datasets evaluate
!pip install -U bitsandbytes
!pip install huggingface_hub
!pip install peft==0.5.0

!pip install loguru
!pip install --upgrade peft
!pip install --upgrade transformers

Collecting transformers==4.41.1
  Downloading transformers-4.41.1-py3-none-any.whl.metadata (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.8/43.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers==4.41.1)
  Downloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m64.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.19.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
  

### Connexion HuggingFace

## Importez toutes les librairies nécessaires

In [3]:
import json
import re
import os
import random
import pandas as pd
import numpy as np
import torch
import datasets
import evaluate
from loguru import logger
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from peft.utils import TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType

## Chargez le dataset analyse de sentiment depuis HuggingFace

In [4]:
id2label = {0:"negative", 1:"positive", 2:"neutral"}
label2id = {"negative":0, "positive":1, "neutral":2}
dataset = load_dataset("zeroshot/twitter-financial-news-sentiment")
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

sent_train.csv:   0%|          | 0.00/859k [00:00<?, ?B/s]

sent_valid.csv:   0%|          | 0.00/217k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9543 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2388 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 9543
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2388
    })
})

In [5]:
# Converter dataset to Pandas dataframe
def convert_to_pandas(dataset):
    dataset = dataset.to_pandas()
    #dataset['label'] = dataset['label'].apply(lambda x: dic[x])
    #dataset['instruction'] = "What is the sentiment of this tweet ? Please choose an answer from {negative/neutral/positive}"
    #dataset.columns = ['input', 'output', 'instruction']
    # shuffle dataset
    dataset = dataset.sample(frac=1).reset_index(drop=True)
    return dataset

In [6]:
dataset_train = convert_to_pandas(dataset['train'])
dataset_val = convert_to_pandas(dataset['validation'])
dataset_train.head()

Unnamed: 0,text,label
0,"$LITB - LightInTheBox Holding Co., Ltd. (LITB)...",2
1,$ENPH - Enphase: Promising Growth Vision. http...,1
2,"Benzinga Pro's Top 6 Stocks To Watch For Mon.,...",2
3,Walmart's Jet is ending its fresh-food deliver...,0
4,Corporate leverage all time high https://t.co/...,2


In [83]:
# Afficher les graphiques
import plotly.express as px

df = dataset_train.to_pandas()
df["labels_cat"] = df["label"].map(lambda x : id2label[x])

# Calculate percentages for each label
label_counts = df['labels_cat'].value_counts()
percentages = (label_counts / len(df)) * 100

fig = px.histogram(df, x="labels_cat", color="label", title="Distribution des labels dans le dataset train")
fig.update_layout(yaxis_title="value_counts")
fig.update_layout(bargap=0.2)
fig.update_layout(bargroupgap=0.1)

for i, v in enumerate(percentages):
    fig.add_annotation(x=label_counts.index[i], y=label_counts[i] - 150,
                       text=f"{v:.2f}%", showarrow=False, font=dict(size=12))
fig.show()


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



In [85]:
df_val = dataset_val.to_pandas()
df_val["labels_cat"] = df_val["label"].map(lambda x : id2label[x])

# Calculate percentages for each label
label_counts = df_val['labels_cat'].value_counts()
percentages = (label_counts / len(df_val)) * 100

fig = px.histogram(df_val, x="labels_cat", color="label", title="Distribution des labels dans le dataset val")
fig.update_layout(yaxis_title="value_counts")
fig.update_layout(bargap=0.2)
fig.update_layout(bargroupgap=0.1)

for i, v in enumerate(percentages):
    fig.add_annotation(x=label_counts.index[i], y=label_counts[i] - 35 ,
                       text=f"{v:.2f}%", showarrow=False, font=dict(size=12))
fig.show()


Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`



## Split les données de validation en dataset_val (50%) et en dataset test(50%)

In [7]:
# Create test dataset
dataset_val, dataset_test = dataset_val.iloc[:len(dataset_val) // 2], dataset_val.iloc[len(dataset_val) // 2:]
dataset_val.shape, dataset_test.shape

((1194, 2), (1194, 2))

In [8]:
# create directorie to store the all dataset
os.makedirs('financial_sentiment_dataset', exist_ok=True)

# Save the all datastes(train, val, test)
dataset_train.to_csv('financial_sentiment_dataset/train.csv', index=False)
dataset_val.to_csv('financial_sentiment_dataset/val.csv', index=False)
dataset_test.to_csv('financial_sentiment_dataset/test.csv', index=False)

In [9]:
dataset_train.isnull().sum()

Unnamed: 0,0
text,0
label,0


In [10]:
dataset_train["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,6178
1,1923
0,1442


In [11]:
# Reconvert Pandas dataframe to HuggingFace datasets
dataset_train = datasets.Dataset.from_pandas(dataset_train)
dataset_val = datasets.Dataset.from_pandas(dataset_val)
dataset_test = datasets.Dataset.from_pandas(dataset_test)
dataset_train

Dataset({
    features: ['text', 'label'],
    num_rows: 9543
})

##  Chargez le modèle Pré-entrainé de BERT et le AutoTokenizer

In [12]:
model_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
def tokenize_function(examples):
    # Extract the inputs from the examples
    inputs = examples["text"]
    # Tokenize and truncate text
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        inputs,
        truncation=True,
        max_length=512,
        return_tensors="np"
    )
    return tokenized_inputs

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [14]:
# Ajout pad token si none exists
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({"pad_token": "[PAD]"})
  model.resize_token_embeddings(len(tokenizer))

# tokenize training and validation dataset
tokenize_dataset_train = dataset_train.map(tokenize_function, batched=True)
tokenize_dataset_val = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/9543 [00:00<?, ? examples/s]

Map:   0%|          | 0/1194 [00:00<?, ? examples/s]

In [15]:
# Evaluation métriques
accuracy = evaluate.load("accuracy")
# Créer une fonction d'évaluation
def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)
  return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
# créer un data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tune BERT

In [17]:
# Fine tuner avec LoRA(Low Rank Adaption
peft_config = LoraConfig(task_type="SEQ_CLS",
                         r=128,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules=["query"])

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 2,361,603 || all params: 111,846,150 || trainable%: 2.1115


In [18]:
# Hyperparamétres
lr = 1e-3
batch_size = 4
num_epochs = 10

# Créer un training arugments
training_args = TrainingArguments(
    output_dir="./Fine-tune-Bert-base-uncased-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True)



In [19]:
# Creéer un objet trainer pour l'entrainement
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenize_dataset_train,
    eval_dataset=tokenize_dataset_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics)
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msaidaliosman925[0m ([33msaidaliosman925-les-crous[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6488,0.562129,{'accuracy': 0.7981574539363484}
2,0.6561,0.542896,{'accuracy': 0.7922948073701842}
3,0.6274,0.595415,{'accuracy': 0.8065326633165829}
4,0.6478,0.594359,{'accuracy': 0.8065326633165829}
5,0.6455,0.535506,{'accuracy': 0.8157453936348409}
6,0.5946,0.54925,{'accuracy': 0.8165829145728644}
7,0.5678,0.564009,{'accuracy': 0.8257956448911222}
8,0.5512,0.526669,{'accuracy': 0.8341708542713567}
9,0.4867,0.513777,{'accuracy': 0.8400335008375209}
10,0.4736,0.52009,{'accuracy': 0.8366834170854272}


Trainer is attempting to log a value of "{'accuracy': 0.7981574539363484}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.7922948073701842}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8065326633165829}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8065326633165829}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.8157453936348409}" o

TrainOutput(global_step=23860, training_loss=0.5922768739123156, metrics={'train_runtime': 1163.8532, 'train_samples_per_second': 81.995, 'train_steps_per_second': 20.501, 'total_flos': 2142177906953052.0, 'train_loss': 0.5922768739123156, 'epoch': 10.0})

# Evaluation du modèle

In [20]:
# Enregistrer le modèle
trainer.save_model("./drive/MyDrive/Fine-tuned-model_Bert-base-uncased-110M-sentiment_analysis")
#tokenizer.save_pretrained("./drive/MyDrive/Fine-tuned-model_Bert-base-uncased-110M-sentiment_analysis")

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [22]:
dataset_encoded_test =  dataset_test.map(tokenize_function, batched=True, batch_size=None)

Map:   0%|          | 0/1194 [00:00<?, ? examples/s]

In [23]:
dataset_encoded_test

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1194
})

In [32]:
print(f"Text : {dataset_encoded_test['text'][0]}")
inputs = tokenizer(dataset_encoded_test["text"][0], return_tensors="pt").to(model.device) # Move inputs to the same device as the model
inputs

Text : Heard on the Street: Dubai shows up Boeing’s wide-body woes https://t.co/Jla8Ap1CJm


{'input_ids': tensor([[  101,  2657,  2006,  1996,  2395,  1024, 11558,  3065,  2039, 10321,
          1521,  1055,  2898,  1011,  2303, 24185,  2229, 16770,  1024,  1013,
          1013,  1056,  1012,  2522,  1013,  1046,  2721,  2620,  9331,  2487,
          2278, 24703,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [49]:
predictions = []
for i in range(len(dataset_encoded_test)):
  inputs = tokenizer(dataset_encoded_test["text"][i], return_tensors="pt").to(model.device) # Move inputs to the same device as the model

  outputs = model(**inputs)

  predicted_class = outputs.logits.argmax().item()
  predictions.append(predicted_class)

In [51]:
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
print(f"Accuracy : {accuracy_score(dataset_encoded_test['label'], predictions)}")
print(f"Precision : {precision_score(dataset_encoded_test['label'], predictions, average='macro')}")
print(f"Recall : {recall_score(dataset_encoded_test['label'], predictions, average='macro')}")
print(f"F1-score : {f1_score(dataset_encoded_test['label'], predictions, average='macro')}")

Accuracy : 0.8534338358458962
Precision : 0.8102353880925371
Recall : 0.7866106652240181
F1-score : 0.7975269756236215
