<a href="https://colab.research.google.com/github/Tigropoil/SAE_S6/blob/main/create_model_bert_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

# Creation du modèle

In [3]:
# Charger les données
data_url = '/content/drive/MyDrive/SAE S6/data_fusion_sentiment.csv'
data = pd.read_csv(data_url)

In [4]:
# Prendre un set de 50000 lignes
data = data.sample(n=50000, random_state=42)

In [5]:
# Sélection des colonnes pertinentes
columns_to_keep = ['revue/texte', 'Sentiment']
data = data[columns_to_keep].dropna()

In [6]:
data['Sentiment'].unique()

array(['Positif', 'Negatif', 'Neutre'], dtype=object)

In [7]:
data

Unnamed: 0,revue/texte,Sentiment
31738,"As a mid-lifer, looking back over a 25 year sp...",Positif
46142,"This is ""The Song Of Roland,"" as Dorothy Sayer...",Positif
81412,I just sat down for a quiet night of reading s...,Negatif
97921,Just had to write a comment about this book. I...,Positif
63026,Daisetz Teitaro Suzuki (1870-1966) was a Japan...,Positif
...,...,...
89515,This book helped me beyond belief in studying ...,Positif
241620,This book was better than two other far more e...,Positif
50069,dr. hirsch's book is now 30 years old. he sort...,Positif
67475,"""The Final Crumpet"" is the second offering in ...",Positif


In [8]:
# transformer les valeurs de sentiment en -1 0 1
data['Sentiment'] = data['Sentiment'].map({'Negatif': 0, 'Neutre': 1, 'Positif': 2})

In [9]:
print(data)

                                              revue/texte  Sentiment
31738   As a mid-lifer, looking back over a 25 year sp...          2
46142   This is "The Song Of Roland," as Dorothy Sayer...          2
81412   I just sat down for a quiet night of reading s...          0
97921   Just had to write a comment about this book. I...          2
63026   Daisetz Teitaro Suzuki (1870-1966) was a Japan...          2
...                                                   ...        ...
89515   This book helped me beyond belief in studying ...          2
241620  This book was better than two other far more e...          2
50069   dr. hirsch's book is now 30 years old. he sort...          2
67475   "The Final Crumpet" is the second offering in ...          2
138193  to enjoy. The story is about young Argante, da...          2

[50000 rows x 2 columns]


In [10]:
# récupérer les labels
num_labels = data['Sentiment'].nunique()

In [11]:
num_labels

3

In [12]:
# Split des données
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("bert-base-uncased")
max_seq_len = 128

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


In [13]:
def tokenize_data(data, tokenizer, max_seq_len):
    input_ids, attention_masks, labels = [], [], []

    for _, row in tqdm(data.iterrows(), total=len(data)):
        if pd.isna(row['Sentiment']):
            continue

        encoded = tokenizer.encode_plus(
            row['revue/texte'],
            add_special_tokens=True,
            max_length=max_seq_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(int(row['Sentiment']))

    return (
        torch.tensor(input_ids, dtype=torch.long),
        torch.tensor(attention_masks, dtype=torch.long),
        torch.tensor(labels, dtype=torch.long)
    )

In [14]:
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_data, tokenizer, max_seq_len)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_data, tokenizer, max_seq_len)

100%|██████████| 40000/40000 [01:59<00:00, 334.48it/s]
100%|██████████| 10000/10000 [00:29<00:00, 336.04it/s]


In [15]:
# DataLoader
batch_size = 128
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [16]:
# Modèle DistilBERT pour classification multi-classe
model = DistilBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.bias', 'transformer.layer.0.ffn.lin2.weight', 'transformer.layer.0.output_layer_norm.bias', 'transformer.layer.0.output_layer_norm.weight', 'transformer.lay

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-11): 12 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False

In [17]:
# Optimizer et scheduler
num_epochs = 3
total_steps = len(train_dataloader) * num_epochs
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [18]:
# Fonction d'entraînement
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)

In [19]:
# Fonction d'évaluation
def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        predictions.extend(logits.argmax(axis=-1))
        true_labels.extend(label_ids)

    return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions, digits=4)


In [20]:
# Entraînement du modèle
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy, report = evaluate(model, val_dataloader, device)
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")
    print(report)

Training: 100%|██████████| 313/313 [02:45<00:00,  1.89it/s]
Evaluating: 100%|██████████| 79/79 [00:13<00:00,  5.94it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1/3
Loss: 0.5403 - Validation Accuracy: 0.8432
              precision    recall  f1-score   support

           0     0.5245    0.1308    0.2094       818
           1     0.0000    0.0000    0.0000       802
           2     0.8498    0.9934    0.9160      8380

    accuracy                         0.8432     10000
   macro avg     0.4581    0.3747    0.3751     10000
weighted avg     0.7551    0.8432    0.7848     10000



Training: 100%|██████████| 313/313 [02:44<00:00,  1.91it/s]
Evaluating: 100%|██████████| 79/79 [00:13<00:00,  5.95it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 2/3
Loss: 0.4546 - Validation Accuracy: 0.8494
              precision    recall  f1-score   support

           0     0.5135    0.3264    0.3991       818
           1     0.0000    0.0000    0.0000       802
           2     0.8678    0.9817    0.9213      8380

    accuracy                         0.8494     10000
   macro avg     0.4604    0.4360    0.4401     10000
weighted avg     0.7692    0.8494    0.8047     10000



Training: 100%|██████████| 313/313 [02:44<00:00,  1.90it/s]
Evaluating: 100%|██████████| 79/79 [00:13<00:00,  5.95it/s]



Epoch 3/3
Loss: 0.4202 - Validation Accuracy: 0.8487
              precision    recall  f1-score   support

           0     0.4761    0.4010    0.4353       818
           1     0.0000    0.0000    0.0000       802
           2     0.8764    0.9736    0.9224      8380

    accuracy                         0.8487     10000
   macro avg     0.4508    0.4582    0.4526     10000
weighted avg     0.7733    0.8487    0.8086     10000



In [21]:
# Sauvegarde du modèle
model.save_pretrained("/content/drive/MyDrive/SAE S6//model_A100_sentiment")
tokenizer.save_pretrained("/content/drive/MyDrive/SAE S6//model_A100_sentiment")


('/content/drive/MyDrive/SAE S6//model_A100_sentiment/tokenizer_config.json',
 '/content/drive/MyDrive/SAE S6//model_A100_sentiment/special_tokens_map.json',
 '/content/drive/MyDrive/SAE S6//model_A100_sentiment/vocab.txt',
 '/content/drive/MyDrive/SAE S6//model_A100_sentiment/added_tokens.json')