<a href="https://colab.research.google.com/github/Tigropoil/SAE_S6/blob/Arthur/text_classification_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

In [6]:
# Load the reviews CSV from the mounted Google Drive.
# NOTE(review): despite its name, `data_url` holds a local filesystem path, not a URL.
data_url = '/content/drive/MyDrive/SAE S6/data_fusion.csv'
data = pd.read_csv(data_url)

In [7]:
# Keep only the review text and its score, then drop rows where either one is missing.
columns_to_keep = ['revue/texte', 'revue/score']
data = data[columns_to_keep].dropna()

In [8]:
# Convert "revue/score" into zero-based integer class ids (labels must lie in 0..num_labels-1).
data['revue/score'] = data['revue/score'].astype(int) - 1
# This cell shifts the column in place, so it must run exactly once per load of `data`.
# Fail fast if a second run (or a raw score of 0) produced a negative label.
assert data['revue/score'].min() >= 0, "labels must be >= 0 after the shift (was this cell run twice?)"
# Size the classifier from the largest label rather than from the number of distinct labels:
# if one score level is absent from the data, nunique() undercounts and the largest label
# would fall outside the model's output range.
num_labels = int(data['revue/score'].max()) + 1

In [9]:
# Hold out 20% of the rows for validation; the fixed random_state makes the split reproducible.
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenizer for the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Token length passed to tokenize_data as its pad/truncate length.
max_seq_len = 128

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [10]:
def tokenize_data(data, tokenizer, max_seq_len, text_col='revue/texte',
                  label_col='revue/score', chunk_size=1024):
    """Tokenize a DataFrame of reviews into padded tensors.

    The texts are encoded in chunks through the tokenizer's batch interface
    instead of one ``encode_plus`` call per ``iterrows()`` row, which avoids
    the per-row pandas overhead and the deprecated ``encode_plus`` entry point.

    Args:
        data: DataFrame holding the text and label columns.
        tokenizer: Hugging Face tokenizer; called with a list of strings.
        max_seq_len: Length every sequence is padded or truncated to.
        text_col: Name of the column containing the review text.
        label_col: Name of the column containing the integer class label.
        chunk_size: Number of reviews sent to the tokenizer per call.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of ``torch`` tensors,
        one row / entry per row of ``data``, in the same order.
    """
    # Coerce to str so a non-string cell (e.g. a purely numeric review) does not
    # make the tokenizer raise; genuine strings are left unchanged.
    texts = data[text_col].astype(str).tolist()
    labels = data[label_col].tolist()

    input_ids, attention_masks = [], []
    # The bar still counts reviews (not chunks) so progress reads as before.
    with tqdm(total=len(texts)) as progress:
        for start in range(0, len(texts), chunk_size):
            chunk = texts[start:start + chunk_size]
            encoded = tokenizer(
                chunk,
                add_special_tokens=True,
                max_length=max_seq_len,
                padding="max_length",
                truncation=True,
                return_attention_mask=True,
            )
            input_ids.extend(encoded['input_ids'])
            attention_masks.extend(encoded['attention_mask'])
            progress.update(len(chunk))

    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels)

In [11]:
# Tokenize the training and validation splits with the same tokenizer and sequence length.
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_data, tokenizer, max_seq_len)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_data, tokenizer, max_seq_len)

100%|██████████| 287952/287952 [15:19<00:00, 313.13it/s]
100%|██████████| 71988/71988 [03:45<00:00, 319.77it/s]


In [12]:
# Wrap the tokenized tensors in datasets and batch them with DataLoaders.
batch_size = 16
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
# Training batches are drawn in random order; validation batches keep the dataset order.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [13]:
# DistilBERT with a sequence-classification head sized for num_labels classes.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=num_labels)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# As the cell's last expression, this also displays the module's repr.
model.to(device)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [14]:
# Optimizer and learning-rate schedule.
num_epochs = 3
# One scheduler step is taken per training batch, so the schedule spans every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
# Use PyTorch's AdamW: the `transformers.AdamW` class is deprecated in favour of it.
# eps and weight_decay are passed explicitly to keep the deprecated class's defaults
# (torch's own defaults are eps=1e-8 and weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# Linear decay of the learning rate to 0 over total_steps, with no warmup phase.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [15]:
# Training routine
def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``.loss`` attribute.
        dataloader: Iterable of batches, each unpacking into
            (input ids, attention masks, labels).
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
        device: Device every tensor of a batch is moved to.

    Returns:
        The sum of the per-batch losses divided by ``len(dataloader)``.
    """
    model.train()
    running_loss = 0

    for step_batch in tqdm(dataloader, desc="Training"):
        ids, masks, targets = (tensor.to(device) for tensor in step_batch)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        batch_loss = model(ids, attention_mask=masks, labels=targets).loss
        batch_loss.backward()
        running_loss += batch_loss.item()

        optimizer.step()
        scheduler.step()

    n_batches = len(dataloader)
    return running_loss / n_batches

In [16]:
# Evaluation routine
def evaluate(model, dataloader, device):
    """Score ``model`` on every batch of ``dataloader``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``.logits`` tensor.
        dataloader: Iterable of batches, each unpacking into
            (input ids, attention masks, labels).
        device: Device every tensor of a batch is moved to.

    Returns:
        A tuple ``(accuracy, report)``: the accuracy score and the text
        classification report (4 digits) over all batches.
    """
    model.eval()
    y_pred = []
    y_true = []

    for step_batch in tqdm(dataloader, desc="Evaluating"):
        ids, masks, targets = (tensor.to(device) for tensor in step_batch)

        # No gradients are needed for inference.
        with torch.no_grad():
            result = model(ids, attention_mask=masks)

        scores = result.logits.detach().cpu().numpy()
        # The predicted class is the index of the largest logit in each row.
        y_pred.extend(scores.argmax(axis=-1))
        y_true.extend(targets.cpu().numpy())

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, digits=4)
    return accuracy, report


In [17]:
# Train the model: after each epoch, evaluate on the validation set and print the metrics.
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy, report = evaluate(model, val_dataloader, device)
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")
    print(report)

Training: 100%|██████████| 17997/17997 [51:44<00:00,  5.80it/s]
Evaluating: 100%|██████████| 4500/4500 [04:13<00:00, 17.75it/s]



Epoch 1/3
Loss: 0.7510 - Validation Accuracy: 0.7125
              precision    recall  f1-score   support

           0     0.6370    0.6555    0.6461      3451
           1     0.3960    0.3598    0.3770      2793
           2     0.4699    0.3213    0.3817      5583
           3     0.4962    0.2945    0.3697     15023
           4     0.7863    0.9261    0.8505     45138

    accuracy                         0.7125     71988
   macro avg     0.5571    0.5115    0.5250     71988
weighted avg     0.6789    0.7125    0.6856     71988



Training: 100%|██████████| 17997/17997 [51:47<00:00,  5.79it/s]
Evaluating: 100%|██████████| 4500/4500 [04:14<00:00, 17.71it/s]



Epoch 2/3
Loss: 0.6440 - Validation Accuracy: 0.7193
              precision    recall  f1-score   support

           0     0.6470    0.6656    0.6562      3451
           1     0.4497    0.3040    0.3627      2793
           2     0.4855    0.3541    0.4095      5583
           3     0.5064    0.3198    0.3920     15023
           4     0.7899    0.9273    0.8531     45138

    accuracy                         0.7193     71988
   macro avg     0.5757    0.5142    0.5347     71988
weighted avg     0.6871    0.7193    0.6940     71988



Training: 100%|██████████| 17997/17997 [51:45<00:00,  5.79it/s]
Evaluating: 100%|██████████| 4500/4500 [04:13<00:00, 17.76it/s]



Epoch 3/3
Loss: 0.5501 - Validation Accuracy: 0.7173
              precision    recall  f1-score   support

           0     0.6778    0.6152    0.6450      3451
           1     0.4350    0.3555    0.3913      2793
           2     0.4771    0.3718    0.4180      5583
           3     0.4917    0.3600    0.4157     15023
           4     0.8011    0.9091    0.8517     45138

    accuracy                         0.7173     71988
   macro avg     0.5766    0.5223    0.5443     71988
weighted avg     0.6913    0.7173    0.6993     71988



In [18]:
# Save the fine-tuned model and its tokenizer into the same directory.
# NOTE(review): "./model/" is relative to the working directory, not the mounted Drive —
# confirm this is the intended location.
model.save_pretrained("./model/")
tokenizer.save_pretrained("./model/")


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [19]:
# Prediction helper
def predict_review(review, model, tokenizer, device, max_seq_len=128):
    """Predict the score of a single review on the original 1-based scale.

    Args:
        review: Review text to score.
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``.logits`` tensor with one row per input.
        tokenizer: Hugging Face tokenizer; called with the review string.
        device: Device the input tensors are moved to.
        max_seq_len: Length the review is padded or truncated to. Defaults to
            128; pass the value used at training time if it differs.

    Returns:
        The predicted score as a plain ``int``: the index of the largest logit
        plus 1.
    """
    model.eval()
    encoded = tokenizer(
        review,
        add_special_tokens=True,
        max_length=max_seq_len,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    # Wrap in a list to add the batch dimension the model expects.
    input_id = torch.tensor([encoded["input_ids"]]).to(device)
    attention_mask = torch.tensor([encoded["attention_mask"]]).to(device)
    with torch.no_grad():
        outputs = model(input_id, attention_mask=attention_mask)
    predicted_class = outputs.logits.argmax(dim=-1)[0].item()
    # Shift the zero-based class id back onto the original score scale.
    return int(predicted_class) + 1

In [27]:
# Example: predict the score of a single hand-written review and print it.
review_test = "That was awesome. I great book for people who loves shitty books"
predicted_score = predict_review(review_test, model, tokenizer, device)
print(f"Score prédit : {predicted_score}")

Score prédit : 5
