<a href="https://colab.research.google.com/github/Tigropoil/SAE_S6/blob/Arthur/text_classification_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [78]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

In [101]:
# Charger les données
data_url = '/content/data_fusion_sample.csv'
data = pd.read_csv(data_url)

In [102]:
# Sélection des colonnes pertinentes
columns_to_keep = ['revue/texte', 'revue/score']
data = data[columns_to_keep].dropna()

In [103]:
# Convertir "revue/score" en classe catégorielle (recalage entre 0 et num_labels-1)
data['revue/score'] = data['revue/score'].astype(int) - 1
num_labels = data['revue/score'].nunique()

In [133]:
# Split des données
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("bert-base-uncased")
max_seq_len = 128

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


In [134]:
def tokenize_data(data, tokenizer, max_seq_len):
    input_ids, attention_masks, labels = [], [], []

    for _, row in tqdm(data.iterrows(), total=len(data)):
        encoded = tokenizer.encode_plus(
            row['revue/texte'],
            add_special_tokens=True,
            max_length=max_seq_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(row['revue/score'])

    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels)

In [135]:
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_data, tokenizer, max_seq_len)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_data, tokenizer, max_seq_len)

100%|██████████| 1200/1200 [00:14<00:00, 80.92it/s] 
100%|██████████| 300/300 [00:02<00:00, 121.49it/s]


In [136]:
# DataLoader
batch_size = 24
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [137]:
# Modèle DistilBERT pour classification multi-classe
model = DistilBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.b

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-11): 12 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False

In [138]:
# Optimizer et scheduler
num_epochs = 20
total_steps = len(train_dataloader) * num_epochs
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [139]:
# Fonction d'entraînement
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)

In [140]:
# Fonction d'évaluation
def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        predictions.extend(logits.argmax(axis=-1))
        true_labels.extend(label_ids)

    return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions, digits=4)


In [141]:
# Entraînement du modèle
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy, report = evaluate(model, val_dataloader, device)
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")
    print(report)

Training: 100%|██████████| 50/50 [00:23<00:00,  2.09it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.33it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 1/10
Loss: 1.6229 - Validation Accuracy: 0.2000
              precision    recall  f1-score   support

           0     0.1992    0.9107    0.3269        56
           1     0.0000    0.0000    0.0000        65
           2     0.2250    0.1837    0.2022        49
           3     0.0000    0.0000    0.0000        66
           4     0.0000    0.0000    0.0000        64

    accuracy                         0.2000       300
   macro avg     0.0848    0.2189    0.1058       300
weighted avg     0.0739    0.2000    0.0941       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  5.97it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 2/10
Loss: 1.6239 - Validation Accuracy: 0.1900
              precision    recall  f1-score   support

           0     0.2143    0.2679    0.2381        56
           1     0.0000    0.0000    0.0000        65
           2     0.1826    0.8571    0.3011        49
           3     0.0000    0.0000    0.0000        66
           4     0.0000    0.0000    0.0000        64

    accuracy                         0.1900       300
   macro avg     0.0794    0.2250    0.1078       300
weighted avg     0.0698    0.1900    0.0936       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.04it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.36it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 3/10
Loss: 1.6145 - Validation Accuracy: 0.1800
              precision    recall  f1-score   support

           0     1.0000    0.0179    0.0351        56
           1     0.0000    0.0000    0.0000        65
           2     0.1713    1.0000    0.2925        49
           3     0.2857    0.0303    0.0548        66
           4     0.3333    0.0312    0.0571        64

    accuracy                         0.1800       300
   macro avg     0.3581    0.2159    0.0879       300
weighted avg     0.3486    0.1800    0.0786       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.06it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.35it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 4/10
Loss: 1.6027 - Validation Accuracy: 0.1933
              precision    recall  f1-score   support

           0     0.2667    0.1429    0.1860        56
           1     0.0000    0.0000    0.0000        65
           2     0.1815    0.9592    0.3052        49
           3     0.0000    0.0000    0.0000        66
           4     0.2727    0.0469    0.0800        64

    accuracy                         0.1933       300
   macro avg     0.1442    0.2298    0.1142       300
weighted avg     0.1376    0.1933    0.1016       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.05it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.28it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 5/10
Loss: 1.5829 - Validation Accuracy: 0.2333
              precision    recall  f1-score   support

           0     0.2308    0.3214    0.2687        56
           1     0.2284    0.6923    0.3435        65
           2     0.0000    0.0000    0.0000        49
           3     0.0000    0.0000    0.0000        66
           4     0.3684    0.1094    0.1687        64

    accuracy                         0.2333       300
   macro avg     0.1655    0.2246    0.1562       300
weighted avg     0.1712    0.2333    0.1606       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.05it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.37it/s]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Epoch 6/10
Loss: 1.5768 - Validation Accuracy: 0.2533
              precision    recall  f1-score   support

           0     0.3182    0.3750    0.3443        56
           1     0.0000    0.0000    0.0000        65
           2     0.1847    0.5918    0.2816        49
           3     0.0000    0.0000    0.0000        66
           4     0.3611    0.4062    0.3824        64

    accuracy                         0.2533       300
   macro avg     0.1728    0.2746    0.2016       300
weighted avg     0.1666    0.2533    0.1918       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.06it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.32it/s]



Epoch 7/10
Loss: 1.5537 - Validation Accuracy: 0.2367
              precision    recall  f1-score   support

           0     0.2812    0.1607    0.2045        56
           1     0.2410    0.3077    0.2703        65
           2     0.1746    0.4490    0.2514        49
           3     0.0000    0.0000    0.0000        66
           4     0.3636    0.3125    0.3361        64

    accuracy                         0.2367       300
   macro avg     0.2121    0.2460    0.2125       300
weighted avg     0.2108    0.2367    0.2095       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.05it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.32it/s]



Epoch 8/10
Loss: 1.5266 - Validation Accuracy: 0.2733
              precision    recall  f1-score   support

           0     0.3585    0.3393    0.3486        56
           1     0.0000    0.0000    0.0000        65
           2     0.1825    0.5102    0.2688        49
           3     0.4000    0.0909    0.1481        66
           4     0.3404    0.5000    0.4051        64

    accuracy                         0.2733       300
   macro avg     0.2563    0.2881    0.2341       300
weighted avg     0.2573    0.2733    0.2280       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.05it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.33it/s]



Epoch 9/10
Loss: 1.5137 - Validation Accuracy: 0.2233
              precision    recall  f1-score   support

           0     0.3095    0.2321    0.2653        56
           1     0.0625    0.0308    0.0412        65
           2     0.1688    0.5510    0.2584        49
           3     0.2500    0.0152    0.0286        66
           4     0.3871    0.3750    0.3810        64

    accuracy                         0.2233       300
   macro avg     0.2356    0.2408    0.1949       300
weighted avg     0.2365    0.2233    0.1882       300



Training: 100%|██████████| 50/50 [00:24<00:00,  2.06it/s]
Evaluating: 100%|██████████| 13/13 [00:02<00:00,  6.37it/s]


Epoch 10/10
Loss: 1.4843 - Validation Accuracy: 0.2400
              precision    recall  f1-score   support

           0     0.3088    0.3750    0.3387        56
           1     0.0000    0.0000    0.0000        65
           2     0.1812    0.5918    0.2775        49
           3     0.4286    0.0455    0.0822        66
           4     0.3393    0.2969    0.3167        64

    accuracy                         0.2400       300
   macro avg     0.2516    0.2618    0.2030       300
weighted avg     0.2539    0.2400    0.1942       300






In [113]:
# Sauvegarde du modèle
model.save_pretrained("./model/")
tokenizer.save_pretrained("./model/")


('./model/tokenizer_config.json',
 './model/special_tokens_map.json',
 './model/vocab.txt',
 './model/added_tokens.json')

In [92]:
# Fonction de prédiction
def predict_review(review, model, tokenizer, device):
    model.eval()
    encoded = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    input_id = torch.tensor([encoded["input_ids"]]).to(device)
    attention_mask = torch.tensor([encoded["attention_mask"]]).to(device)
    with torch.no_grad():
        outputs = model(input_id, attention_mask=attention_mask)
    logits = outputs.logits.detach().cpu().numpy()
    predicted_score = logits.argmax(axis=-1)[0] + 1  # Recaler le score pour correspondre à l'échelle originale
    return predicted_score

In [95]:
from sklearn.metrics import confusion_matrix, classification_report

In [114]:
# Charger le modèle et le tokenizer
model_path = "model"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [97]:
# Fonction de prédiction
def predict_reviews(df, model, tokenizer, device):
    model.eval()
    predicted_scores = []

    for review in df['revue/texte']:
        encoded = tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )
        input_id = torch.tensor([encoded["input_ids"]]).to(device)
        attention_mask = torch.tensor([encoded["attention_mask"]]).to(device)

        with torch.no_grad():
            outputs = model(input_id, attention_mask=attention_mask)
        logits = outputs.logits.detach().cpu().numpy()
        predicted_score = logits.argmax(axis=-1)[0] + 1  # Recaler le score original
        predicted_scores.append(predicted_score)

    return predicted_scores

In [115]:
# Évaluation
def evaluate_model(df, model, tokenizer, device):
    df = df[['revue/texte', 'revue/score']].dropna()
    df['revue/score'] = df['revue/score'].astype(int)
    df['predicted_score'] = predict_reviews(df, model, tokenizer, device)

    # Calcul des métriques
    cm = confusion_matrix(df['revue/score'], df['predicted_score'])
    report = classification_report(df['revue/score'], df['predicted_score'], digits=4)

    return cm, report, df

In [99]:
# Import du jeu de données
data_url = '/content/drive/MyDrive/SAE S6/data_fusion_little.csv'
df = pd.read_csv(data_url)

# Ne garder que les colonnes nécessaires
df = df[['revue/texte', 'revue/score']]
df = df.dropna()

# Ne garder que 100 lignes au hasard
df = df.sample(n=10000, random_state=42)

df_test = pd.DataFrame(df)



In [116]:
conf_matrix, class_report, df_results = evaluate_model(df_test, model, tokenizer, device)

# Affichage des résultats
print("Matrice de confusion :\n", conf_matrix)
print("Rapport de classification :\n", class_report)

Matrice de confusion :
 [[ 358   86   43   12   17]
 [ 128  107   85   33   21]
 [ 108  158  227  129  122]
 [  90  119  433  505  997]
 [ 204  151  556  792 4519]]
Rapport de classification :
               precision    recall  f1-score   support

           1     0.4032    0.6938    0.5100       516
           2     0.1723    0.2861    0.2151       374
           3     0.1689    0.3051    0.2174       744
           4     0.3433    0.2355    0.2794      2144
           5     0.7962    0.7263    0.7596      6222

    accuracy                         0.5716     10000
   macro avg     0.3768    0.4494    0.3963     10000
weighted avg     0.6088    0.5716    0.5831     10000

