<a href="https://colab.research.google.com/github/Tigropoil/SAE_S6/blob/Arthur/text_classification_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from tqdm import tqdm

In [None]:
# Charger les données
data_url = '/content/drive/MyDrive/SAE S6/data_fusion_little.csv'
data = pd.read_csv(data_url)

In [None]:
# Sélection des colonnes pertinentes
columns_to_keep = ['revue/texte', 'revue/score']
data = data[columns_to_keep].dropna()

In [None]:
# Convertir "revue/score" en classe catégorielle (recalage entre 0 et num_labels-1)
data['revue/score'] = data['revue/score'].astype(int) - 1
num_labels = data['revue/score'].nunique()

In [None]:
# Split des données
train_data, val_data = train_test_split(data, test_size=0.2, random_state=42)

# Tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("bert-base-uncased")
max_seq_len = 128

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


In [None]:
def tokenize_data(data, tokenizer, max_seq_len):
    input_ids, attention_masks, labels = [], [], []

    for _, row in tqdm(data.iterrows(), total=len(data)):
        encoded = tokenizer.encode_plus(
            row['revue/texte'],
            add_special_tokens=True,
            max_length=max_seq_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
        labels.append(row['revue/score'])

    return torch.tensor(input_ids), torch.tensor(attention_masks), torch.tensor(labels)

In [None]:
train_input_ids, train_attention_masks, train_labels = tokenize_data(train_data, tokenizer, max_seq_len)
val_input_ids, val_attention_masks, val_labels = tokenize_data(val_data, tokenizer, max_seq_len)

100%|██████████| 1200/1200 [00:05<00:00, 216.14it/s]
100%|██████████| 300/300 [00:00<00:00, 319.66it/s]


In [None]:
# DataLoader
batch_size = 24
train_dataset = TensorDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [None]:
# Modèle DistilBERT pour classification multi-classe
model = DistilBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

You are using a model of type bert to instantiate a model of type distilbert. This is not supported for all configurations of models and can yield errors.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.word_embeddings.weight', 'pre_classifier.bias', 'pre_classifier.weight', 'transformer.layer.0.attention.k_lin.bias', 'transformer.layer.0.attention.k_lin.weight', 'transformer.layer.0.attention.out_lin.bias', 'transformer.layer.0.attention.out_lin.weight', 'transformer.layer.0.attention.q_lin.bias', 'transformer.layer.0.attention.q_lin.weight', 'transformer.layer.0.attention.v_lin.bias', 'transformer.layer.0.attention.v_lin.weight', 'transformer.layer.0.ffn.lin1.bias', 'transformer.layer.0.ffn.lin1.weight', 'transformer.layer.0.ffn.lin2.b

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-11): 12 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
# Optimizer et scheduler
num_epochs = 15
total_steps = len(train_dataloader) * num_epochs
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [None]:
# Fonction d'entraînement
def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in tqdm(dataloader, desc="Training"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    return total_loss / len(dataloader)

In [None]:
# Fonction d'évaluation
def evaluate(model, dataloader, device):
    model.eval()
    predictions, true_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids, attention_masks, labels = [t.to(device) for t in batch]
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits.detach().cpu().numpy()
        label_ids = labels.cpu().numpy()
        predictions.extend(logits.argmax(axis=-1))
        true_labels.extend(label_ids)

    return accuracy_score(true_labels, predictions), classification_report(true_labels, predictions, digits=4)


In [25]:
# Entraînement du modèle
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_dataloader, optimizer, scheduler, device)
    val_accuracy, report = evaluate(model, val_dataloader, device)
    print(f"\nEpoch {epoch + 1}/{num_epochs}")
    print(f"Loss: {train_loss:.4f} - Validation Accuracy: {val_accuracy:.4f}")
    print(report)

Training: 100%|██████████| 2250/2250 [19:41<00:00,  1.90it/s]
Evaluating: 100%|██████████| 563/563 [01:35<00:00,  5.91it/s]



Epoch 1/5
Loss: 0.9311 - Validation Accuracy: 0.6578
              precision    recall  f1-score   support

           0     0.5146    0.4242    0.4651      3451
           1     0.2261    0.2664    0.2446      2793
           2     0.3913    0.0641    0.1102      5583
           3     0.4000    0.1632    0.2318     15023
           4     0.7199    0.9380    0.8146     45138

    accuracy                         0.6578     71988
   macro avg     0.4504    0.3712    0.3733     71988
weighted avg     0.5987    0.6578    0.5995     71988



Training: 100%|██████████| 2250/2250 [19:42<00:00,  1.90it/s]
Evaluating: 100%|██████████| 563/563 [01:35<00:00,  5.89it/s]



Epoch 2/5
Loss: 0.8422 - Validation Accuracy: 0.6678
              precision    recall  f1-score   support

           0     0.5474    0.5016    0.5235      3451
           1     0.2999    0.1497    0.1997      2793
           2     0.3501    0.1698    0.2287      5583
           3     0.4031    0.1940    0.2619     15023
           4     0.7315    0.9318    0.8196     45138

    accuracy                         0.6678     71988
   macro avg     0.4664    0.3894    0.4067     71988
weighted avg     0.6078    0.6678    0.6192     71988



Training: 100%|██████████| 2250/2250 [19:42<00:00,  1.90it/s]
Evaluating: 100%|██████████| 563/563 [01:35<00:00,  5.90it/s]



Epoch 3/5
Loss: 0.8150 - Validation Accuracy: 0.6694
              precision    recall  f1-score   support

           0     0.6050    0.4491    0.5155      3451
           1     0.3057    0.1532    0.2041      2793
           2     0.3671    0.1680    0.2305      5583
           3     0.4076    0.2125    0.2793     15023
           4     0.7301    0.9323    0.8189     45138

    accuracy                         0.6694     71988
   macro avg     0.4831    0.3830    0.4097     71988
weighted avg     0.6122    0.6694    0.6222     71988



Training: 100%|██████████| 2250/2250 [19:41<00:00,  1.90it/s]
Evaluating: 100%|██████████| 563/563 [01:35<00:00,  5.90it/s]



Epoch 4/5
Loss: 0.7980 - Validation Accuracy: 0.6685
              precision    recall  f1-score   support

           0     0.5160    0.5471    0.5311      3451
           1     0.3008    0.1719    0.2187      2793
           2     0.3563    0.1850    0.2436      5583
           3     0.4191    0.2009    0.2716     15023
           4     0.7364    0.9239    0.8195     45138

    accuracy                         0.6685     71988
   macro avg     0.4657    0.4057    0.4169     71988
weighted avg     0.6132    0.6685    0.6234     71988



Training: 100%|██████████| 2250/2250 [19:42<00:00,  1.90it/s]
Evaluating: 100%|██████████| 563/563 [01:35<00:00,  5.90it/s]


Epoch 5/5
Loss: 0.7857 - Validation Accuracy: 0.6692
              precision    recall  f1-score   support

           0     0.5337    0.5329    0.5333      3451
           1     0.2896    0.1604    0.2065      2793
           2     0.3561    0.1888    0.2468      5583
           3     0.4166    0.1999    0.2702     15023
           4     0.7361    0.9266    0.8204     45138

    accuracy                         0.6692     71988
   macro avg     0.4664    0.4017    0.4154     71988
weighted avg     0.6129    0.6692    0.6235     71988






In [29]:
# Sauvegarde du modèle
model.save_pretrained("/content/drive/MyDrive/SAE S6//model_A100")
tokenizer.save_pretrained("/content/drive/MyDrive/SAE S6//model_A100")


('./model_A100/tokenizer_config.json',
 './model_A100/special_tokens_map.json',
 './model_A100/vocab.txt',
 './model_A100/added_tokens.json')

In [1]:
# Fonction de prédiction
def predict_review(review, model, tokenizer, device):
    model.eval()
    encoded = tokenizer.encode_plus(
        review,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
    )
    input_id = torch.tensor([encoded["input_ids"]]).to(device)
    attention_mask = torch.tensor([encoded["attention_mask"]]).to(device)
    with torch.no_grad():
        outputs = model(input_id, attention_mask=attention_mask)
    logits = outputs.logits.detach().cpu().numpy()
    predicted_score = logits.argmax(axis=-1)[0] + 1  # Recaler le score pour correspondre à l'échelle originale
    return predicted_score

In [2]:
from sklearn.metrics import confusion_matrix, classification_report

In [5]:
# Charger le modèle et le tokenizer
model_path = "/content/drive/MyDrive/SAE S6//model_A100"
model = DistilBertForSequenceClassification.from_pretrained(model_path)
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

OSError: Incorrect path_or_model_id: '/content/model_A100'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [31]:
# Fonction de prédiction
def predict_reviews(df, model, tokenizer, device):
    model.eval()
    predicted_scores = []

    for review in df['revue/texte']:
        encoded = tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
        )
        input_id = torch.tensor([encoded["input_ids"]]).to(device)
        attention_mask = torch.tensor([encoded["attention_mask"]]).to(device)

        with torch.no_grad():
            outputs = model(input_id, attention_mask=attention_mask)
        logits = outputs.logits.detach().cpu().numpy()
        predicted_score = logits.argmax(axis=-1)[0] + 1  # Recaler le score original
        predicted_scores.append(predicted_score)

    return predicted_scores

In [32]:
# Évaluation
def evaluate_model(df, model, tokenizer, device):
    df = df[['revue/texte', 'revue/score']].dropna()
    df['revue/score'] = df['revue/score'].astype(int)
    df['predicted_score'] = predict_reviews(df, model, tokenizer, device)

    # Calcul des métriques
    cm = confusion_matrix(df['revue/score'], df['predicted_score'])
    report = classification_report(df['revue/score'], df['predicted_score'], digits=4)

    return cm, report, df

In [33]:
# Import du jeu de données
data_url = '/content/drive/MyDrive/SAE S6/data_fusion_little.csv'
df = pd.read_csv(data_url)

# Ne garder que les colonnes nécessaires
df = df[['revue/texte', 'revue/score']]
df = df.dropna()

# Ne garder que 100 lignes au hasard
df = df.sample(n=10000, random_state=42)

df_test = pd.DataFrame(df)



KeyboardInterrupt: 

In [None]:
conf_matrix, class_report, df_results = evaluate_model(df_test, model, tokenizer, device)

# Affichage des résultats
print("Matrice de confusion :\n", conf_matrix)
print("Rapport de classification :\n", class_report)

Matrice de confusion :
 [[ 358   86   43   12   17]
 [ 128  107   85   33   21]
 [ 108  158  227  129  122]
 [  90  119  433  505  997]
 [ 204  151  556  792 4519]]
Rapport de classification :
               precision    recall  f1-score   support

           1     0.4032    0.6938    0.5100       516
           2     0.1723    0.2861    0.2151       374
           3     0.1689    0.3051    0.2174       744
           4     0.3433    0.2355    0.2794      2144
           5     0.7962    0.7263    0.7596      6222

    accuracy                         0.5716     10000
   macro avg     0.3768    0.4494    0.3963     10000
weighted avg     0.6088    0.5716    0.5831     10000

