## Ce fichier consiste à :
- Chargement des données
- Tokenisation avec Hugging Face
- Conversion en Dataset Hugging Face
- Fine-tuning de DistilBERT
- Évaluation (accuracy, F1)
- Sauvegarde du modèle

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import f1_score, accuracy_score

In [2]:
df = pd.read_csv('../data/goemotions_clean.csv')

texts = df['text_clean'].tolist()
labels = df.drop(columns=['text_clean']).values
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.1, random_state=42)

### Tokenisation

In [3]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_batch(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=128, return_tensors="pt")

### Créer un Dataset Hugging Face

In [4]:
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
       
        texts = [str(x) for x in texts]
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=128)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx]).float()  # multi-label = float
        return item

train_dataset = EmotionDataset(X_train, y_train, tokenizer)
val_dataset   = EmotionDataset(X_val, y_val, tokenizer)

### Fine-tuning avec DistilBERT

In [5]:
num_labels = y_train.shape[1]
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    problem_type="multi_label_classification"
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### Sauvegarde du modèle fine-tuné et du tokenizer

In [6]:
model.save_pretrained("../models/emotion_distilbert", safe_serialization=False)
tokenizer.save_pretrained("../models/emotion_distilbert")

('../models/emotion_distilbert\\tokenizer_config.json',
 '../models/emotion_distilbert\\special_tokens_map.json',
 '../models/emotion_distilbert\\vocab.txt',
 '../models/emotion_distilbert\\added_tokens.json',
 '../models/emotion_distilbert\\tokenizer.json')