In [2]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, TFDistilBertForTokenClassification, DataCollatorForTokenClassification
import tensorflow as tf
from seqeval.metrics import classification_report
import numpy as np
from pydantic import BaseModel
import nest_asyncio
from threading import Thread

  from .autonotebook import tqdm as notebook_tqdm


DATA

In [3]:
# Mappage des labels simplifié
label_map = {"person": 1, "content": 2, "O": 0}
# Chargement et prétraitement des données
def load_and_preprocess(filepath):
    df = pd.read_csv(filepath)
    df['words'] = df['words'].apply(lambda x: eval(x) if isinstance(x, str) else x)
    df['labels'] = df['labels'].apply(lambda x: [label_map.get(label, 0) for label in eval(x)] if isinstance(x, str) else x)
    return df

train = load_and_preprocess("DATA/train_2.csv")
test = load_and_preprocess("DATA/test.csv")



# Conversion des données en format Dataset
data = Dataset.from_pandas(train)
test_data = Dataset.from_pandas(test)


Tokenizer and label


In [4]:
# Initialisation du tokeniseur
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenisation et alignement des labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        labels.append([-100 if word_id is None else label[word_id] for word_id in word_ids])
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_data = data.map(tokenize_and_align_labels, batched=True)
tokenized_test_data = test_data.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 2931/2931 [00:00<00:00, 21152.34 examples/s]
Map: 100%|██████████| 1991/1991 [00:00<00:00, 26826.97 examples/s]


Model

In [5]:
# Création du modèle
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_map))

# Préparation des données pour l'entraînement et l'évaluation
def prepare_dataset(tokenized_data, batch_size=16, max_length=128):
    def align_and_truncate(data):
        # Alignement et troncature des séquences et des masques d'attention
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(data['input_ids'], maxlen=max_length, dtype='long', padding='post', truncating='post')
        attention_mask = tf.keras.preprocessing.sequence.pad_sequences(data['attention_mask'], maxlen=max_length, dtype='long', padding='post', truncating='post')
        labels = tf.keras.preprocessing.sequence.pad_sequences(data['labels'], maxlen=max_length, dtype='long', padding='post', truncating='post', value=-100)
        return {'input_ids': input_ids, 'attention_mask': attention_mask}, labels

    features, labels = align_and_truncate(tokenized_data)
    return tf.data.Dataset.from_tensor_slices((features, labels)).shuffle(10000).batch(batch_size)

train_dataset = prepare_dataset(tokenized_data)
test_dataset = prepare_dataset(tokenized_test_data)

# Custom loss pour ignorer les labels -100 lors du calcul de la perte
def custom_loss(y_true, y_pred):
    active_loss = tf.reshape(y_true, (-1,)) != -100
    reduced_logits = tf.boolean_mask(tf.reshape(y_pred, (-1, tf.shape(y_pred)[2])), active_loss)
    reduced_labels = tf.boolean_mask(tf.reshape(y_true, (-1,)), active_loss)
    return tf.keras.losses.sparse_categorical_crossentropy(reduced_labels, reduced_logits, from_logits=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=custom_loss)

# Entraînement du modèle
history = model.fit(train_dataset, epochs=1)

# Évaluation du modèle
eval_loss = model.evaluate(test_dataset)
print(f"Perte sur l'ensemble de test: {eval_loss}")




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

Perte sur l'ensemble de test: 0.05251718685030937


In [6]:
# Remplacez 'chemin/vers/sauvegarde_classification' par votre chemin réel
model.save('Model/named_entityr_ecognition')

INFO:tensorflow:Assets written to: Model/named_entityr_ecognition\assets


INFO:tensorflow:Assets written to: Model/named_entityr_ecognition\assets


prediction and metrics

In [7]:
# Prédiction et calcul des métriques sur l'ensemble de test
label_map_inv = {v: k for k, v in label_map.items()}  # Inversion du mappage des labels

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    out_label_list, preds_list = [], []
    for i in range(batch_size):
        out_label_list.append([label_map_inv.get(label_ids[i][j], "O") for j in range(seq_len) if label_ids[i][j] != -100])
        preds_list.append([label_map_inv.get(preds[i][j], "O") for j in range(seq_len) if label_ids[i][j] != -100])
    return preds_list, out_label_list

predictions, true_labels = [], []
for batch in test_dataset:
    logits = model.predict(batch[0])['logits']
    labels = batch[1].numpy()
    preds, labels = align_predictions(logits, labels)
    predictions.extend(preds)
    true_labels.extend(labels)

print(classification_report(true_labels, predictions))

# Extraction des entités à partir d'une phrase donnée
def extract_entities(sentence):
    inputs = tokenizer(sentence, return_tensors="tf", padding=True, truncation=True, max_length=128)
    predictions = model.predict([inputs["input_ids"], inputs["attention_mask"]])
    predicted_label_indices = np.argmax(predictions.logits, axis=2)[0]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].numpy()[0])
    labels = [label_map_inv.get(idx, "O") for idx in predicted_label_indices]
    entities = {"person": [], "content": []}
    for token, label in zip(tokens, labels):
        if label in entities:
            entities[label].append(token)
    return entities

# Exemple d'utilisation de la fonction extract_entities (test pour voir si ça marche)
extracted_entities = extract_entities("Please remind Alice to bring the financial reports to the conference next Wednesday.")

result = {
    "job": "send_message",
    "receiver": " ".join(extracted_entities["person"]),
    "content": " ".join(extracted_entities["content"]),
}
print(result)






              precision    recall  f1-score   support

       erson       0.89      0.92      0.90       478
      ontent       0.41      0.56      0.47       149

   micro avg       0.75      0.84      0.79       627
   macro avg       0.65      0.74      0.69       627
weighted avg       0.77      0.84      0.80       627

{'job': 'send_message', 'receiver': 'alice', 'content': 'to bring the financial reports to the conference wednesday [SEP]'}


TP5

Partie 1

In [23]:
from transformers import DistilBertTokenizer
import pandas as pd

# Initialisation du tokenizer de DistilBert
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

data = pd.read_csv('DATA/question_classif.csv')
print(data.head())

# Tokenisation des questions
inputs = tokenizer(list(data['question']), padding=True, truncation=True, return_tensors="pt", max_length=128)
print(inputs[:5])

# Préparation des labels
labels = data['label'].values

                                            question    label_text  label
0  What are the recommended prerequisites for the...  question_rag      0
1  Does the cybersecurity course cover intrusion ...  question_rag      0
2             How can I enroll in the Python course?  question_rag      0
3  What are the main basic concepts covered in th...  question_rag      0
4  Does the React course include practical projec...  question_rag      0
{'input_ids': tensor([[  101,  2054,  2024,  1996,  6749,  3653,  2890, 24871,  2015,  2005,
          1996,  4955,  2000,  3698,  4083,  2607,  1029,   102,     0,     0,
             0,     0],
        [  101,  2515,  1996, 16941,  3366, 10841, 15780,  2607,  3104, 24554,
         10788,  4725,  1029,   102,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  2129,  2064,  1045, 25612,  1999,  1996, 18750,  2607,  1029,
           102,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     

In [24]:
from transformers import DistilBertForSequenceClassification
from torch import nn
import torch

class CustomDistilBert(nn.Module):
    def __init__(self, num_labels=2):
        super(CustomDistilBert, self).__init__()
        self.distilbert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)
        # Vous pouvez ajouter ou modifier des couches ici si nécessaire

    def forward(self, input_ids, attention_mask=None, labels=None):
        output = self.distilbert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        return output
    
    def predict(self, input_ids, attention_mask=None):
        self.eval()  # mettre le modèle en mode évaluation
        with torch.no_grad():
            outputs = self(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
        return predictions
# Initialisation du modèle adapté
model = CustomDistilBert(num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:

from sklearn.model_selection import train_test_split
import torch


# Préparation des données avant la division
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']
labels_tensor = torch.tensor(labels)

# Division des données en ensembles d'entraînement et de test
train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels = train_test_split(
    input_ids, attention_mask, labels_tensor, test_size=0.2, random_state=42)



In [26]:
from torch.utils.data import TensorDataset, DataLoader

# Correction dans la création des TensorDatasets
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

# Création des DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [27]:
import torch

# Define device as CPU since CUDA is not available on Mac M1
device = torch.device("cpu")

# Before starting training, make sure to move your model to the chosen device
model.to(device)

CustomDistilBert(
  (distilbert): DistilBertForSequenceClassification(
    (distilbert): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0-5): 6 x TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      

In [28]:
from transformers import AdamW
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import torch

# Configuration de l'optimiseur
optimizer = AdamW(model.parameters(), lr=5e-5)

# Fonction pour l'entraînement
def train(model, train_loader, optimizer):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch = [b.to(device) for b in batch]  # Envoyer le batch au device (GPU ou CPU)
        inputs, masks, labels = batch
        model.zero_grad()  # Réinitialiser les gradients
        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()  # Calcul des gradients
        optimizer.step()  # Mise à jour des poids
    average_loss = total_loss / len(train_loader)
    print(f"Training loss: {average_loss}")

# Fonction pour l'évaluation
def evaluate(model, test_loader):
    model.eval()
    total_eval_accuracy = 0
    for batch in test_loader:
        batch = [b.to(device) for b in batch]
        inputs, masks, labels = batch
        with torch.no_grad():
            outputs = model(inputs, attention_mask=masks)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        total_eval_accuracy += (predictions == labels).sum().item()
    avg_accuracy = total_eval_accuracy / len(test_loader.dataset)
    print(f"Accuracy: {avg_accuracy}")

# Exécution de l'entraînement et de l'évaluation
epochs = 20
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    train(model, train_loader, optimizer)
    evaluate(model, test_loader)

# Après chaque époque, vous pouvez appeler `evaluate(model, test_loader)` pour obtenir l'accuracy sur l'ensemble de test

# Pour afficher une prédiction :
model.eval()  # Met le modèle en mode évaluation
with torch.no_grad():
    # Exemple de question tokenisée
    sample_question = "Ask the python teacher where is next class"
    inputs = tokenizer(sample_question, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    outputs = model(input_ids, attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1)
    
    print(f"Prédiction : {prediction.item()}")  # Doit afficher 1

    sample_question = "What are the pre-requisite for the python class?"
    inputs = tokenizer(sample_question, padding=True, truncation=True, return_tensors="pt")
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    
    outputs = model(input_ids, attention_mask)
    prediction = torch.argmax(outputs.logits, dim=1)
    
    print(f"Prédiction : {prediction.item()}")  # Doit afficher 0



Epoch 1/20
Training loss: 0.5451419472694397
Accuracy: 1.0
Epoch 2/20
Training loss: 0.21126106530427932
Accuracy: 1.0
Epoch 3/20
Training loss: 0.06777917966246605
Accuracy: 1.0
Epoch 4/20
Training loss: 0.02876548394560814
Accuracy: 1.0
Epoch 5/20
Training loss: 0.015053039230406285
Accuracy: 1.0
Epoch 6/20
Training loss: 0.009596196562051773
Accuracy: 1.0
Epoch 7/20
Training loss: 0.006164871156215668
Accuracy: 1.0
Epoch 8/20
Training loss: 0.00478506488725543
Accuracy: 1.0
Epoch 9/20
Training loss: 0.003982764249667525
Accuracy: 1.0
Epoch 10/20
Training loss: 0.0031928871758282185
Accuracy: 1.0
Epoch 11/20
Training loss: 0.0027028200682252647
Accuracy: 1.0
Epoch 12/20
Training loss: 0.002308120997622609
Accuracy: 1.0
Epoch 13/20
Training loss: 0.002013955032452941
Accuracy: 1.0
Epoch 14/20
Training loss: 0.0019103370839729905
Accuracy: 1.0
Epoch 15/20
Training loss: 0.0016954287653788923
Accuracy: 1.0
Epoch 16/20
Training loss: 0.0015026237815618515
Accuracy: 1.0
Epoch 17/20
Traini

In [15]:
torch.save(model,"Model/text_classification.pth")

In [29]:
def classify_task(text):
    model.eval()  # Assurez-vous que le modèle est en mode évaluation
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():  # Désactive le calcul du gradient pour accélérer cette opération
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs[0]
    prediction = torch.argmax(logits, dim=-1).item()

    # Déterminer la tâche basée sur la prédiction
    if prediction == 0:
        return "question_rag"
    else:
        return "send_message"

# Exemple d'utilisation
print(classify_task("Ask the python teacher where is next class"))
print(classify_task("What are the pre-requisites for the python class?"))


send_message
question_rag


Partie 2

In [30]:
def send_virtual_assistant(user_input):
    # Étape 1: Classifier la tâche
    task = classify_task(user_input)
    
    # Étape 2: Extraction des entités (si nécessaire)
    if task == "send_message":
        entities = extract_entities(user_input)
        receiver = " ".join(entities.get("person", []))  # Exemple d'extraction du destinataire
        message = user_input  # Utiliser l'entrée utilisateur directement ou extraire un message spécifique
        # Étape 3: Exécution de la tâche
        response = send_message(receiver, message)
    elif task == "question_rag":
        # Pour question_rag, l'entrée utilisateur est la question
        question = user_input
        # Étape 3: Exécution de la tâche
        response = ask_RAG(question)
    else:
        response = "Sorry, I couldn't understand your request."
    
    # Étape 4: Réponse à l'utilisateur
    return response

# Simulations des fonctions send_message et ask_RAG pour le test
def send_message(receiver, message):
    return f'Message sent to {receiver}: "{message}"'

def ask_RAG(question):
    return f'Asked RAG: "{question}"\nThe RAG replied: "Here is the information you requested."'

# Test de la pipeline
print(send_virtual_assistant("Ask the python teacher where is next class"))
print(send_virtual_assistant("What are the pre-requisites for the python class?"))


TypeError: list indices must be integers or slices, not tuple