In [2]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertTokenizerFast, TFDistilBertForTokenClassification, DataCollatorForTokenClassification
import tensorflow as tf
from seqeval.metrics import classification_report
import numpy as np
from pydantic import BaseModel
import nest_asyncio
from threading import Thread

DATA

In [3]:
# Mappage des labels simplifié
label_map = {"person": 1, "content": 2, "O": 0}
# Chargement et prétraitement des données
def load_and_preprocess(filepath):
    df = pd.read_csv(filepath)
    df['words'] = df['words'].apply(lambda x: eval(x) if isinstance(x, str) else x)
    df['labels'] = df['labels'].apply(lambda x: [label_map.get(label, 0) for label in eval(x)] if isinstance(x, str) else x)
    return df

train = load_and_preprocess("data/Data/train_2.csv")
test = load_and_preprocess("data/Data/test.csv")



# Conversion des données en format Dataset
data = Dataset.from_pandas(train)
test_data = Dataset.from_pandas(test)


Tokenizer and label


In [4]:
# Initialisation du tokeniseur
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenisation et alignement des labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["words"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        labels.append([-100 if word_id is None else label[word_id] for word_id in word_ids])
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_data = data.map(tokenize_and_align_labels, batched=True)
tokenized_test_data = test_data.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 2931/2931 [00:00<00:00, 12547.39 examples/s]
Map: 100%|██████████| 1991/1991 [00:00<00:00, 13935.15 examples/s]


Model

In [5]:
# Création du modèle
model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-uncased', num_labels=len(label_map))

# Préparation des données pour l'entraînement et l'évaluation
def prepare_dataset(tokenized_data, batch_size=16, max_length=128):
    def align_and_truncate(data):
        # Alignement et troncature des séquences et des masques d'attention
        input_ids = tf.keras.preprocessing.sequence.pad_sequences(data['input_ids'], maxlen=max_length, dtype='long', padding='post', truncating='post')
        attention_mask = tf.keras.preprocessing.sequence.pad_sequences(data['attention_mask'], maxlen=max_length, dtype='long', padding='post', truncating='post')
        labels = tf.keras.preprocessing.sequence.pad_sequences(data['labels'], maxlen=max_length, dtype='long', padding='post', truncating='post', value=-100)
        return {'input_ids': input_ids, 'attention_mask': attention_mask}, labels

    features, labels = align_and_truncate(tokenized_data)
    return tf.data.Dataset.from_tensor_slices((features, labels)).shuffle(10000).batch(batch_size)

train_dataset = prepare_dataset(tokenized_data)
test_dataset = prepare_dataset(tokenized_test_data)

# Custom loss pour ignorer les labels -100 lors du calcul de la perte
def custom_loss(y_true, y_pred):
    active_loss = tf.reshape(y_true, (-1,)) != -100
    reduced_logits = tf.boolean_mask(tf.reshape(y_pred, (-1, tf.shape(y_pred)[2])), active_loss)
    reduced_labels = tf.boolean_mask(tf.reshape(y_true, (-1,)), active_loss)
    return tf.keras.losses.sparse_categorical_crossentropy(reduced_labels, reduced_logits, from_logits=True)

optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=custom_loss)

# Entraînement du modèle
history = model.fit(train_dataset, epochs=1)

# Évaluation du modèle
eval_loss = model.evaluate(test_dataset)
print(f"Perte sur l'ensemble de test: {eval_loss}")


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForTokenClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForTokenClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able t

Perte sur l'ensemble de test: 0.05273259058594704


prediction and metrics

In [6]:
# Prédiction et calcul des métriques sur l'ensemble de test
label_map_inv = {v: k for k, v in label_map.items()}  # Inversion du mappage des labels

def align_predictions(predictions, label_ids):
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    out_label_list, preds_list = [], []
    for i in range(batch_size):
        out_label_list.append([label_map_inv.get(label_ids[i][j], "O") for j in range(seq_len) if label_ids[i][j] != -100])
        preds_list.append([label_map_inv.get(preds[i][j], "O") for j in range(seq_len) if label_ids[i][j] != -100])
    return preds_list, out_label_list

predictions, true_labels = [], []
for batch in test_dataset:
    logits = model.predict(batch[0])['logits']
    labels = batch[1].numpy()
    preds, labels = align_predictions(logits, labels)
    predictions.extend(preds)
    true_labels.extend(labels)

print(classification_report(true_labels, predictions))

# Extraction des entités à partir d'une phrase donnée
def extract_entities(sentence):
    inputs = tokenizer(sentence, return_tensors="tf", padding=True, truncation=True, max_length=128)
    predictions = model.predict([inputs["input_ids"], inputs["attention_mask"]])
    predicted_label_indices = np.argmax(predictions.logits, axis=2)[0]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"].numpy()[0])
    labels = [label_map_inv.get(idx, "O") for idx in predicted_label_indices]
    entities = {"person": [], "content": []}
    for token, label in zip(tokens, labels):
        if label in entities:
            entities[label].append(token)
    return entities

# Exemple d'utilisation de la fonction extract_entities (test pour voir si ça marche)
extracted_entities = extract_entities("Please remind Alice to bring the financial reports to the conference next Wednesday.")

result = {
    "job": "send_message",
    "receiver": " ".join(extracted_entities["person"]),
    "content": " ".join(extracted_entities["content"]),
}
print(result)






              precision    recall  f1-score   support

       erson       0.84      0.94      0.89       478
      ontent       0.60      0.56      0.58       149

   micro avg       0.79      0.85      0.82       627
   macro avg       0.72      0.75      0.74       627
weighted avg       0.79      0.85      0.82       627

{'job': 'send_message', 'receiver': 'alice', 'content': 'to bring the financial reports to the conference next wednesday [SEP]'}


API

In [46]:
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn

app = FastAPI()

class Sentence(BaseModel):
    test_sentence: str


# Simulez votre fonction d'extraction d'entités ici
# Remplacez cette logique par votre véritable fonction d'extraction d'entités
def extract_entities(sentence):
    # Exemple d'entités extraites. À remplacer par votre extraction réelle.
    return {
        "person": ["Alice"],  # Simule l'extraction de noms de personnes
        "content": ["financial reports"]  # Simule l'extraction de contenus
    }

@app.post("/process_sentence/")
async def process_sentence(sentence: Sentence):
    extracted_entities = extract_entities(sentence.test_sentence)
    
    # Formatage de la réponse selon les spécifications
    result = {
        "job": "send_message",
        "receiver": " ".join(extracted_entities.get("person", [])),
        "content": " ".join(extracted_entities.get("content", [])),
    }
    return result

# Point d'entrée pour Uvicorn si ce fichier est exécuté directement
if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8001)


INFO:     Started server process [32204]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8001 (Press CTRL+C to quit)


INFO:     127.0.0.1:57003 - "GET / HTTP/1.1" 404 Not Found
INFO:     127.0.0.1:57039 - "POST /process_sentence/ HTTP/1.1" 404 Not Found


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [32204]
