In [8]:
import pandas as pd

# Load the cleaned dataset; later cells read its 'email_cleaned', 'type',
# 'priority' and 'queue' columns.
data = pd.read_csv(filepath_or_buffer='dataset_cleaned.csv')


In [9]:
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize

# Identifier of the pretrained sentence-embedding checkpoint to load.
MODEL_NAME = 'sentence-transformers/all-mpnet-base-v2'

# Instantiate the encoder once; it is reused by the encoding cell below.
model = SentenceTransformer(MODEL_NAME)



In [6]:
# Encode every cleaned email into an embedding vector, 32 texts per batch,
# showing a progress bar while the model runs.
email_texts = data['email_cleaned'].tolist()
embeddings = model.encode(email_texts, batch_size=32, show_progress_bar=True)

Batches:   0%|          | 0/625 [00:00<?, ?it/s]

In [10]:
# Rescale each embedding (one per row) to unit L2 norm.
embeddings_normalized = normalize(embeddings, norm='l2', axis=1)

# Report the (n_samples, n_dimensions) shape of the normalized matrix.
print(f"Shape des embeddings : {embeddings_normalized.shape}")

Shape des embeddings : (20000, 768)


### Stockage dans ChromaDB

In [12]:
import chromadb
from chromadb.config import Settings

# Open (or create) the on-disk Chroma store.
# Telemetry is explicitly opted out via Settings: the telemetry client is what
# emits the "Failed to send telemetry event ..." errors seen in this cell's
# output, and this notebook has no use for it.
# NOTE(review): on some chromadb/posthog version pairs the log line may still
# appear even when opted out; pinning a compatible posthog is the fallback.
client = chromadb.PersistentClient(
    path="./chromadb/tickets_collection",
    settings=Settings(anonymized_telemetry=False),
)

# Create the collection on the first run, or fetch the existing one afterwards.
collection = client.get_or_create_collection(
    name="support_tickets",
    metadata={"description": "IT support emails embeddings"}
)


Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [None]:
from tqdm import tqdm

# Number of records sent to ChromaDB per call. The original author notes a
# per-call limit of 5461 records; 5000 stays below it.
BATCH_SIZE = 5000

# Total number of rows to store.
total_samples = len(data)

# Both sides are sliced positionally below, so the embedding matrix must have
# exactly one row per DataFrame row; otherwise documents, metadata and vectors
# would be paired incorrectly or truncated without a clear error.
if len(embeddings_normalized) != total_samples:
    raise ValueError(
        f"embeddings_normalized has {len(embeddings_normalized)} rows "
        f"but data has {total_samples}"
    )

for i in tqdm(range(0, total_samples, BATCH_SIZE)):
    end_idx = min(i + BATCH_SIZE, total_samples)

    # Extract the current batch (positional slicing on array and DataFrame).
    batch_embeddings = embeddings_normalized[i:end_idx].tolist()
    batch_documents = data['email_cleaned'].iloc[i:end_idx].tolist()
    batch_metadatas = data[['type', 'priority', 'queue']].iloc[i:end_idx].to_dict('records')
    batch_ids = [f"ticket_{j}" for j in range(i, end_idx)]

    # Write to ChromaDB with upsert rather than add: the collection is
    # persistent and obtained with get_or_create_collection, so on a re-run
    # these ids may already exist. upsert inserts new ids and overwrites
    # existing ones, which keeps this cell safe to execute repeatedly
    # (add would skip or reject existing ids, depending on the Chroma version).
    collection.upsert(
        embeddings=batch_embeddings,
        documents=batch_documents,
        metadatas=batch_metadatas,
        ids=batch_ids
    )

  0%|          | 0/4 [00:00<?, ?it/s]Failed to send telemetry event CollectionAddEvent: capture() takes 1 positional argument but 3 were given
100%|██████████| 4/4 [00:41<00:00, 10.28s/it]


### Sauvegarde des embeddings (pour l'entraînement)

In [15]:
import numpy as np

# Persist the normalized embedding matrix on its own for later use.
np.save('embeddings.npy', embeddings_normalized)

# Attach one embedding vector per row to the dataset, then pickle the
# enriched DataFrame.
data['embedding'] = [vector for vector in embeddings_normalized]
data.to_pickle('data_with_emb.pkl')