In [137]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import re
import html
import unicodedata

import tqdm

from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

from sklearn.manifold import TSNE
import plotly.express as px  # Add this import for Plotly


# Or if that doesn't work, try this (adjust path as needed):
import sys
sys.path.append('..')  # Add current directory to path

# ----- LLM Cluster Labeling -----
from src.llm.openrouter import openrouter_llm_api_call
import random




# Set up display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_colwidth', 100)

In [138]:
# Define the project and path
ACTIVE_PROJECT = "Projet Demo"
# embeddings_path = os.path.join('data', "Projects", ACTIVE_PROJECT, 'emails_with_embeddings_sample.pkl')
embeddings_path = os.path.join('data', "Projects", ACTIVE_PROJECT, 'emails_with_embeddings_improved.pkl')


# Load the data
df = pd.read_pickle(embeddings_path)
print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

DataFrame shape: (18978, 4)

Columns: ['message_id', 'body', 'original_body', 'embeddings']


In [139]:
# Check how many emails have embeddings
has_embeddings = df['embeddings'].notna()
embedding_count = has_embeddings.sum()
empty_count = (~has_embeddings).sum()

print(f"Emails with embeddings: {embedding_count} ({embedding_count/len(df):.2%})")
print(f"Emails without embeddings: {empty_count} ({empty_count/len(df):.2%})")

Emails with embeddings: 16198 (85.35%)
Emails without embeddings: 2780 (14.65%)


In [140]:
df_emails_with_embeddings = df[has_embeddings]
df_emails_with_embeddings.shape

(16198, 4)

In [141]:

def preprocess_email_text(text):
    """Applies all preprocessing steps to an email body text"""
    if pd.isna(text) or not isinstance(text, str):
        return ""

    # Step 1: Clean formatting issues
    # Decode HTML entities (like &nbsp;)
    text = html.unescape(text)

    # Normalize Unicode (convert different forms to standard form)
    text = unicodedata.normalize('NFKC', text)

    # Replace problematic non-breaking spaces with regular spaces
    text = text.replace('\xa0', ' ')

    # Convert multiple spaces to single space
    text = re.sub(r'\s+', ' ', text)

    # Remove extra line breaks
    text = re.sub(r'\n\s*\n\s*\n+', '\n\n', text)

    text = text.strip()

    # Step 2: Remove nested conversations
    # Common email forwarding/reply patterns
    patterns = [
        r'-----Original Message-----.*',
        r'From:.*?Sent:.*?To:.*?Subject:.*?',
        r'De\s*:.*?Envoyé\s*:.*?À\s*:.*?',  # French version
        r'Von:.*?Gesendet:.*?An:.*?Betreff:.*?',  # German version
        r'On.*wrote:.*',
        r'Le.*a écrit :.*',  # French version
        r'>.*',  # Quoted text in replies
    ]

    # Try to find the first occurrence of any pattern
    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if match:
            # Keep only the text before the nested content
            text = text[:match.start()].strip()

    # Step 3: Truncate at sentence end
    max_length = 8000
    if len(text) <= max_length:
        return text

    # Cut at max_length
    truncated = text[:max_length]

    # Find the last sentence boundary (., !, ?)
    sentence_ends = list(re.finditer(r'[.!?]["\'\)\]]?\s+', truncated))
    if sentence_ends:
        # Use the position of the last found sentence end
        last_end = sentence_ends[-1]
        return truncated[:last_end.end()].strip()

    # If there are no sentence boundaries at all, just return the truncated text
    return truncated.strip()

In [142]:
df_emails_with_embeddings['processed_body'] = df_emails_with_embeddings['body'].apply(preprocess_email_text)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [143]:
# Assuming df is your DataFrame with the embeddings column

# Define a function to extract just the embedding array
def extract_embedding_values(embedding_obj):
    if pd.isna(embedding_obj):
        return None

    # Check if the embedding is a dictionary with the 'embedding' key
    if isinstance(embedding_obj, dict) and 'embedding' in embedding_obj:
        return embedding_obj['embedding']

    # Return the embedding as-is if it's already just the array
    return embedding_obj

# Apply the function to create the new column
df_emails_with_embeddings['embeddings_values'] = df_emails_with_embeddings['embeddings'].apply(extract_embedding_values)

# Verify the new column
if not df_emails_with_embeddings['embeddings_values'].isna().all():
    print(f"Successfully extracted embedding values with shape: {len(df_emails_with_embeddings['embeddings_values'].dropna().iloc[0])}")
else:
    print("No valid embeddings were found")


Successfully extracted embedding values with shape: 512




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [144]:
df_emails_with_embeddings.head(2)

Unnamed: 0,message_id,body,original_body,embeddings,processed_body,embeddings_values
0,<8aa028c40ef7146ed18990fc0c9e7a19@archivistes.be>,"Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","Bonjour,\nJe vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de ...","{'object': 'embedding', 'index': 0, 'embedding': [0.15065436, -0.16268617, 0.099050604, 0.034771...","Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","[0.15065436, -0.16268617, 0.099050604, 0.03477189, -0.003733333, -0.12344437, -0.033047672, 0.08..."
1,<CE817F92-BE3D-42A0-A833-A08C3B1E7C4A@free.fr>,"Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","{'object': 'embedding', 'index': 1, 'embedding': [0.14858536, -0.17183514, 0.08667003, 0.0430086...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","[0.14858536, -0.17183514, 0.08667003, 0.043008618, 0.0032017394, -0.12493396, -0.0075426535, 0.1..."


In [145]:
df_emails_with_embeddings

Unnamed: 0,message_id,body,original_body,embeddings,processed_body,embeddings_values
0,<8aa028c40ef7146ed18990fc0c9e7a19@archivistes.be>,"Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","Bonjour,\nJe vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de ...","{'object': 'embedding', 'index': 0, 'embedding': [0.15065436, -0.16268617, 0.099050604, 0.034771...","Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","[0.15065436, -0.16268617, 0.099050604, 0.03477189, -0.003733333, -0.12344437, -0.033047672, 0.08..."
1,<CE817F92-BE3D-42A0-A833-A08C3B1E7C4A@free.fr>,"Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","{'object': 'embedding', 'index': 1, 'embedding': [0.14858536, -0.17183514, 0.08667003, 0.0430086...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","[0.14858536, -0.17183514, 0.08667003, 0.043008618, 0.0032017394, -0.12493396, -0.0075426535, 0.1..."
2,<01ea01d700bc$2453c3c0$6cfb4b40$@archivistes.qc.ca>,"Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","{'object': 'embedding', 'index': 2, 'embedding': [0.14658116, -0.16300967, 0.1005806, 0.03991575...","Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","[0.14658116, -0.16300967, 0.1005806, 0.039915755, -0.03623631, -0.12074596, -0.02459933, 0.10424..."
3,<014d01d6f97f$b6450df0$22cf29d0$@archivistes.qc.ca>,"Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","{'object': 'embedding', 'index': 3, 'embedding': [0.17201696, -0.16402929, 0.112226546, 0.046063...","Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","[0.17201696, -0.16402929, 0.112226546, 0.046063617, -0.018504383, -0.12524809, -0.03254767, 0.09..."
4,<cf77b68bfb2844369ec522a5aac4205a@archivistes.org>,"Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAsso...","Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAs...","{'object': 'embedding', 'index': 4, 'embedding': [0.151289, -0.16139962, 0.107311085, 0.04889077...","Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAsso...","[0.151289, -0.16139962, 0.107311085, 0.04889077, -0.036429998, -0.11085595, -0.013119226, 0.1119..."
...,...,...,...,...,...,...
18973,<a09073a0f0334be8bd8a888a9995e15a@nouvelle-aquitaine.fr>,Présente ! Myriam FAVREAU Chargée de mission études et recherches Site de Poitiers Service Patri...,Présente !\nMyriam FAVREAU\nChargée de mission études et recherches\nSite de Poitiers\nService P...,"{'object': 'embedding', 'index': 3, 'embedding': [0.119224735, -0.1908262, 0.16395493, 0.1080550...",Présente ! Myriam FAVREAU Chargée de mission études et recherches Site de Poitiers Service Patri...,"[0.119224735, -0.1908262, 0.16395493, 0.108055055, -0.100029305, -0.08729087, -0.03536176, 0.114..."
18974,<f3f3dd29a5d4433990d42f9b02efea27@mairie-lyon.fr>,"Bonjour, Je serai là. Bonne journée. Louis Faivre d'Arcier Directeur des Archives municipales de...","Bonjour,\nJe serai là.\nBonne journée.\nLouis Faivre d'Arcier\nDirecteur des Archives municipale...","{'object': 'embedding', 'index': 4, 'embedding': [0.124798544, -0.18187873, 0.15561144, 0.112304...","Bonjour, Je serai là. Bonne journée. Louis Faivre d'Arcier Directeur des Archives municipales de...","[0.124798544, -0.18187873, 0.15561144, 0.11230402, -0.09190622, -0.09645757, -0.022378646, 0.118..."
18975,<413958f15c4f468298ba9389ecbdbfbf@archivistes.org>,"Bonjour La jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par ...","Bonjour\nLa jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par...","{'object': 'embedding', 'index': 5, 'embedding': [0.13658817, -0.18525131, 0.15825215, 0.1021218...","Bonjour La jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par ...","[0.13658817, -0.18525131, 0.15825215, 0.10212182, -0.08291965, -0.10004355, -0.026666656, 0.1128..."
18976,<1402ba6a-8a47-2b46-33a8-acb468a11a1e@gmail.com>,Bonjour à toutes,"Bonjour à toutes les\ntrois,\nMalheureusement impossible pour moi, car je serai dans un avion de...","{'object': 'embedding', 'index': 6, 'embedding': [0.11375994, -0.16344212, 0.16088776, 0.0747878...",Bonjour à toutes,"[0.11375994, -0.16344212, 0.16088776, 0.07478782, -0.050877243, -0.105056904, -0.018680988, 0.14..."


In [146]:
df_emails_with_embeddings["embeddings"]

0        {'object': 'embedding', 'index': 0, 'embedding': [0.15065436, -0.16268617, 0.099050604, 0.034771...
1        {'object': 'embedding', 'index': 1, 'embedding': [0.14858536, -0.17183514, 0.08667003, 0.0430086...
2        {'object': 'embedding', 'index': 2, 'embedding': [0.14658116, -0.16300967, 0.1005806, 0.03991575...
3        {'object': 'embedding', 'index': 3, 'embedding': [0.17201696, -0.16402929, 0.112226546, 0.046063...
4        {'object': 'embedding', 'index': 4, 'embedding': [0.151289, -0.16139962, 0.107311085, 0.04889077...
                                                        ...                                                 
18973    {'object': 'embedding', 'index': 3, 'embedding': [0.119224735, -0.1908262, 0.16395493, 0.1080550...
18974    {'object': 'embedding', 'index': 4, 'embedding': [0.124798544, -0.18187873, 0.15561144, 0.112304...
18975    {'object': 'embedding', 'index': 5, 'embedding': [0.13658817, -0.18525131, 0.15825215, 0.1021218...
18976    {'object':

In [147]:
index_counts = df_emails_with_embeddings['embeddings'].apply(lambda x: x['index']).value_counts()
index_counts

embeddings
0     272
1     272
2     272
3     272
4     272
     ... 
65    130
66    122
67    116
68    114
69    107
Name: count, Length: 70, dtype: int64

# Topic Embeddings

In [148]:
def convert_embedding_to_array(embedding):
    if pd.isna(embedding):
        return None
    if isinstance(embedding, dict) and 'embedding' in embedding:
        return embedding['embedding']
    return embedding


In [149]:
# ----- Cluster naming using TF-IDF -----
def get_top_keywords_per_cluster(docs, labels, top_n=30):
    # Ensure docs and labels have the same length
    if len(docs) != len(labels):
        min_len = min(len(docs), len(labels))
        docs = docs[:min_len]
        labels = labels[:min_len]
        print(f"Warning: Truncated data to {min_len} samples to match lengths")

    cluster_names = {}
    # Store frequent words for each cluster for LLM processing
    cluster_freq_words = {}

    for cluster_id in sorted(set(labels)):
        cluster_docs = [docs[i] for i in range(len(docs)) if labels[i] == cluster_id]

        # Convert frozenset to list for stop_words parameter
        french_stop_words = list(text.ENGLISH_STOP_WORDS) + [
            "cette", "cet", "ces", "ça", "ce" "où", "être", "avoir", "aussi", "comme",
            "plus", "moins", "très", "sans", "entre", "leur", "leurs", "donc",
            "ainsi", "etc", "que", "quel", "quelle", "quelles", "quels", "tout", "toute", "toutes",
            "je", "tu", "il", "elle", "nous", "vous", "ils", "elles",
            "mon", "ma", "mes", "me", "ton", "ta", "tes", "son", "sa", "ses",
            "notre", "nos", "votre", "vos", "bonjour", "et", "est", "aïe",
            "lui", "y", "en", "le", "la", "les", "un", "une", "des",
            "du", "de", "d'", "à", "au", "aux", "pour", "par", "avec",
            "sous", "sur", "dans", "vers", "depuis", "avant",
            "après", "parmi", "contre", "selon", "malgré", "auprès",
            "au-delà", "envers", "à travers", "auprès de",
            "à côté de", "en dehors de", "à l'intérieur de", "au lieu de",
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "0",
            "10", "11", "12", "13", "14", "15", "16", "17", "18", "19",
            "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
            "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
            "40", "41", "42", "43", "44", "45", "46", "47", "48", "49",
            "50", "51", "52", "53", "54", "55", "56", "57", "58", "59",
            "60", "61", "62", "63", "64", "65", "66", "67", "68", "69",
            "70", "71", "72", "73", "74", "75", "76", "77", "78", "79",
            "80", "81", "82", "83", "84", "85", "86", "87", "88", "89",
            "90", "91", "92", "93", "94", "95", "96", "97", "98", "99",
            "00", "000", "0000", "00000", "000000", "0000000",
            "01", "02", "03", "04", "05", "06", "07", "08", "09",
            "merci", "bonsoir", "salut", "bien", "oui", "non", "peut-être",
            "s'il", "moi", "toi", "elle",
            "eux", "toi-même", "lui-même", "elle-même",
            "pas", "ne", "ni", "aucun", "aucune", "nul", "nulle",
            "rien", "personne", "quelqu'un", "quelque chose",
            "anne", "org", "com", "fr", "net", "info", "biz", "eu", "co",
            "uk", "it", "es", "de", "jp", "cn", "ru", "br", "in",
            "au", "ca", "mx", "za", "kr", "se", "no", "fi", "dk",
            "pl", "cz", "sk", "hu", "ro", "bg", "gr", "tr", "il",
            "ae", "sa", "qa", "kw", "om", "bh", "eg", "ma", "dz",
            "tn", "ly", "jo", "lb", "sy", "ye", "iq", "ir", "pk",
            "af", "bd", "lk", "np", "mm", "kh", "la", "vn", "th",
            "my", "sg", "ph", "hk", "tw", "jp", "kr", "au", "nz",
            "us", "ca", "mx", "br", "ar", "cl", "co", "pe", "ve",
            "ec", "uy", "py", "bo", "py", "do", "ht", "jm", "tt",
            "bz", "cr", "gt", "hn", "ni", "sv", "pa", "cu", "pr",
            "ai", "ce", "www", "http", "https", "ftp", "mailto", "telnet",
            "tel", "fax", "sms", "mms", "wap", "web", "www2", "www3",
            "marie", "jean", "paul", "pierre", "jacques", "sophie",
            "laurent", "nicolas", "françois", "philippe", "isabelle",
            "laura", "luc", "louis", "marc", "olivier", "vincent",
            "laurent", "catherine", "sylvie", "valérie", "caroline",
            "audrey", "céline", "marion", "claire", "sandra",
            "nathalie", "christine", "dominique", "elodie", "amelie",
            "audrey", "marie", "laurence", "sophie", "isabelle",
            "valérie", "caroline", "audrey", "marion", "claire",
            "sandra", "nathalie", "christine", "dominique", "elodie",
            "amelie", "laurence", "marie", "jean", "paul", "rue", "qui",
            "tous", "toutes", "tout", "toute", "tous", "toutes",
            "beaucoup", "mais", "bon", "aaf", "ok", "jégo", "semb",
            "suis", "sont", "sait", "sais", "savoir", "savoir-faire",
            "savoir-vivre", "savoir-être", "savoir-faire", "savoir-vivre",
            "savoir-être", "savoir-faire", "savoir-vivre", "savoir-être",
            "semble", "sembler",
            "ou", "si", "bonne", "français", "qu", "faire", "cela", "jégo75013", "était", "laure",
            "paristél", "français8", "orghttps", "lundi", "mardi", "mercredi", "jeudi", "vendredi",
            "samedi", "dimanche", "janvier", "février", "mars", "avril", "mai", "juin", "juillet",
            "août", "septembre", "octobre", "novembre", "décembre", "année", "mois", "semaine",
            "jour", "heure", "minute", "seconde", "avant-hier", "hier", "aujourd'hui", "demain",
            "après-demain", "peut", "pourquoi", "comment", "où", "quand", "qui", "que", "quoi",
            "te", "hélène", "end", "amarie",
            "fin", "daniel", "week", "bernard", "aff", "75013", "pourrait", "violaine",
            "céline"
        ]

        # Handle empty cluster case
        if not cluster_docs:
            cluster_freq_words[cluster_id] = []
            continue

        try:
            tfidf = TfidfVectorizer(stop_words=french_stop_words, max_features=1000)
            tfidf_matrix = tfidf.fit_transform(cluster_docs)
            summed = tfidf_matrix.sum(axis=0)
            top_indices = np.argsort(summed.A1)[::-1][:top_n]
            keywords = [tfidf.get_feature_names_out()[i] for i in top_indices]
            cluster_freq_words[cluster_id] = keywords
        except Exception as e:
            print(f"Error processing cluster {cluster_id}: {e}")
            cluster_freq_words[cluster_id] = []

    return cluster_freq_words

In [150]:
# Load your DataFrame with embeddings
# df_emails_with_embeddings = pd.read_pickle('data/Projects/Projet Demo/emails_with_embeddings.pkl')

# First, let's extract the embeddings into a proper numpy array

# Create embeddings_values column if it doesn't exist
if 'embeddings_values' not in df_emails_with_embeddings.columns:
    df_emails_with_embeddings['embeddings_values'] = df_emails_with_embeddings['embeddings'].apply(convert_embedding_to_array)

# Filter out rows with None/NaN embeddings
valid_embeddings_mask = df_emails_with_embeddings['embeddings_values'].notna()
filtered_df = df_emails_with_embeddings[valid_embeddings_mask].copy()

print(f"Working with {len(filtered_df)} emails that have valid embeddings")


# Convert the list of embeddings to a numpy array for clustering
embeddings_array = np.array(filtered_df['embeddings_values'].tolist())

# ----- Clustering -----
n_clusters = 8
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters_id_list = kmeans.fit_predict(embeddings_array)

clusters_id_list = [str(int(cluster_id)) for cluster_id in clusters_id_list]

# Convert cluster labels to strings if needed

print(clusters_id_list)
# Add cluster labels back to the DataFrame
filtered_df['cluster_id'] = clusters_id_list


Working with 16198 emails that have valid embeddings
['4', '4', '2', '2', '2', '2', '2', '4', '2', '2', '2', '2', '2', '3', '2', '6', '2', '6', '2', '6', '6', '6', '6', '6', '6', '5', '6', '6', '2', '2', '5', '6', '6', '5', '5', '6', '5', '5', '5', '6', '6', '6', '5', '6', '2', '6', '6', '5', '6', '6', '5', '5', '6', '5', '3', '4', '5', '6', '5', '4', '5', '5', '5', '5', '5', '5', '3', '4', '2', '2', '2', '2', '2', '2', '2', '3', '2', '4', '2', '2', '2', '3', '2', '6', '3', '6', '6', '5', '6', '5', '5', '5', '6', '5', '4', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '3', '5', '6', '5', '4', '5', '5', '6', '5', '6', '5', '5', '5', '5', '5', '5', '5', '5', '2', '4', '2', '4', '4', '2', '2', '4', '2', '4', '2', '2', '2', '3', '2', '4', '4', '2', '3', '2', '2', '2', '2', '2', '2', '6', '6', '6', '2', '2', '2', '6', '6', '6', '6', '5', '6', '6', '3', '2', '6', '6', '6', '6', '2', '6', '5', '6', '6', '6', '6', '6', '5', '

In [151]:
type(clusters_id_list[0])

str

In [None]:
# Get email texts from the filtered DataFrame
email_texts = filtered_df["body"].tolist()
cluster_labels_list = filtered_df["cluster_id"].tolist()

# Get cluster names and frequent words
cluster_freq_words = get_top_keywords_per_cluster(email_texts, clusters_id_list)
print(cluster_freq_words)


Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens ['aujourd', 'chose', 'côté', 'dehors', 'delà', 'hui', 'intérieur', 'lieu', 'même', 'quelqu', 'quelque', 'travers', 'vivre'] not in stop_words.



{'0': ['archivistes', 'guyon', 'association', 'présidente', 'paris', 'standard', 'presidence', 'amicalement', 'archives', 'message', 'cordialement', 'journée', 'été', 'article', 'fait', 'cher', 'texte', 'réunion', 'joint', 'ci', 'échanger', 'projet', 'deux', 'chère', 'documents', 'accès', 'conseil', 'parfait', 'question', 'amitiés'], '1': ['archivistes', 'guyon', 'paris', 'association', 'présidente', 'presidence', 'standard', 'amicalement', 'journée', 'message', 'archives', 'alice', 'parfait', 'fait', 'midi', 'réunion', 'administrateurs', 'bureau', 'deux', 'super', 'temps', 'peu', 'serait', 'voir', 'twitter', 'laisse', 'voici', 'plaisir', 'adhérents', 'peux'], '2': ['archivistes', 'journée', 'association', 'message', 'paris', 'guyon', 'archives', 'présidente', 'standard', 'formation', 'réunion', 'presidence', 'publications', 'site', 'fait', 'ligne', 'avons', 'origine', 'sujet', 'directe', 'mail', 'déléguée', 'également', 'myriam', 'bureau', 'temps', 'été', 'midi', 'amicalement', 'peux'

In [153]:
filtered_df

Unnamed: 0,message_id,body,original_body,embeddings,processed_body,embeddings_values,cluster_id
0,<8aa028c40ef7146ed18990fc0c9e7a19@archivistes.be>,"Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","Bonjour,\nJe vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de ...","{'object': 'embedding', 'index': 0, 'embedding': [0.15065436, -0.16268617, 0.099050604, 0.034771...","Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","[0.15065436, -0.16268617, 0.099050604, 0.03477189, -0.003733333, -0.12344437, -0.033047672, 0.08...",4
1,<CE817F92-BE3D-42A0-A833-A08C3B1E7C4A@free.fr>,"Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","{'object': 'embedding', 'index': 1, 'embedding': [0.14858536, -0.17183514, 0.08667003, 0.0430086...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","[0.14858536, -0.17183514, 0.08667003, 0.043008618, 0.0032017394, -0.12493396, -0.0075426535, 0.1...",4
2,<01ea01d700bc$2453c3c0$6cfb4b40$@archivistes.qc.ca>,"Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","{'object': 'embedding', 'index': 2, 'embedding': [0.14658116, -0.16300967, 0.1005806, 0.03991575...","Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","[0.14658116, -0.16300967, 0.1005806, 0.039915755, -0.03623631, -0.12074596, -0.02459933, 0.10424...",2
3,<014d01d6f97f$b6450df0$22cf29d0$@archivistes.qc.ca>,"Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","{'object': 'embedding', 'index': 3, 'embedding': [0.17201696, -0.16402929, 0.112226546, 0.046063...","Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","[0.17201696, -0.16402929, 0.112226546, 0.046063617, -0.018504383, -0.12524809, -0.03254767, 0.09...",2
4,<cf77b68bfb2844369ec522a5aac4205a@archivistes.org>,"Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAsso...","Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAs...","{'object': 'embedding', 'index': 4, 'embedding': [0.151289, -0.16139962, 0.107311085, 0.04889077...","Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAsso...","[0.151289, -0.16139962, 0.107311085, 0.04889077, -0.036429998, -0.11085595, -0.013119226, 0.1119...",2
...,...,...,...,...,...,...,...
18973,<a09073a0f0334be8bd8a888a9995e15a@nouvelle-aquitaine.fr>,Présente ! Myriam FAVREAU Chargée de mission études et recherches Site de Poitiers Service Patri...,Présente !\nMyriam FAVREAU\nChargée de mission études et recherches\nSite de Poitiers\nService P...,"{'object': 'embedding', 'index': 3, 'embedding': [0.119224735, -0.1908262, 0.16395493, 0.1080550...",Présente ! Myriam FAVREAU Chargée de mission études et recherches Site de Poitiers Service Patri...,"[0.119224735, -0.1908262, 0.16395493, 0.108055055, -0.100029305, -0.08729087, -0.03536176, 0.114...",1
18974,<f3f3dd29a5d4433990d42f9b02efea27@mairie-lyon.fr>,"Bonjour, Je serai là. Bonne journée. Louis Faivre d'Arcier Directeur des Archives municipales de...","Bonjour,\nJe serai là.\nBonne journée.\nLouis Faivre d'Arcier\nDirecteur des Archives municipale...","{'object': 'embedding', 'index': 4, 'embedding': [0.124798544, -0.18187873, 0.15561144, 0.112304...","Bonjour, Je serai là. Bonne journée. Louis Faivre d'Arcier Directeur des Archives municipales de...","[0.124798544, -0.18187873, 0.15561144, 0.11230402, -0.09190622, -0.09645757, -0.022378646, 0.118...",1
18975,<413958f15c4f468298ba9389ecbdbfbf@archivistes.org>,"Bonjour La jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par ...","Bonjour\nLa jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par...","{'object': 'embedding', 'index': 5, 'embedding': [0.13658817, -0.18525131, 0.15825215, 0.1021218...","Bonjour La jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par ...","[0.13658817, -0.18525131, 0.15825215, 0.10212182, -0.08291965, -0.10004355, -0.026666656, 0.1128...",1
18976,<1402ba6a-8a47-2b46-33a8-acb468a11a1e@gmail.com>,Bonjour à toutes,"Bonjour à toutes les\ntrois,\nMalheureusement impossible pour moi, car je serai dans un avion de...","{'object': 'embedding', 'index': 6, 'embedding': [0.11375994, -0.16344212, 0.16088776, 0.0747878...",Bonjour à toutes,"[0.11375994, -0.16344212, 0.16088776, 0.07478782, -0.050877243, -0.105056904, -0.018680988, 0.14...",3


In [154]:
cluster_freq_words

{'0': ['archivistes',
  'guyon',
  'association',
  'présidente',
  'paris',
  'standard',
  'presidence',
  'amicalement',
  'archives',
  'message',
  'cordialement',
  'journée',
  'été',
  'article',
  'fait',
  'cher',
  'texte',
  'réunion',
  'joint',
  'ci',
  'échanger',
  'projet',
  'deux',
  'chère',
  'documents',
  'accès',
  'conseil',
  'parfait',
  'question',
  'amitiés'],
 '1': ['archivistes',
  'guyon',
  'paris',
  'association',
  'présidente',
  'presidence',
  'standard',
  'amicalement',
  'journée',
  'message',
  'archives',
  'alice',
  'parfait',
  'fait',
  'midi',
  'réunion',
  'administrateurs',
  'bureau',
  'deux',
  'super',
  'temps',
  'peu',
  'serait',
  'voir',
  'twitter',
  'laisse',
  'voici',
  'plaisir',
  'adhérents',
  'peux'],
 '2': ['archivistes',
  'journée',
  'association',
  'message',
  'paris',
  'guyon',
  'archives',
  'présidente',
  'standard',
  'formation',
  'réunion',
  'presidence',
  'publications',
  'site',
  'fait',
 

In [155]:

def get_labels_from_llm(email_texts, clusters_id_list, freq_words_dict, mails_read_by_cluster=20):
    # Prepare all clusters data for a single prompt
    all_clusters_info = {}

    for cluster_id in freq_words_dict:
        # Get sample emails for this cluster and shuffle them
        cluster_mails_indices = [i for i in range(len(email_texts))
                                if clusters_id_list[i] == cluster_id]
        # Shuffle the indices to get random emails
        # REMOVING SHUFFLING TO EXECUTE IT AT START OF DF LOADING
        # random.shuffle(cluster_mails_indices)
        # Take only up to mails_read_by_cluster emails
        cluster_mails = [email_texts[i] for i in cluster_mails_indices[:mails_read_by_cluster]]

        # Store the data for this cluster
        all_clusters_info[cluster_id] = {
            "freq_words": freq_words_dict[cluster_id],
            "sample_mails": cluster_mails
        }

    system_prompt = fr"""You are a french language expert tasked with attributing labels to multiple TF-IDF clusters at once.
    The clusters data can be found between the tags <clusters_data></clusters_data>.
    Each cluster has its frequent words and sample emails to help you find appropriate labels.

    To execute your task perfectly, you MUST follow these rules exactly:

    <rules>
    1. You must return ONLY a valid JSON object with NO additional text before or after.
    2. The JSON must use double quotes (") not single quotes (').
    3. Every cluster ID must be a string key in the JSON like "0", "1", etc.
    4. Every label must be a short French phrase (1-4 words).
    </rules>

    DO NOT include any explanation or other text in your response. ONLY return the JSON object.
    """

    # Build a single user prompt with all clusters
    user_prompt = "<clusters_data>\n"

    for cluster_id, data in all_clusters_info.items():
        user_prompt += f"CLUSTER {cluster_id}:\n"
        user_prompt += f"Frequent words: {data['freq_words']}\n"
        user_prompt += "Sample emails:\n"

        for i, mail in enumerate(data['sample_mails']):
            # Truncate long emails to prevent token explosion
            truncated_mail = mail[:300] + "..." if len(mail) > 300 else mail
            user_prompt += f"- Email {i+1}: {truncated_mail}\n"

        user_prompt += "\n---\n\n"

    user_prompt += "</clusters_data>"

    model = "google/gemini-flash-1.5"

    # Call the LLM API once with all clusters
    response = openrouter_llm_api_call(system_prompt, user_prompt, model)
    print(f"LLM response: {response}")
    print(type(response))
    # Parse the JSON response
    try:
        import json
        llm_labels = json.loads(response)
        print(f"Successfully obtained labels for {len(llm_labels)} clusters at once")
    except json.JSONDecodeError:
        print("Error parsing LLM response as JSON.")
        return False
        # # Fallback to basic labels
        # llm_labels = {str(cluster_id): f"Groupe {cluster_id}" for cluster_id in freq_words_dict.keys()}

    return llm_labels

In [156]:
# ----- LLM Cluster Labeling -----

# Get LLM-generated labels for each cluster
llm_cluster_labels = get_labels_from_llm(email_texts, cluster_labels_list, cluster_freq_words)
print(llm_cluster_labels)


{
    "0": "Gestion des archives",
    "1": "Réunions administratives",
    "2": "Événements archivistiques",
    "3": "Messages amicaux",
    "4": "Informations partagées",
    "5": "Discussions internes",
    "6": "Communication sur le site",
    "7": "Défense des archives"
}
LLM response: {
    "0": "Gestion des archives",
    "1": "Réunions administratives",
    "2": "Événements archivistiques",
    "3": "Messages amicaux",
    "4": "Informations partagées",
    "5": "Discussions internes",
    "6": "Communication sur le site",
    "7": "Défense des archives"
}
<class 'str'>
Successfully obtained labels for 8 clusters at once
{'0': 'Gestion des archives', '1': 'Réunions administratives', '2': 'Événements archivistiques', '3': 'Messages amicaux', '4': 'Informations partagées', '5': 'Discussions internes', '6': 'Communication sur le site', '7': 'Défense des archives'}


In [157]:
# When mapping to cluster names, convert float->int->str
filtered_df['cluster_name'] = filtered_df['cluster_id'].map(llm_cluster_labels)
print(filtered_df['cluster_name'].value_counts())

# Store clean LLM labels separately (without the keywords)
clean_llm_labels = {k: v for k, v in llm_cluster_labels.items()}

cluster_name
Communication sur le site    2924
Réunions administratives     2511
Événements archivistiques    2365
Discussions internes         2269
Gestion des archives         1898
Défense des archives         1699
Informations partagées       1576
Messages amicaux              956
Name: count, dtype: int64


In [158]:
# Make sure cluster_names has all the keys from llm_cluster_labels
# for cluster_id in llm_cluster_labels.keys():
#     if cluster_id not in clusters_id_list:
#         clusters_id_list[cluster_id] = f"Cluster {cluster_id}"

# clusters_id_list
# Update cluster names with LLM labels
# for cluster_id, llm_label in llm_cluster_labels.items():
#     print(cluster_id, llm_label)
#     clusters_id_list[cluster_id] = f"{llm_label}: {clusters_id_list[cluster_id]}"

In [159]:
df_emails_with_embeddings["embeddings"][0]

{'object': 'embedding',
 'index': 0,
 'embedding': [0.15065436,
  -0.16268617,
  0.099050604,
  0.03477189,
  -0.003733333,
  -0.12344437,
  -0.033047672,
  0.08639388,
  0.07059558,
  0.036801767,
  0.0110140145,
  0.17315991,
  0.0028508403,
  -0.09954481,
  -0.12614585,
  -0.033550046,
  -0.16921239,
  0.059269827,
  0.0220447,
  -0.024569465,
  -0.09214796,
  0.025691787,
  -0.16790995,
  0.049062815,
  -0.1323379,
  0.078855,
  0.0066578095,
  0.046525907,
  -0.109608896,
  0.07837187,
  0.08325244,
  -0.069748595,
  -0.046596613,
  -0.07375803,
  -0.0038205849,
  0.035307538,
  0.09114347,
  0.007004371,
  -0.06440844,
  -0.017492728,
  -0.0040773703,
  -0.048282266,
  0.001819156,
  -0.030808378,
  0.033511803,
  0.040530127,
  -0.03479831,
  0.0062421206,
  -0.024664871,
  0.021384241,
  -0.015254021,
  -0.0012801505,
  -0.013874431,
  0.06111627,
  0.05350498,
  -0.04850192,
  -0.04800082,
  -0.04641629,
  -0.01694679,
  -0.026982069,
  -0.064812176,
  0.045197316,
  -0.012402

In [160]:
# ----- TSNE -----
# Adjust perplexity based on the number of data points
perplexity = min(30, max(5, len(filtered_df) // 10))
print(f"Using TSNE with perplexity={perplexity}")

tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
embeddings_2d = tsne.fit_transform(embeddings_array)

Using TSNE with perplexity=30


In [161]:
# # ----- Print cluster labels -----
# print("\n📚 Cluster labels:")
# for cluster_id, name in clean_llm_labels.items():
#     # Count how many emails are in each cluster
#     count = np.sum(clusters_id_list == cluster_id)
#     print("Cluster id:", cluster_id)
#     print("Cluster label:", clusters_id_list)

#     print(f"Cluster {cluster_id} ({count} emails): {name}")


In [162]:
# Save the results to the DataFrame
df_emails_with_embeddings['cluster_id'] = np.nan
df_emails_with_embeddings.loc[valid_embeddings_mask, 'cluster_id'] = clusters_id_list
df_emails_with_embeddings['cluster_name'] = filtered_df['cluster_id'].map(llm_cluster_labels)
df_emails_with_embeddings.to_pickle('data/Projects/Projet Demo/emails_with_clusters.pkl')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value '['4', '4', '2', '2', '2', '2', '2', '4', '2', '2', '2', '2', '2', '3', '2', '6', '2', '6', '2', '6', '6', '6', '6', '6', '6', '5', '6', '6', '2', '2', '5', '6', '6', '5', '5', '6', '5', '5', '5', '6', '6', '6', '5', '6', '2', '6', '6', '5', '6', '6', '5', '5', '6', '5', '3', '4', '5', '6', '5', '4', '5', '5', '5', '5', '5', '5', '3', '4', '2', '2', '2', '2', '2', '2', '2', '3', '2', '4', '2', '2', '2', '3', '2', '6', '3', '6', '6', '5', '6', '5', '5', '5', '6', '5', '4', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '6', '3', '5', '6', '5', '4', '5'

In [163]:

# ----- Plotly Interactive Visualization with Discrete Color Scale -----
# Create a DataFrame for plotly with cluster as string to use discrete colors
tsne_df = pd.DataFrame({
    'TSNE1': embeddings_2d[:, 0],
    'TSNE2': embeddings_2d[:, 1],
    'cluster_id': clusters_id_list,  # Keep numeric ID for reference
    'cluster': [f"Cluster {i}" for i in clusters_id_list],  # Convert to string for discrete color scale
    'subject': filtered_df['subject'].values if 'subject' in filtered_df.columns else [f"Email {i}" for i in range(len(embeddings_2d))]
})

# Add only the LLM labels to the DataFrame for hover info (without "Cluster X:" prefix)
tsne_df['cluster_label'] = tsne_df['cluster_id'].apply(lambda x: clean_llm_labels.get(x, "Unlabeled"))

# Create a dictionary mapping cluster labels to just their LLM names for the legend
cluster_names_dict = {f"Cluster {k}": v for k, v in clean_llm_labels.items()}

# Create an interactive scatter plot with discrete color scale
fig = px.scatter(
    tsne_df,
    x='TSNE1',
    y='TSNE2',
    color='cluster',
    hover_data=['subject', 'cluster_label'],
    title='Interactive Email Clusters (TSNE)',
    color_discrete_sequence=px.colors.qualitative.Vivid,  # Use a discrete color sequence
    category_orders={'cluster': [f"Cluster {i}" for i in range(n_clusters)]}  # Ensure consistent ordering
)

# Update hover template to show subject and LLM label (not cluster number)
fig.update_traces(
    hovertemplate="<b>%{customdata[0]}</b><br>%{customdata[1]}<extra></extra>",
    customdata=np.column_stack((
        tsne_df['subject'],
        tsne_df['cluster_id'].apply(lambda x: clean_llm_labels.get(x, f"Unlabeled"))
    )),
)

# Improve layout with better legend positioning and formatting
fig.update_layout(
    legend_title_text='Clusters',
    xaxis_title='TSNE Dimension 1',
    yaxis_title='TSNE Dimension 2',
    legend=dict(
        title_font=dict(size=14),
        font=dict(size=12),
        itemsizing='constant',  # Make legend items consistent size
        orientation='v',  # Vertical orientation
        yanchor="top",
        y=1.0,
        xanchor="right",
        x=1.15,  # Position legend outside the plot area
        bordercolor="Black",
        borderwidth=1
    ),
    # Make the plot area slightly smaller to accommodate the legend
    margin=dict(r=150),  # Add right margin for legend
)

# Update the legend labels to show only the LLM-generated names without "Cluster X:" prefix
for i, label in enumerate(fig.data):
    # print(label)
    cluster_id = label.name.replace('Cluster ', '')
    print(cluster_id)
    new_name = f"{clean_llm_labels.get(cluster_id, '')}"
    label.name = new_name
    print(label)

# Save as HTML file for interactive viewing
fig.write_html('email_clusters_interactive.html')
print("Created interactive visualization: email_clusters_interactive.html")

0
Scattergl({
    'customdata': array([['Email 0', 'Informations partagées'],
                         ['Email 1', 'Informations partagées'],
                         ['Email 2', 'Événements archivistiques'],
                         ...,
                         ['Email 16195', 'Réunions administratives'],
                         ['Email 16196', 'Messages amicaux'],
                         ['Email 16197', 'Réunions administratives']], dtype=object),
    'hovertemplate': '<b>%{customdata[0]}</b><br>%{customdata[1]}<extra></extra>',
    'legendgroup': 'Cluster 0',
    'marker': {'color': 'rgb(229, 134, 6)', 'symbol': 'circle'},
    'mode': 'markers',
    'name': 'Gestion des archives',
    'showlegend': True,
    'x': array([ 15.863159, -10.864862,  12.839756, ..., -58.77794 , -58.94043 ,
                 41.50663 ], dtype=float32),
    'xaxis': 'x',
    'y': array([64.36893 , 28.733082, 41.944027, ..., 52.59011 , 52.891815, 14.232793],
               dtype=float32),
    'yaxis': 'y'


In [164]:
print("1364 :",filtered_df.iloc[1364][["cluster_id", "cluster_name"]])
print("1384 :",filtered_df.iloc[1384][["cluster_id", "cluster_name"]])

1364 : cluster_id                           4
cluster_name    Informations partagées
Name: 1765, dtype: object
1384 : cluster_id                              6
cluster_name    Communication sur le site
Name: 1785, dtype: object


In [165]:
clean_llm_labels.get('3', '')

'Messages amicaux'

In [166]:
# After running t-SNE and before visualization
# Print the first few emails with their cluster assignments and 2D coordinates
for i in range(min(5, len(embeddings_2d))):
    print(f"Email {i}: Cluster {clusters_id_list[i]} - Position: {embeddings_2d[i]}")

Email 0: Cluster 4 - Position: [ 40.75254  -60.693478]
Email 1: Cluster 4 - Position: [ 40.79669  -60.765213]
Email 2: Cluster 2 - Position: [ 41.348373 -60.643833]
Email 3: Cluster 2 - Position: [ 41.269936 -60.531197]
Email 4: Cluster 2 - Position: [ 41.769188 -60.509594]


In [167]:
label

Scattergl({
    'customdata': array([['Email 0', 'Informations partagées'],
                         ['Email 1', 'Informations partagées'],
                         ['Email 2', 'Événements archivistiques'],
                         ...,
                         ['Email 16195', 'Réunions administratives'],
                         ['Email 16196', 'Messages amicaux'],
                         ['Email 16197', 'Réunions administratives']], dtype=object),
    'hovertemplate': '<b>%{customdata[0]}</b><br>%{customdata[1]}<extra></extra>',
    'legendgroup': 'Cluster 7',
    'marker': {'color': 'rgb(47, 138, 196)', 'symbol': 'circle'},
    'mode': 'markers',
    'name': 'Défense des archives',
    'showlegend': True,
    'x': array([ 13.695633, -12.013175, -12.419   , ..., -66.6985  , -60.773136,
                -60.711647], dtype=float32),
    'xaxis': 'x',
    'y': array([ 1.2524035, -6.107307 , -5.8274293, ..., 62.446014 , 52.334713 ,
                52.609318 ], dtype=float32),
    'yaxis'

In [168]:
df_emails_with_embeddings

Unnamed: 0,message_id,body,original_body,embeddings,processed_body,embeddings_values,cluster_id,cluster_name
0,<8aa028c40ef7146ed18990fc0c9e7a19@archivistes.be>,"Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","Bonjour,\nJe vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de ...","{'object': 'embedding', 'index': 0, 'embedding': [0.15065436, -0.16268617, 0.099050604, 0.034771...","Bonjour, Je vous remercie pour ce complément d'informations. Nous nous réjouissons d'avance de c...","[0.15065436, -0.16268617, 0.099050604, 0.03477189, -0.003733333, -0.12344437, -0.033047672, 0.08...",4,Informations partagées
1,<CE817F92-BE3D-42A0-A833-A08C3B1E7C4A@free.fr>,"Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","{'object': 'embedding', 'index': 1, 'embedding': [0.14858536, -0.17183514, 0.08667003, 0.0430086...","Bonjour Céline, l’AAF a-t-elle prévu de faire un communiqué de presse sur l’attentat de vendredi...","[0.14858536, -0.17183514, 0.08667003, 0.043008618, 0.0032017394, -0.12493396, -0.0075426535, 0.1...",4,Informations partagées
2,<01ea01d700bc$2453c3c0$6cfb4b40$@archivistes.qc.ca>,"Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","{'object': 'embedding', 'index': 2, 'embedding': [0.14658116, -0.16300967, 0.1005806, 0.03991575...","Bonjour Céline et merci beaucoup !Ce sera un plaisir de vous retrouver, en effet. Bien cordialem...","[0.14658116, -0.16300967, 0.1005806, 0.039915755, -0.03623631, -0.12074596, -0.02459933, 0.10424...",2,Événements archivistiques
3,<014d01d6f97f$b6450df0$22cf29d0$@archivistes.qc.ca>,"Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","{'object': 'embedding', 'index': 3, 'embedding': [0.17201696, -0.16402929, 0.112226546, 0.046063...","Bonjour, C'est avec plaisir que nous vous compterons parmi nous pour le prochain congrès de l'AA...","[0.17201696, -0.16402929, 0.112226546, 0.046063617, -0.018504383, -0.12524809, -0.03254767, 0.09...",2,Événements archivistiques
4,<cf77b68bfb2844369ec522a5aac4205a@archivistes.org>,"Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAsso...","Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAs...","{'object': 'embedding', 'index': 4, 'embedding': [0.151289, -0.16139962, 0.107311085, 0.04889077...","Bonjour Vincent, 11h ou 11h30, cela te conviendrait ? Bien à toi Anne ClercDéléguée GénéraleAsso...","[0.151289, -0.16139962, 0.107311085, 0.04889077, -0.036429998, -0.11085595, -0.013119226, 0.1119...",2,Événements archivistiques
...,...,...,...,...,...,...,...,...
18973,<a09073a0f0334be8bd8a888a9995e15a@nouvelle-aquitaine.fr>,Présente ! Myriam FAVREAU Chargée de mission études et recherches Site de Poitiers Service Patri...,Présente !\nMyriam FAVREAU\nChargée de mission études et recherches\nSite de Poitiers\nService P...,"{'object': 'embedding', 'index': 3, 'embedding': [0.119224735, -0.1908262, 0.16395493, 0.1080550...",Présente ! Myriam FAVREAU Chargée de mission études et recherches Site de Poitiers Service Patri...,"[0.119224735, -0.1908262, 0.16395493, 0.108055055, -0.100029305, -0.08729087, -0.03536176, 0.114...",1,Réunions administratives
18974,<f3f3dd29a5d4433990d42f9b02efea27@mairie-lyon.fr>,"Bonjour, Je serai là. Bonne journée. Louis Faivre d'Arcier Directeur des Archives municipales de...","Bonjour,\nJe serai là.\nBonne journée.\nLouis Faivre d'Arcier\nDirecteur des Archives municipale...","{'object': 'embedding', 'index': 4, 'embedding': [0.124798544, -0.18187873, 0.15561144, 0.112304...","Bonjour, Je serai là. Bonne journée. Louis Faivre d'Arcier Directeur des Archives municipales de...","[0.124798544, -0.18187873, 0.15561144, 0.11230402, -0.09190622, -0.09645757, -0.022378646, 0.118...",1,Réunions administratives
18975,<413958f15c4f468298ba9389ecbdbfbf@archivistes.org>,"Bonjour La jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par ...","Bonjour\nLa jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par...","{'object': 'embedding', 'index': 5, 'embedding': [0.13658817, -0.18525131, 0.15825215, 0.1021218...","Bonjour La jauge pour l’AG est de 70 personnes, aussi pouvez-vous me confirmer au plus vite par ...","[0.13658817, -0.18525131, 0.15825215, 0.10212182, -0.08291965, -0.10004355, -0.026666656, 0.1128...",1,Réunions administratives
18976,<1402ba6a-8a47-2b46-33a8-acb468a11a1e@gmail.com>,Bonjour à toutes,"Bonjour à toutes les\ntrois,\nMalheureusement impossible pour moi, car je serai dans un avion de...","{'object': 'embedding', 'index': 6, 'embedding': [0.11375994, -0.16344212, 0.16088776, 0.0747878...",Bonjour à toutes,"[0.11375994, -0.16344212, 0.16088776, 0.07478782, -0.050877243, -0.105056904, -0.018680988, 0.14...",3,Messages amicaux


In [169]:
tsne_df["cluster"]

0        Cluster 4
1        Cluster 4
2        Cluster 2
3        Cluster 2
4        Cluster 2
           ...    
16193    Cluster 1
16194    Cluster 1
16195    Cluster 1
16196    Cluster 3
16197    Cluster 1
Name: cluster, Length: 16198, dtype: object