In [None]:
# @misc{email-tuned-bge-m3,
#   author = {doubleyyh},
#   title = {Email-tuned BGE-M3: Fine-tuned Embedding Model for Email Content},
#   year = {2024},
#   publisher = {HuggingFace}
# }

In [None]:
import numpy as np
import pandas as pd
import torch
import tqdm
from sentence_transformers import SentenceTransformer

from src.data.email_analyzer import EmailAnalyzer

In [None]:
### Function tests: input locations used by the cells below.

# Earlier mbox inputs, kept commented out for reference.
# mbox_single_file_path = 'data/processed/mailbox_cecile/AG.mbox'
# mbox_path = 'data/processed/mailbox_cecile/'
# Sample archive folder. NOTE(review): not referenced by any cell visible in this notebook.
TEST_SAMPLE_PATH = 'data/processed/celine_readpst_with_S/celine.guyon/Archive'
# Database file handed to EmailAnalyzer in the next cell.
db_path = 'data/Projects/database copy.duckdb'

In [None]:
# Open the database at db_path through the project's EmailAnalyzer.
analyzer =EmailAnalyzer(db_path)
# NOTE(review): the result is discarded because this is not the cell's last expression.
analyzer.get_email_summary()

# Presumably returns a pandas DataFrame; later cells read its 'body' column
# and, when present, its 'subject' column.
df_db_cleaned = analyzer.export_to_dataframe()
# NOTE(review): this displays the whole frame; df_db_cleaned.head() would keep the output small.
df_db_cleaned

CatalogException: Catalog Error: Table with name receiver_emails does not exist!
Did you mean "sqlite_schema"?

LINE 1: SELECT COUNT(*) FROM receiver_emails
                             ^

In [None]:
# Embed every email body with the email-tuned BGE-M3 model, on GPU when one
# is available and on CPU otherwise.
# `torch`, `SentenceTransformer` and `np` come from the notebook's import cell.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer('doubleyyh/mixed-bge-m3-email', device=device)

# Deliberately small batches to keep GPU memory use low.
batch_size = 4

# Missing bodies become empty strings so the tokenizer never receives None/NaN.
email_bodies = df_db_cleaned['body'].fillna('').astype(str).tolist()

# One embedding per email, in the same order as the rows of df_db_cleaned.
embeddings = []

for i in range(0, len(email_bodies), batch_size):
    batch = email_bodies[i:i + batch_size]
    batch_embeddings = model.encode(batch)
    embeddings.extend(batch_embeddings)

    # Release cached GPU memory between batches.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Report progress every fifth batch.
    if i % (batch_size * 5) == 0:
        print(f"Processed {i}/{len(df_db_cleaned)} emails")


# Save embeddings as numpy array
embeddings_array = np.array(embeddings)
np.save('email_embeddings.npy', embeddings_array)

# Keep each email's vector next to its row.
df_db_cleaned['embedding'] = list(embeddings_array)

# Pickle (rather than CSV) so the per-row embedding arrays survive the round trip.
df_db_cleaned.to_pickle('emails_with_embeddings.pkl')

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Processed 0/10 emails


In [None]:
## Similar emails

In [None]:
from scipy.spatial.distance import cosine

def search_similar_emails(query_text, top_n=5):
    """Return the emails whose embeddings are most similar to ``query_text``.

    Relies on notebook globals: ``model`` (encodes the query), ``embeddings``
    (one vector per row of ``df_db_cleaned``, in row order) and
    ``df_db_cleaned`` itself. Several later cells rebind ``embeddings`` to
    other objects, so re-run the embedding cell before calling this.

    Args:
        query_text: Text to search for.
        top_n: Maximum number of matches to return.

    Returns:
        A list of dicts, best match first, with the keys ``index`` (row
        position), ``similarity`` (cosine similarity), ``subject`` (``"N/A"``
        when the frame has no ``subject`` column) and ``email_snippet`` (the
        first 200 characters of the body followed by ``"..."``).
    """
    # Encode the query
    query_vector = np.asarray(model.encode(query_text)).ravel()

    if len(embeddings) == 0:
        return []

    email_matrix = np.asarray(embeddings)

    # Cosine similarity against every email at once instead of one scipy
    # call per row.
    dots = email_matrix @ query_vector
    norms = np.linalg.norm(email_matrix, axis=1) * np.linalg.norm(query_vector)
    with np.errstate(divide='ignore', invalid='ignore'):
        similarities = dots / norms
    # A zero-length vector has no direction; score it 0 rather than NaN so it
    # cannot disturb the ranking.
    similarities = np.where(norms > 0, similarities, 0.0)
    similarities = np.clip(similarities, -1.0, 1.0)

    # Stable sort keeps the original row order among equal scores.
    ranked = np.argsort(-similarities, kind='stable')[:top_n]

    has_subject = 'subject' in df_db_cleaned.columns
    results = []
    for idx in ranked:
        idx = int(idx)
        body = df_db_cleaned['body'].iloc[idx]
        # Missing (None/NaN) bodies yield an empty snippet instead of raising.
        body_text = body if isinstance(body, str) else ''
        results.append({
            'index': idx,
            'similarity': similarities[idx],
            'subject': df_db_cleaned['subject'].iloc[idx] if has_subject else "N/A",
            'email_snippet': body_text[:200] + "..."  # First 200 chars
        })

    return results

# Example usage: rank the embedded emails against a free-text query.
similar_emails = search_similar_emails("Question about project timeline")

In [None]:
similar_emails

[{'index': 7,
  'similarity': np.float32(0.39083397),
  'subject': 'RE: Table-ronde Congr√®s AAQ 2021 : Archives de la quarantaine',
  'email_snippet': '<meta http-equiv="Content-Type" content="text/html; charset=Windows-1252">\n<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsof...'},
 {'index': 0,
  'similarity': np.float32(0.3855061),
  'subject': 'RE: Facture MIC - facilitation Groupe de travail Relance du plaidoyer',
  'email_snippet': '<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:m...'},
 {'index': 9,
  'similarity': np.float32(0.3804798),
  'subject': 'RE: Facture MIC - facilitation Groupe de travail Relance du plaidoyer',
  'email_snippet': '<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schem

In [None]:
# Clustering

In [None]:
from sklearn.metrics.pairwise import cosine_distances

# Example usage of cosine distance
# NOTE(review): embedding1 and embedding2 are not defined by any cell visible
# in this notebook; bind them (for example to two rows of embeddings_array)
# before running this cell.
distance = cosine_distances([embedding1], [embedding2])


ValueError: All ufuncs must have type `numpy.ufunc`. Received (<ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>)

In [None]:
# Simple manual clustering using distance thresholds
import numpy as np
from scipy.spatial.distance import cosine

def simple_clustering(embeddings, threshold=0.3, max_clusters=20):
    """
    Greedy single-pass clustering based on cosine similarity.

    The first embedding seeds cluster 0. Every following embedding is compared
    with the centroid of each existing cluster. It joins the most similar
    cluster when that similarity exceeds ``threshold`` or when no further
    cluster may be created; otherwise it starts a new cluster.

    Args:
        embeddings: Sequence or 2-D array of equal-length numeric vectors.
        threshold: Cosine similarity an embedding must exceed to join an
            existing cluster.
        max_clusters: Maximum number of clusters. One cluster is always
            created for non-empty input.

    Returns:
        Integer numpy array holding one cluster id per embedding, in input
        order. Empty input yields an empty array.
    """
    n_items = len(embeddings)
    labels = np.zeros(n_items, dtype=int)
    if n_items == 0:
        return labels

    vectors = np.asarray(embeddings, dtype=float)
    vector_norms = np.linalg.norm(vectors, axis=1)

    # Cosine similarity ignores scale, so the running sum of a cluster's
    # members gives the same similarity as their mean. Keeping sums avoids
    # re-averaging every member on each update.
    cluster_sums = [vectors[0].copy()]

    for i in range(1, n_items):
        sums = np.stack(cluster_sums)
        dots = sums @ vectors[i]
        denom = np.linalg.norm(sums, axis=1) * vector_norms[i]
        with np.errstate(divide='ignore', invalid='ignore'):
            # A zero-length vector has no direction; treat it as similarity 0.
            similarities = np.where(denom > 0, dots / denom, 0.0)

        # argmax returns the first maximum, i.e. the earliest best cluster.
        best_cluster = int(np.argmax(similarities))

        if similarities[best_cluster] > threshold or len(cluster_sums) >= max_clusters:
            # Similar enough, or no room for a new cluster: join the closest.
            cluster_sums[best_cluster] += vectors[i]
            labels[i] = best_cluster
        else:
            # Start a new cluster seeded with this embedding.
            cluster_sums.append(vectors[i].copy())
            labels[i] = len(cluster_sums) - 1

    return labels

# Run simple clustering
# Assumes `embeddings` holds one vector per row of df_db_cleaned, in row order - TODO confirm.
cluster_labels = simple_clustering(embeddings, threshold=0.5, max_clusters=15)
# Store each email's cluster id on the frame.
df_db_cleaned['cluster'] = cluster_labels

ValueError: All ufuncs must have type `numpy.ufunc`. Received (<ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>)

In [None]:
# Inspect the raw embeddings before clustering.
print(f"Type of embeddings[0]: {type(embeddings[0])}")
print(f"Length of embeddings: {len(embeddings)}")
print(f"Shape of first embedding: {np.array(embeddings[0]).shape}")

# Stack the per-email vectors into one (num_samples, embedding_dim) float64
# matrix in a single vectorized step.
embedding_dim = len(embeddings[0])
num_samples = len(embeddings)
embeddings_2d = np.asarray(embeddings, dtype=np.float64).reshape(num_samples, embedding_dim)

print(f"Shape of embeddings_2d: {embeddings_2d.shape}")

# Try a simpler clustering approach with fewer dependencies
from sklearn.cluster import MiniBatchKMeans

# k-means cannot fit more clusters than there are samples, so cap the count
# for small test samples.
n_clusters = min(10, num_samples)
mbk = MiniBatchKMeans(n_clusters=n_clusters, batch_size=100, random_state=42)
cluster_labels = mbk.fit_predict(embeddings_2d)

# Add cluster labels to dataframe
df_db_cleaned['cluster'] = cluster_labels

NameError: name 'embeddings' is not defined

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

# Standardize each embedding dimension with StandardScaler.
scaler = StandardScaler()
scaled_embeddings = scaler.fit_transform(embeddings_array)

# Use DBSCAN for clustering
# NOTE(review): eps=0.5 is a Euclidean radius in the standardized space. For
# high-dimensional embeddings it may be far too small and leave every email
# unclustered - confirm, or consider metric='cosine' on the raw embeddings.
dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust these parameters as needed
cluster_labels = dbscan.fit_predict(scaled_embeddings)

# Store the DBSCAN label of each email on the frame.
df_db_cleaned['cluster'] = cluster_labels

ValueError: All ufuncs must have type `numpy.ufunc`. Received (<ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>)

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Try hierarchical clustering instead
# Defined here so this cell does not depend on a later cell having been run
# first; 10 is the value the k-means cells in this notebook use.
num_clusters = 10
agg_clustering = AgglomerativeClustering(n_clusters=num_clusters)
cluster_labels = agg_clustering.fit_predict(embeddings_array)

# Store each email's cluster id on the frame.
df_db_cleaned['cluster'] = cluster_labels

ValueError: All ufuncs must have type `numpy.ufunc`. Received (<ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>)

In [None]:
from sklearn.cluster import KMeans
import numpy as np

# Build the matrix that k-means consumes from the per-email embeddings.
embeddings_array = np.array(embeddings)

# Report the shape of the initial conversion.
print(f"Embeddings shape: {embeddings_array.shape}")

# When the embeddings arrive as plain Python lists, stack them row by row.
first_item_is_list = isinstance(embeddings[0], list)
if first_item_is_list:
    embeddings_array = np.vstack(embeddings)

# k-means is run on float32 data.
embeddings_array = embeddings_array.astype(np.float32)

# Fit k-means and label every email.
num_clusters = 10  # Adjust based on your needs
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(embeddings_array)

# Store each email's cluster id on the frame.
df_db_cleaned['cluster'] = cluster_labels

# Summarize every cluster: its size plus up to three sample subjects, or the
# first 50 characters of up to three bodies when there is no subject column.
has_subject = 'subject' in df_db_cleaned.columns
for cluster_id in range(num_clusters):
    in_cluster = df_db_cleaned['cluster'] == cluster_id
    cluster_emails = df_db_cleaned.loc[in_cluster]
    print(f"Cluster {cluster_id}: {len(cluster_emails)} emails")
    if has_subject:
        print(cluster_emails['subject'].head(3).tolist())
        continue
    for body in cluster_emails['body'].head(3):
        print(body[:50] + "...")

ImportError: scipy.special._ufuncs_cxx does not export expected C variable _export_expit

In [None]:
from sklearn.cluster import KMeans

# Determine optimal number of clusters (simplified approach)
# For production, you might want to use methods like elbow method or silhouette analysis
num_clusters = 10  # Adjust based on your needs

# Perform clustering
# n_init is set explicitly, matching the other k-means cell in this notebook,
# so the result does not depend on the scikit-learn version's default.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(embeddings_array)

# Add cluster labels to dataframe
df_db_cleaned['cluster'] = cluster_labels

# Analyze clusters: size plus a few samples to understand each cluster's theme.
for cluster_id in range(num_clusters):
    cluster_emails = df_db_cleaned[df_db_cleaned['cluster'] == cluster_id]
    print(f"Cluster {cluster_id}: {len(cluster_emails)} emails")
    if 'subject' in df_db_cleaned.columns:
        print(cluster_emails['subject'].head(3).tolist())
    else:
        # No subject column: fall back to the first characters of each body,
        # as the other k-means cell does.
        for body in cluster_emails['body'].head(3):
            print(str(body)[:50] + "...")

ImportError: scipy.special._ufuncs_cxx does not export expected C variable _export_expit

In [None]:
# Dimensionality reduction for visualization

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Reduce dimensions to 2D for visualization
# t-SNE requires perplexity < n_samples. The library default (30) fails on a
# small sample, so clamp it to the data size. At least two emails are needed.
n_points = len(embeddings_array)
perplexity = min(30, max(1, n_points - 1))
tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
embeddings_2d = tsne.fit_transform(embeddings_array)

# Plot each email as a point coloured by its cluster label.
fig, ax = plt.subplots(figsize=(12, 10))
scatter = ax.scatter(embeddings_2d[:, 0], embeddings_2d[:, 1], c=cluster_labels, cmap='viridis', alpha=0.5)
fig.colorbar(scatter, ax=ax, label='Cluster')
ax.set_title('Email Clusters Visualization')
ax.set_xlabel('t-SNE dimension 1')
ax.set_ylabel('t-SNE dimension 2')
fig.tight_layout()
fig.savefig('email_clusters.png')
plt.show()

ImportError: scipy.special._ufuncs_cxx does not export expected C variable _export_expit

In [None]:
# Topic modeling based on clusters

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def get_top_keywords(cluster_id, n_terms=5):
    """Return the most frequent terms in the email bodies of one cluster.

    Reads the notebook-global ``df_db_cleaned`` (``'cluster'`` and ``'body'``
    columns) and counts terms with ``CountVectorizer``.

    NOTE(review): the stop-word list is English while the recorded sample
    subjects are French, and the recorded snippets show raw HTML bodies, so
    markup tokens may dominate the counts - confirm whether bodies should be
    cleaned first.

    Args:
        cluster_id: Cluster label to summarize.
        n_terms: Maximum number of terms to return.

    Returns:
        List of terms ordered from most to least frequent. Empty when the
        cluster has no text bodies or no terms survive stop-word removal.
    """
    # Get all emails in this cluster, skipping missing (non-string) bodies.
    in_cluster = df_db_cleaned['cluster'] == cluster_id
    cluster_emails = [
        body
        for body in df_db_cleaned.loc[in_cluster, 'body'].tolist()
        if isinstance(body, str)
    ]
    if not cluster_emails:
        return []

    # Extract keywords
    vectorizer = CountVectorizer(stop_words='english', max_features=500)
    try:
        X = vectorizer.fit_transform(cluster_emails)
    except ValueError:
        # CountVectorizer raises when the vocabulary ends up empty, e.g. when
        # the documents contain only stop words.
        return []

    # X.sum(axis=0) is a 1 x vocabulary matrix. Flatten it so argsort and
    # slicing operate on terms rather than on matrix rows.
    term_counts = np.asarray(X.sum(axis=0)).ravel()
    top_indices = term_counts.argsort()[::-1][:n_terms]

    features = vectorizer.get_feature_names_out()
    top_terms = [features[i] for i in top_indices]

    return top_terms

# Get topics for each cluster
# Iterates ids 0..num_clusters-1; num_clusters must already be set by an earlier cell.
for cluster_id in range(num_clusters):
    keywords = get_top_keywords(cluster_id)
    # Print the cluster's keywords as one comma-separated line.
    print(f"Cluster {cluster_id} keywords: {', '.join(keywords)}")

ValueError: All ufuncs must have type `numpy.ufunc`. Received (<ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>, <ufunc 'sph_legendre_p'>)

In [None]:
# CPU fallback: seems valid but is slow at computing embeddings; use it only if the GPU cell above fails.

In [None]:
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
import torch

# Force CPU inference so this run cannot exhaust GPU memory.
device = 'cpu'
model = SentenceTransformer('doubleyyh/mixed-bge-m3-email', device=device)

# Number of emails handed to each model.encode call.
batch_size = 16

# One embedding per email, in the row order of df_db_cleaned.
embeddings = []

# NOTE(review): the recorded run was interrupted after several minutes inside
# the first batch; if this is too slow, consider shortening the inputs.
total_emails = len(df_db_cleaned)
for i in range(0, total_emails, batch_size):
    batch = df_db_cleaned['body'].iloc[i:i + batch_size].tolist()
    # The library's progress bar is shown for the first batch only.
    batch_embeddings = model.encode(batch, show_progress_bar=(i == 0))
    embeddings.extend(batch_embeddings)

    # i is always a multiple of batch_size, so progress is reported after
    # every batch.
    print(f"Processed {i}/{total_emails} emails")

    # Release cached CUDA memory, in case any GPU memory is in use.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Stack the vectors into a single numpy array.
embeddings_array = np.array(embeddings)

# Persist the raw embeddings before attaching them to the frame.
np.save('email_embeddings.npy', embeddings_array)

# Keep each email's vector next to its row.
df_db_cleaned['embedding'] = list(embeddings_array)

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.
Batches:   0%|          | 0/1 [04:26<?, ?it/s]


KeyboardInterrupt: 

In [None]:
from sentence_transformers import SentenceTransformer

# Load the email-tuned BGE-M3 model without an explicit device argument.
model = SentenceTransformer("doubleyyh/mixed-bge-m3-email")

# Four toy sentences used as a smoke test for the model.
sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]
# NOTE(review): this rebinds the notebook-global `embeddings` that
# search_similar_emails and the clustering cells read.
embeddings = model.encode(sentences)

# Pairwise similarity between all sentence embeddings; the recorded output
# shape is (4, 4).
similarities = model.similarity(embeddings, embeddings)
print(similarities.shape)

Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


torch.Size([4, 4])


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Initialize the embedding model
# "doubleyyh/email-tuned-bge-m3" cannot be resolved on the Hugging Face Hub
# (OSError in the recorded run); "doubleyyh/mixed-bge-m3-email" is the id that
# loads elsewhere in this notebook.
# Use the GPU only when one is available instead of hard-coding 'cuda'.
# NOTE: this rebinds the notebook-global `embeddings` used by earlier cells.
embeddings = HuggingFaceEmbeddings(
    model_name="doubleyyh/mixed-bge-m3-email",
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Example emails (one Korean, one English). The Korean literals were
# mis-decoded UTF-8 and have been restored.
emails = [
    {
        "subject": "회의 일정 변경 안내",
        "from": [["김철수", "kim@company.com"]],
        "to": [["이영희", "lee@company.com"]],
        "cc": [["박지원", "park@company.com"]],
        "date": "2024-03-26T10:00:00",
        "text_body": "안녕하세요, 내일 예정된 프로젝트 미팅을 오후 2시로 변경하고자 합니다."
    },
    {
        "subject": "Project Timeline Update",
        "from": [["John Smith", "john@company.com"]],
        "to": [["Team", "team@company.com"]],
        "cc": [],
        "date": "2024-03-26T11:30:00",
        "text_body": "Hi team, I'm writing to update you on the Q2 project milestones."
    }
]

# Format emails into documents: one "key: value" line per email field.
docs = []
for email in emails:
    content = "\n".join([f"{k}: {v}" for k, v in email.items()])
    docs.append(Document(page_content=content))

# Create FAISS index
db = FAISS.from_documents(docs, embeddings)

# Query examples (supports both Korean and English)
queries = [
    "회의 시간이 언제로 변경되었나요?",
    "When is the meeting rescheduled?",
    "프로젝트 일정",
    "Q2 milestones"
]

# Perform similarity search: print the best match for each query.
for query in queries:
    print(f"\nQuery: {query}")
    results = db.similarity_search(query, k=1)
    print(f"Most relevant email:\n{results[0].page_content[:200]}...")


No sentence-transformers model found with name doubleyyh/email-tuned-bge-m3. Creating a new one with mean pooling.


OSError: doubleyyh/email-tuned-bge-m3 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from sentence_transformers import SentenceTransformer

# Load the email-tuned BGE-M3 model.
model = SentenceTransformer('doubleyyh/mixed-bge-m3-email')

# Toy sentences for a quick look at the model's output.
sentences = [
    "That is a happy person",
    "That is a happy dog",
    "That is a very happy person",
    "Today is a sunny day"
]

# One embedding per sentence, in input order.
embeddings = model.encode(sentences)

# Show each sentence next to the first five values of its embedding.
for position, sentence in enumerate(sentences):
    embedding = embeddings[position]
    print(f"Sentence: {sentence}")
    print(f"Embedding: {embedding[:5]}...")  # Displaying first 5 dimensions for brevity
    print()


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


Sentence: That is a happy person
Embedding: [-0.00503     0.01627533 -0.06255302 -0.02993822 -0.00472386]...

Sentence: That is a happy dog
Embedding: [-0.02717782 -0.00619759 -0.05569847 -0.01241397 -0.0169634 ]...

Sentence: That is a very happy person
Embedding: [-0.006148    0.02535948 -0.06109168 -0.01726168 -0.00971865]...

Sentence: Today is a sunny day
Embedding: [-0.01058755  0.03318915 -0.06337968 -0.01158467 -0.02984716]...



In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

# Initialize the embedding model with the correct model identifier
# Use the GPU only when one is available instead of hard-coding 'cuda'.
# NOTE: this rebinds the notebook-global `embeddings` used by earlier cells.
embeddings = HuggingFaceEmbeddings(
    model_name="doubleyyh/mixed-bge-m3-email",  # Use this model name instead of the incorrect one
    model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
    encode_kwargs={'normalize_embeddings': True}
)

# Example emails (one Korean, one English). The Korean literals were
# mis-decoded UTF-8 and have been restored.
emails = [
    {
        "subject": "회의 일정 변경 안내",
        "from": [["김철수", "kim@company.com"]],
        "to": [["이영희", "lee@company.com"]],
        "cc": [["박지원", "park@company.com"]],
        "date": "2024-03-26T10:00:00",
        "text_body": "안녕하세요, 내일 예정된 프로젝트 미팅을 오후 2시로 변경하고자 합니다."
    },
    {
        "subject": "Project Timeline Update",
        "from": [["John Smith", "john@company.com"]],
        "to": [["Team", "team@company.com"]],
        "cc": [],
        "date": "2024-03-26T11:30:00",
        "text_body": "Hi team, I'm writing to update you on the Q2 project milestones."
    }
]

# Format emails into documents: one "key: value" line per email field.
docs = []
for email in emails:
    content = "\n".join([f"{k}: {v}" for k, v in email.items()])
    docs.append(Document(page_content=content))

# Create FAISS index
# NOTE(review): the recorded run failed here with "input not a numpy array";
# this may be an environment/version mismatch rather than a problem in this
# cell - confirm.
db = FAISS.from_documents(docs, embeddings)

# Query examples (supports both Korean and English)
queries = [
    "회의 시간이 언제로 변경되었나요?",
    "When is the meeting rescheduled?",
    "프로젝트 일정",
    "Q2 milestones"
]

# Perform similarity search: print the best match for each query.
for query in queries:
    print(f"\nQuery: {query}")
    results = db.similarity_search(query, k=1)
    print(f"Most relevant email:\n{results[0].page_content[:200]}...")


Invalid model-index. Not loading eval results into CardData.
Invalid model-index. Not loading eval results into CardData.


ValueError: input not a numpy array