In [8]:
import os
import uuid
import chromadb
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from mailfox.vector import VectorDatabase
from mailfox.vector import FolderCluster

In [9]:
vdb = VectorDatabase(os.path.expanduser("~/.mailfox/data/email_db"))

In [15]:
vdb.emails_collection.get("0043c043d225da34424c38612ba0118cf5b98187dff5e13e52d6d2b228c0fbd0")

{'ids': ['0043c043d225da34424c38612ba0118cf5b98187dff5e13e52d6d2b228c0fbd0'],
 'embeddings': None,
 'metadatas': [{'body': '[image: Google]\r\nA new sign-in on Apple iPhone\r\n\r\n\r\nalex.k.korte@gmail.com\r\nWe noticed a new sign-in to your Google Account on a Apple iPhone device.\r\nIf this was you, you donâ\x80\x99t need to do anything. If not, weâ\x80\x99ll help you\r\nsecure your account.\r\nCheck activity\r\n\r\nYou can also see security activity at\r\nhttps://myaccount.google.com/notifications\r\nYou received this email to let you know about important changes to your\r\nGoogle Account and services.\r\nÂ© 2024 Google LLC, 1600 Amphitheatre Parkway, Mountain View, CA 94043, USA\r\n',
   'date': 'Thu, 13 Jun 2024 12:46:14',
   'folder': 'inbox',
   'from': 'Google <no-reply@accounts.google.com>',
   'raw_body': '[image: Google]\r\nA new sign-in on Apple iPhone\r\n\r\n\r\nalex.k.korte@gmail.com\r\nWe noticed a new sign-in to your Google Account on a Apple iPhone device.\r\nIf this 

In [18]:
vdb.emails_collection.update(['0043c043d225da34424c38612ba0118cf5b98187dff5e13e52d6d2b228c0fbd0', '00e5810f08cbaecbdf92f5035bee4d8e3af7696b7f7ced4d4fd5f07606cc4e37'], metadatas=[{"folder": "inbox"}, {"folder": "inbox"}])

In [16]:
vdb.emails_collection.peek(limit=10)

{'ids': ['0043c043d225da34424c38612ba0118cf5b98187dff5e13e52d6d2b228c0fbd0',
  '0049817bf5a58af5a1e4b60ac22b9407df1df0762abce87d6d722894094d9c9c',
  '004c0ef9939732092025216afa2995c6e5f27a9127f7ae821ed4e7551a893567',
  '0071d7ddd974fcdae2c648a4cc2ae8215af7113be9a1b310a5023d44abaadd63',
  '00918e785645699a1f89f6c2269f2bc2b2a7ddbd420e9a52e15f42a2c873f4ec',
  '00cfaed1ada418bb1d3dae45e038427543933ea59333596ac07f0d7463e6ed0d',
  '00e5810f08cbaecbdf92f5035bee4d8e3af7696b7f7ced4d4fd5f07606cc4e37',
  '00f7a0911c4b085e7683240963886fcb66fb3ed17f25a16de35c335d1f62f2c4',
  '0143f0b04850ea585bc619cac840cc3549febbb1c8199faed6ddf7a41831f3a8',
  '0146a9aedf18aa64a6c5a2029e2495027c7d47735a3336a198a06ccbbd3de262'],
 'embeddings': [[-0.07329989224672318,
   -0.04748128354549408,
   0.10577463358640671,
   -0.06297947466373444,
   0.0077379546128213406,
   -0.04300383850932121,
   0.07775163650512695,
   -0.06402145326137543,
   0.02497420459985733,
   0.012795300222933292,
   0.006789190694689751,
   -0

In [2]:

# Load the emails
df = pd.read_parquet('emails.parquet')
# create a new column 'ids' and fill it with random uuids
df['id'] = [str(uuid.uuid4()) for _ in range(len(df))]

FileNotFoundError: [Errno 2] No such file or directory: 'emails.parquet'

In [4]:
for i, row in tqdm(df.iterrows(), total=500):
    embeddings = vdb.default_ef([row['Body']])
    vdb.emails_collection.add(
        embeddings=embeddings,
        # documents=[row['Body']],
        ids=[row['uuid']],
    )
    
    if i > 500:
        break

  0%|          | 0/500 [00:00<?, ?it/s]

501it [00:39, 12.83it/s]                         


In [5]:
data = vdb.get_all_embeddings(vdb.emails_collection)
data['embeddings'].shape

(502, 384)

In [6]:
clustering = Clustering(data=data['embeddings'])

In [7]:
clustering.fit()

In [8]:
clustering.clusterer.centroids_.shape

(27, 101)

In [9]:
from sklearn.metrics import pairwise_distances
import numpy as np

# calculate pairwise distances
distances = pairwise_distances(clustering.data)

# calculate average distance
average_distance = np.mean(distances)

average_distance

1.2214278440376731

In [10]:
clustering.clusterer.labels_

array([-1,  0,  4,  9,  5, 22,  1, 25, 26, -1, -1, 10,  0, 23,  0,  2, 21,
       23, -1, -1, -1, -1,  0, -1,  5,  9, 18, 21,  0, 18, -1,  0, 21,  4,
       -1,  4, 19, 26, 26, 25, 21,  4, 10, -1, 24,  6,  4,  8,  0,  9,  5,
       10,  0,  8,  5, -1,  7,  0,  1, 25, 13,  1, -1, 25,  0, -1, 18, -1,
        0, 19,  0,  0, 10,  0, 16, -1, 14, 24, 18, 16, -1,  2, -1,  4, -1,
       20,  0, -1,  7,  0,  5, 25,  2, -1,  0, 24,  6, 10, 23, 15, -1, -1,
       -1,  2, -1, 10,  0, 26, -1, -1,  0,  3,  2, -1,  0, 12,  1,  3,  0,
       25, -1,  0,  0, 10, 22, -1, -1, -1, 26,  9, 10, -1, -1, -1,  0, -1,
        0, -1,  9, -1,  0,  0, 10, 11, 26, -1,  9,  9, 24, -1, 25,  2, -1,
        0, 19,  8, -1, -1, 24, -1, 20, 18, 18,  4, -1,  8,  0, 25,  2, 22,
       26, 17, 10, 15,  1, -1, 25,  1, -1, 25,  0, 25, 22, 10, -1,  0,  6,
        9, 18,  5, 26, -1,  5,  0, 14,  3, 10, -1,  0,  0, 18, 23, 25, 18,
        8,  4,  9,  0, 11, 12, -1,  6,  4,  9, -1, -1, 23, 25, -1,  8,  9,
       -1,  2, 13, 16,  0

In [10]:
pairwise_distances(clustering.pca.transform(embed), clustering.clusterer.centroids_)

NameError: name 'embed' is not defined

In [17]:
embed = vdb.embed([df.loc[df['id'] == data['ids'][1]]['Body'].values[0]])
# embed = vdb.embed([df.iloc[0]['Body']])
# embed = [data['embeddings'][0]]
clustering.predict(embed, threshold=10)

[[1.05367121e-08]] [[0]]


0

In [15]:
pairwise_distances(vdb.embed([df.loc[df['id'] == data['ids'][0]]['Body'].values[0]]), [data['embeddings'][0]])

array([[2.98023224e-08]])

In [6]:
collection_query = emails_collection.query(query_texts=["I love to eat tickets"], include=['embeddings'])

In [26]:
collection_query = emails_collection.query(query_texts=["trees"])
df.loc[df['id'] == collection_query['ids'][0][0]]

Unnamed: 0,Date,From,To,Subject,Body,id
39,"Wed, 04 Oct 2023 09:06:13","""The North Face"" <reply@e.thenorthface.com>",<alex.k.korte@gmail.com>,Explore new terrain with Summit Series,The North FaceGear built to reach your full po...,341bacd6-a51e-4170-9ef5-14923fc6ab21


In [31]:
len(emails_collection.get(include=['embeddings'])['ids'])

502