In [27]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from mailfox.email_interface import EmailHandler
from mailfox.email_interface import EmailLLM
from mailfox.vector import VectorDatabase
from mailfox.vector import FolderCluster

In [28]:
# Read credentials from txt file
with open('credentials.txt') as f:
    lines = f.readlines()
    username = lines[0].strip()
    password = lines[1].strip()

email_handler = EmailHandler(username, password)

In [29]:
# Notifications, Newsletters, Finance, Jobs
flagged_folders = ['Notifications', 'Newsletters', 'Finance', 'Jobs']
folders = email_handler.get_all_folders()
flagged_folders_and_subfolders = [folder for folder in folders if any(flagged_folder in folder for flagged_folder in flagged_folders)]

In [30]:
email_handler.get_subfolders(flagged_folders)

['"Finance"',
 '"Finance/Bills"',
 '"Finance/Credit Cards"',
 '"Finance/Purchases"',
 '"Finance/Statements"',
 '"Jobs"',
 '"Newsletters"',
 '"Notifications"',
 '"Notifications/Messages"',
 '"Notifications/Research"',
 '"Notifications/Safety"',
 '"Notifications/Security"',
 '"Notifications/Shopping"',
 '"Notifications/Shopping/Sales"',
 '"Notifications/Shopping/Shipping"',
 '"Notifications/Social"',
 '"Notifications/Support"',
 '"Notifications/Surveys"']

In [31]:
save_path = "email_db"
db_path = os.path.join(save_path, "chroma_db")
clustering_path = os.path.join(save_path, "clustering.pkl")
vector_db = VectorDatabase(db_path)

In [32]:
if not os.path.exists('all_mail.pkl'):
    print("Fetchin all Mail")
    # all_mail = email_handler.get_mail(filter='unseen', return_dataframe=True)
    # uids = email_handler.get_all_mail_uids()
    # all_mail = email_handler.get_mail(filter='uids', uids=uids[-5000:], return_dataframe=True)
    
    all_mail = email_handler.get_mail(filter='all', folders=flagged_folders_and_subfolders, return_dataframe=True)
    all_mail.to_pickle('all_mail.pkl')
else:
    all_mail = pd.read_pickle('all_mail.pkl')

In [33]:
all_mail.head()

Unnamed: 0,uid,folder,uuid,from,to,subject,date,body,raw_body
0,1,"""Finance""",bef768d82e7b75cc4cd315883a36273943a7bb285fcda9...,Quicken Simplifi <noreply@quicken.com>,alex.k.korte@gmail.com,b'\xf0\x9f\x94\x94',"Wed, 15 May 2024 13:51:50",Categorize your transactions to keep your bill...,Categorize your transactions to keep your bill...
1,1,"""Finance/Bills""",7f871015d78593a12d327b9cdf3f688ff0751129a67bfe...,Conservice <service@conservicemail.com>,"alex.k.korte@gmail.com, jeffrey.korte@gmail.com",Utility Billing Frequently Asked Questions dur...,"Wed, 15 May 2024 12:17:13",\n\n\n\n\n\n\n\n \n \n\n\n \nDear Valued Resid...,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.01 T..."
2,2,"""Finance/Bills""",9f6d8f3e185bbdc6811177ed84e2f818aaa93ec098598b...,Britni Purcell CIS Abroad <bpurcell@cisabroad....,"""alex.k.korte@gmail.com"" <alex.k.korte@gmail.com>",6 DAYS: Final Payment Deadline for May 5th,"Tue, 30 Apr 2024 08:17:19","Hello Alex,\r\n\r\nThis is a friendly reminder...","Hello Alex,\r\n\r\nThis is a friendly reminder..."
3,1,"""Finance/Credit Cards""",291bdd42f4acf7242bd453619e16ef7a7aa020ffa920a2...,Chase <no.reply.alerts@chase.com>,alex.k.korte@gmail.com,You're now sharing data with Quicken,"Sun, 19 May 2024 22:15:48",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,"\r\n<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1..."
4,2,"""Finance/Credit Cards""",cbeeeb6c259fd1bad68815cd7ea2fb72a6c3f08f833b8b...,Chase <no.reply.alerts@chase.com>,alex.k.korte@gmail.com,Your credit card payment is scheduled,"Sun, 19 May 2024 22:12:11",\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nPayment sche...,"<html xmlns=""http://www.w3.org/1999/xhtml"" lan..."


In [7]:
for idx, mail in tqdm(all_mail.iterrows(), desc="Generating Database", total=all_mail.shape[0]):
    embedding = vector_db.embed_email(mail)
    
    vector_db.emails_collection.add(
        ids=[mail['uuid']],
        embeddings=embedding,
        metadatas=dict(mail.drop('uuid'))
    )

Generating Database:   0%|          | 0/221 [00:00<?, ?it/s]

In [39]:
folder_vectors = {}
for folder in flagged_folders_and_subfolders:
    folder_vectors[folder] = np.array(vector_db.emails_collection.get(where={'folder': folder}, include=['embeddings'])['embeddings'])

In [40]:
folder_vectors["\"Finance/Bills\""]

array([[ 6.59638457e-03,  2.97274813e-03,  5.54245897e-04,
        -2.67004296e-02,  1.44617679e-02, -6.16418011e-02,
         1.52548319e-02, -8.20252486e-03,  3.30192223e-02,
         2.54192147e-02, -2.49492761e-04,  4.57124971e-03,
         1.48062790e-02, -1.53143099e-02, -1.31618213e-02,
         9.03726742e-03,  2.80711725e-02, -3.51677872e-02,
        -2.74632759e-02,  2.18292288e-02,  2.16355901e-02,
         2.49656122e-02, -7.97913149e-02,  2.51616240e-02,
         4.40478660e-02,  4.32761908e-02, -1.16827041e-02,
        -1.36506753e-02, -6.50859624e-02, -1.67003665e-02,
         1.67975889e-03, -1.08252345e-02,  7.23657012e-03,
        -7.11737946e-02,  5.17301038e-02,  5.38936183e-02,
         1.92155894e-02,  1.02559878e-02,  6.19021058e-02,
         4.98512900e-03, -1.16240131e-02, -7.72179058e-03,
         3.35731730e-02, -1.18435966e-02, -1.99830569e-02,
         3.44166346e-02, -2.62247343e-02, -6.60912972e-03,
        -5.56213362e-03,  5.13496771e-02,  4.62687295e-0

In [44]:
centroids = {folder: np.mean(vectors, axis=0) for folder, vectors in folder_vectors.items()}
len(centroids), len(folder_vectors.keys())

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


(18, 18)

In [58]:
for i in list(centroids.values()):
    print(type(i))

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
<class 'numpy.float64'>


In [54]:
from sklearn.neighbors import KNeighborsClassifier, NearestNeighbors
nbrs = KNeighborsClassifier(n_neighbors=3, weights='distance').fit(np.array(centroids.values()), range(len(folder_vectors.keys())))

TypeError: float() argument must be a string or a real number, not 'dict_values'

In [41]:
clustering = FolderCluster(folder_vectors)
clustering.save_model(clustering_path)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (18,) + inhomogeneous part.

In [10]:
if not os.path.exists('new_mail.pkl'):
    new_mail = email_handler.get_mail(filter='unseen', return_dataframe=True)
    new_mail.to_pickle('new_mail.pkl')
else:
    new_mail = pd.read_pickle('new_mail.pkl')

In [11]:
import random

# Get a random email from new_mail
random_email = new_mail.sample(n=1)
random_email = random_email.to_dict(orient='records')[0]

embedding = vector_db.embed_email(random_email)
predicted_folder = clustering.predict(embedding)

# Print the email details and predicted folder
print(f"Predicted Folder: {predicted_folder}")
print(f"Email Details:")
for key, value in random_email.items():
    print(f"{key}: {value}")

Predicted Folder: ['"Newsletters"']
Email Details:
uid: 34288
folder: INBOX
uuid: ac9cb39b576da020ac2af9d67f204e0330f2cd92195065685e66be58acd577bf
from: Medium Daily Digest <noreply@medium.com>
to: alex.k.korte@gmail.com
subject: Quantum Mechanics Meets PCA: An (Un)expected Convergence | Rodrigo
 Silva in Towards Data Science
date: Sat, 25 May 2024 05:20:00
body: Stories for Alex K
@alex.k.korte (https://medium.com/@alex.k.korte?source=email-6e8d4e436d47-1716624990035-digest.reader-------------------------d476d917_ea85_4e87_a0d8_f8a323938ac7)
Â·Member (https://medium.com/@alex.k.korte?source=email-6e8d4e436d47-1716624990035-digest.reader-------------------------d476d917_ea85_4e87_a0d8_f8a323938ac7)

Today's highlights

Rodrigo Silva (https://medium.com/@rodrigopesilva?source=email-6e8d4e436d47-1716624990035-digest.reader-7f60cf5620c9-5e04bcb16376----0-102------------------d476d917_ea85_4e87_a0d8_f8a323938ac7-1)
 in Towards Data Science (https://medium.com/towards-data-science?source=em

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
emailLLM = EmailLLM('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')

In [24]:
(0.5/1000000) * (750/1000) * len(random_email['body']) * 300

0.8952749999999999

In [20]:
random_email = new_mail.sample(n=1)
random_email = random_email.to_dict(orient='records')[0]

predicted_folder = emailLLM.predict_folder(random_email, flagged_folders_and_subfolders)

# Print the email details and predicted folder
print(f"Predicted Folder: {predicted_folder}")
print(f"Email Details:")
for key, value in random_email.items():
    if key == 'raw_body':
        continue
    print(f"{key}: {value}")

Predicted Folder: "Notifications/Shopping"
Email Details:
uid: 34148
folder: INBOX
uuid: df05abb41f2b139058ef616a822d8b7aff8a5e0f927660b55965b4e22bd0559e
from: "The North Face" <reply@e.thenorthface.com>
to: <alex.k.korte@gmail.com>
subject: LIGHTRANGE: Lightweight, breathable sun protection.
date: Tue, 21 May 2024 07:15:14
body: The North FaceMeet our newest hiking gear designed to keep you out there
 
  ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌
 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌




 
 
https://click.e.thenorthface.com/?qs=5f793a48910814f8ab22d1ffc6bb451bb5b2e7fb63e1d10b346b6053b59d6cdee85590591e28f6402588ff227da7f9d2de8c6547744247c7a505c207e460b88f 


https://click.e.thenorthface.com/?qs=5f793a48910814f8ea81d430

In [14]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(random_email['body'], 'lxml')
soup.get_text()

"EcoFlow's Survey\n\n\nWe'd love to hear what you think of our customer service.\nPlease take a moment to answer some simple questions by clicking\nthe link below:\n\n\n\nSTART SURVEY\n\n\n\r\nÂ\xa0\r\n\r\nOr copy paste the URL below into your internet browser.\nhttps://nam.dcv.ms/eRHduBKLuQ\n\n\nThis is a system generated email, please do not reply to it. The survey link in this email is unique to its recipient. Please do not forward this email. If you would like to unsubscribe and stop receiving these emails, click unsubscribe.\n\n\n"

In [15]:
random_email['body']

"\n\n\nEcoFlow's Survey\n\n\nWe'd love to hear what you think of our customer service.\nPlease take a moment to answer some simple questions by clicking\nthe link below:\n\n\n\nSTART SURVEY\n\n\n\r\nÂ\xa0\r\n\r\nOr copy paste the URL below into your internet browser.\nhttps://nam.dcv.ms/eRHduBKLuQ\n\n\nThis is a system generated email, please do not reply to it. The survey link in this email is unique to its recipient. Please do not forward this email. If you would like to unsubscribe and stop receiving these emails, click unsubscribe.\n\n\n"

In [18]:
email_handler.move_mail([random_email['uid']], '"Finance/Bills"')

('OK', [None])
Moved 34136 to "Finance/Bills"
