### Adding imports

In [273]:
import os
from bs4 import BeautifulSoup
from supabase import create_client, Client
import mailbox
import quopri
from email.header import decode_header
from email.utils import parsedate_to_datetime
from dotenv import load_dotenv
import pinecone
import pandas as pd
from langchain.document_loaders import JSONLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone
from langchain.text_splitter import CharacterTextSplitter

### Code to read emails from the mailbox

In [49]:

# Function to decode email headers
def decode_email_header(header_value):
    decoded_headers = decode_header(header_value)
    decoded_parts = []
    for part, encoding in decoded_headers:
        if encoding is None:
            if isinstance(part, bytes):
                part = part.decode('utf-8', 'ignore')
            decoded_parts.append(part)
        else:
            part = quopri.decodestring(part).decode(encoding, 'ignore')
            decoded_parts.append(part)
    return ' '.join(decoded_parts)

# Path to the .mbox file
mbox_file_path = 'emails/All mail Including Spam and Trash.mbox'

# Create an mbox mailbox object
mbox = mailbox.mbox(mbox_file_path)

# List to store extracted information
# extracted_data = []

# # Iterate through each email message in the mbox file
# for message in mbox:
#     try:
#         email_data = {}

#         # Extract 'To' field
#         email_data['To'] = decode_email_header(message['To'])

#         # Extract 'From' field
#         email_data['From'] = decode_email_header(message['From'])

#         # Extract 'Subject' field
#         email_data['Subject'] = decode_email_header(message['Subject'])

#         # Extract 'Date' field and convert it to a datetime object
#         email_data['Date'] = parsedate_to_datetime(message['Date'])

#         # Extract email content (body)
#         email_data['Content'] = message.get_payload(decode=True).decode('utf-8', 'ignore')

#         # Append the extracted data to the list
#         extracted_data.append(email_data)
#     except:
#         pass

# # Print the extracted data for the first email (you can loop through the list for all emails)
# if extracted_data:
#     first_email = extracted_data[0]
#     print("To:", first_email['To'])
#     print("From:", first_email['From'])
#     print("Subject:", first_email['Subject'])
#     print("Date:", first_email['Date'])
#     print("Content:", first_email['Content'])


### Reading HTML data

In [50]:
# for e in extracted_data:
#     # Parse the HTML content using BeautifulSoup
#     soup = BeautifulSoup(e['Content'], 'html.parser')

#     # Extract text content from the HTML
#     text_content = soup.get_text().replace('\n',' ')

#     # Print the extracted text content
#     e['Content'] = text_content

### Pushing data to supabase

In [135]:
url: str = os.environ.get("SUPABASE_URL")
key: str = os.environ.get("SUPABASE_KEY")
supabase: Client = create_client(url, key)

In [136]:
ids = []

In [137]:
extracted_data[0]

{'To': 'saarth.shah28@gmail.com',
 'From': 'Webull <statements@mail.webull.us>',
 'Subject': 'Your eStatement is ready to view (2023/08)',
 'Date': datetime.datetime(2023, 9, 6, 16, 42, 17, tzinfo=datetime.timezone.utc),
 'Content': 'Dear SAARTH SHAH,A new statement is available online for account ####E1F6.To view this document, click the link and log into your eDocuments account at www.webull.com/edocs.We recommend you review the documents for accuracy and encourage you to contact us at customerservice@webull-us.com if you have any questions.We appreciate you electing to receive account statements electronically.Regards,Webull Financial LLCcustomerservice@webull-us.comwww.webull.com/edocsAdobe Acrobat Reader is required to view eDocuments, free download http://www.adobe.com/products/acrobat/readstep2.html.Please do not respond to this email. This e-mail system does not have the ability to respond to individual questions or comments.Webull FinancialPrivacy NoticeDisclosuresWebull Techn

In [138]:
extracted_data[0]['Date'].isoformat()

'2023-09-06T16:42:17+00:00'

In [139]:
# for e in extracted_data:
#     data, count = supabase.table('emails').insert({"to": e['To'], "from": e['From'], "subject":e['Subject'],
#         'date':e['Date'].isoformat(), 'content':e['Content']}).execute()

In [140]:
response = supabase.table('emails').select("*").execute()

2023-09-21 19:55:39,875:INFO - HTTP Request: GET https://ydcngzrcwfmvjnuawgjn.supabase.co/rest/v1/emails?select=%2A "HTTP/1.1 200 OK"


In [141]:
response.data[0]

{'id': 9,
 'created_at': '2023-09-11T07:55:42.643213+00:00',
 'subject': 'Your eStatement is ready to view (2023/08)',
 'to': 'saarth.shah28@gmail.com',
 'from': 'Webull <statements@mail.webull.us>',
 'date': '2023-09-06',
 'content': 'Dear SAARTH SHAH,A new statement is available online for account ####E1F6.To view this document, click the link and log into your eDocuments account at www.webull.com/edocs.We recommend you review the documents for accuracy and encourage you to contact us at customerservice@webull-us.com if you have any questions.We appreciate you electing to receive account statements electronically.Regards,Webull Financial LLCcustomerservice@webull-us.comwww.webull.com/edocsAdobe Acrobat Reader is required to view eDocuments, free download http://www.adobe.com/products/acrobat/readstep2.html.Please do not respond to this email. This e-mail system does not have the ability to respond to individual questions or comments.Webull FinancialPrivacy NoticeDisclosuresWebull Tec

### Pushing data in pinecone

In [142]:
# initialize pinecone
pinecone.init(
    api_key=os.getenv("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.getenv("PINECONE_ENV"),  # next to api key in console
)
index_name = "emails"
# First, check if our index already exists. If it doesn't, we create it
if index_name not in pinecone.list_indexes():
    # we create a new index
    pinecone.create_index(
      name=index_name,
      metric='cosine',
      dimension=1536  
)

In [143]:
df = pd.DataFrame(response.data)

In [244]:
df.to_json('emails/emails.json',orient='records', lines=True)

In [232]:
jq_schema = {
    'id': '.id',
    'created_at': '.created_at',
    'subject': '.subject',
    'to': '.to',
    'from': '.from',
    'date': '.date',
    'content': '.content'
}

In [233]:
str(jq_schema)

"{'id': '.id', 'created_at': '.created_at', 'subject': '.subject', 'to': '.to', 'from': '.from', 'date': '.date', 'content': '.content'}"

In [245]:
one_email = df
# df.iloc[[0]].to_json('emails/emails.json',orient='records', lines=True)

In [246]:
def metadata_func(record: dict, metadata: dict) -> dict:
    metadata["subject"] = record.get("subject")
    metadata["to"] = record.get("to")
    metadata["from"] = record.get("from")
    metadata["date"] = record.get("date")
    metadata["created_at"] = record.get("created_at")
    metadata["id"] = record.get("id")
    return metadata

In [247]:
loader = JSONLoader(
    file_path='./emails/emails.json', jq_schema='.', content_key = 'content',
metadata_func=metadata_func,json_lines=True,text_content = False)

data = loader.load()

In [249]:
len(data)

2593

d

In [252]:
data[0].page_content

'Dear SAARTH SHAH,A new statement is available online for account ####E1F6.To view this document, click the link and log into your eDocuments account at www.webull.com/edocs.We recommend you review the documents for accuracy and encourage you to contact us at customerservice@webull-us.com if you have any questions.We appreciate you electing to receive account statements electronically.Regards,Webull Financial LLCcustomerservice@webull-us.comwww.webull.com/edocsAdobe Acrobat Reader is required to view eDocuments, free download http://www.adobe.com/products/acrobat/readstep2.html.Please do not respond to this email. This e-mail system does not have the ability to respond to individual questions or comments.Webull FinancialPrivacy NoticeDisclosuresWebull TechnologiesTerms of ServicePrivacy PolicyData Disclaimer   Webull StoryPricingHelp CenterContact us    Please Login Webull app - Menu - Help Center - Contact Us\r    Financial products and services are offered to self-directed investors 

In [279]:
data[805].page_content

'Dear  SAARTH,Following is/are the list of transactions for your Demat account ending with *02831411Sr. No.Company NameISINQuantityDebit / CreditDate and Time1ADANI POWER LIMITED - EQUITY SHARESINE814H01011      100.000Credit25/08/2022 13:19:24For further queries, please contact your Depository Participant [DP] with whom you hold your demat account.Yours Truly,Central Depository Services (India) LtdDisclaimer: The information contained herein is confidential and is intended solely for the addressee(s). If you have erroneously received this message, please immediately delete it and notify at helpdesk@cdslindia.com. You must not directly or indirectly, use, disclose, distribute, print, or copy any part of this message.Note: This is an electronic message. Please do not reply to this email.'

In [295]:
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

In [296]:
docs = text_splitter.split_documents(data)

In [297]:
len(docs)

2593

In [310]:
docs[0].page_content

'Dear SAARTH SHAH,A new statement is available online for account ####E1F6.To view this document, click the link and log into your eDocuments account at www.webull.com/edocs.We recommend you review the documents for accuracy and encourage you to contact us at customerservice@webull-us.com if you have any questions.We appreciate you electing to receive account statements electronically.Regards,Webull Financial LLCcustomerservice@webull-us.comwww.webull.com/edocsAdobe Acrobat Reader is required to view eDocuments, free download http://www.adobe.com/products/acrobat/readstep2.html.Please do not respond to this email. This e-mail system does not have the ability to respond to individual questions or comments.Webull FinancialPrivacy NoticeDisclosuresWebull TechnologiesTerms of ServicePrivacy PolicyData Disclaimer   Webull StoryPricingHelp CenterContact us    Please Login Webull app - Menu - Help Center - Contact Us\r    Financial products and services are offered to self-directed investors 

In [311]:
data[0].metadata

{'source': '/Users/saarth/Desktop/Dev/LLM Practice/emails/emails.json',
 'seq_num': 1,
 'subject': 'Your eStatement is ready to view (2023/08)',
 'to': 'saarth.shah28@gmail.com',
 'from': 'Webull <statements@mail.webull.us>',
 'date': '2023-09-06',
 'created_at': '2023-09-11T07:55:42.643213+00:00',
 'id': 9}

## Pushing data to pinecone

In [312]:
embeddings = OpenAIEmbeddings(openai_api_key=os.environ['OPENAI_API_KEY'])

In [318]:
embeddings.embed_query(docs[0].page_content)

[-0.019593092925425614,
 -0.007313299227494073,
 -0.02537823947086823,
 -0.03293713450712835,
 -0.016564076243219632,
 0.015226943868785446,
 -0.044616582325601326,
 -0.02086200499210128,
 0.007074525356371618,
 -0.017628326455297192,
 0.02379551082200672,
 -0.0007291127464491108,
 -0.0012510038982846319,
 0.02050725492140876,
 -0.0025071244826325983,
 0.01697340367770602,
 0.019975130746692524,
 -0.02608773961225327,
 -0.01403989691647804,
 0.012348014471351334,
 -0.006197884483089531,
 0.015267876425969577,
 0.001920423293368447,
 0.022472021391537183,
 -0.014913127286599602,
 0.0062081178552162,
 0.008398013855518608,
 -0.019279276343239768,
 -0.01438100218056082,
 -0.023372539512233132,
 -0.021762523112797232,
 -0.004628799709515214,
 -0.02758860314674652,
 -0.01473575225125334,
 -0.0210257352208378,
 0.008705009431383401,
 0.013541883361302338,
 -0.01665958709552018,
 0.02578756690535462,
 -0.007504318138127528,
 0.019620380676000006,
 0.01941571695875681,
 0.012423057648075997,
 

In [324]:
docs[0]

Document(page_content='Dear SAARTH SHAH,A new statement is available online for account ####E1F6.To view this document, click the link and log into your eDocuments account at www.webull.com/edocs.We recommend you review the documents for accuracy and encourage you to contact us at customerservice@webull-us.com if you have any questions.We appreciate you electing to receive account statements electronically.Regards,Webull Financial LLCcustomerservice@webull-us.comwww.webull.com/edocsAdobe Acrobat Reader is required to view eDocuments, free download http://www.adobe.com/products/acrobat/readstep2.html.Please do not respond to this email. This e-mail system does not have the ability to respond to individual questions or comments.Webull FinancialPrivacy NoticeDisclosuresWebull TechnologiesTerms of ServicePrivacy PolicyData Disclaimer   Webull StoryPricingHelp CenterContact us    Please Login Webull app - Menu - Help Center - Contact Us\r    Financial products and services are offered to se

In [345]:
# vectors = []

# for d in docs:
#     vectors.append({
#         'id':d.metadata['subject'][:20]+d.metadata['created_at'],
#         'values':embeddings.embed_query(d.page_content),
#         'metadata':d.metadata
#     })

In [346]:
len(vectors)

2593

In [347]:
index = pinecone.Index('emails')

In [338]:
upsert_response

{'upserted_count': 10}

In [358]:
import re
def remove_non_ascii(text):
    # Use a regular expression to remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text

In [359]:
for v in vectors:
    v['id'] = remove_non_ascii(v['id'])

In [361]:
# for i in range(0,len(vectors),100):
#     print(i,i+100)
#     upsert_response = index.upsert(vectors=vectors[i:i+100])
#     print(upsert_response)

In [351]:
len(vectors)

2593

In [343]:
# delete_response = index.delete(ids=[v['id'] for v in vectors])

In [344]:
# delete_response

{}