In [214]:
import mailparser
from bs4 import BeautifulSoup
from pprint import pprint
import os
from dotenv import load_dotenv
import re

'''
Description: 
I get too many emails. I just want to see headers that seem the most relevant to me and are less likely to be spam. I'll go check if out if I care. 

TODO: 
- Parse out headings from each email
- Rank them by importance/relevance/likelihood to be an ad
- Create output email with headings + which email it came from
- Pull emails from sender
- Send email to me
'''

load_dotenv()
openapi_key = os.environ['OPENAI_KEY']
sender_email = os.environ['SENDER_EMAIL']
sender_password = os.environ['SENDER_PASSWORD']

# Exploring Email

In [149]:
class MyEmail:

    def __init__(self, email): 
        self.email = email
        self.sender = re.findall('^(.*)<', email.headers['From'])[0].strip()
        self.soup = BeautifulSoup(email.text_html[0], 'html.parser')

        self.headings_list = [f"{e.name}:{e.text.strip()}" for e in self.soup.findAll(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'a'])]
        self.headings_text = "\n".join(self.headings_list)
        self.email_text = re.sub(r'\n+', '\n', str(self.soup.text))

    # TODO: Langchain prompt to get top headlines
    def get_top_headlines(self): 
        pass

    def set_headlines(self, headlines): 
        self.headlines = headlines

In [150]:
emails = []
for email_filename in os.listdir('./emails'): 
  with open(f'./emails/{email_filename}', 'rb') as f:
    email_str = f.read()
  mail = mailparser.parse_from_string(email_str.decode())
  my_email = MyEmail(mail)
  emails.append(my_email)


# Prompt Engineering

1. Try just pulling out the important headings from the h tags. Then pull out text and summarize each section
2. Try summarizing the whole document 
3. Follow up prompt to pull out the most interesting topics for a persona

NOTES: 
- Uses subheadings as headings -> can't distinguish nested structure from tags


In [155]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import OpenAI

davinci = OpenAI(model_name='text-davinci-003', openai_api_key=openapi_key, temperature=0.6)


In [156]:
# Pull out the newsletter headlines 

template = """Question: Retrieve the 5 most important and intriguing article headlines from the following newsletter snippets. Avoid headlines for ads or promotions. Ignore headlines that are very vague and uninformative. If no valid headlines are present, truthfully answer no valid headlines found.

{newsletter_headings}

Answer: """
prompt = PromptTemplate(
    template=template,
    input_variables=['newsletter_headings']
)

In [157]:
llm_chain = LLMChain(
    prompt=prompt,
    llm=davinci
)

In [None]:
# for e in emails: 
# DEV: Not able to run this in a loop?
e = emails[5]
print(f"Generating headlines for {e.sender}...")
headlines = llm_chain.run(e.headings_text)
print(headlines)
with open(f'./outputs/v2_headings/{e.sender}.txt', 'w') as f: 
    f.write(headlines)
e.set_headlines(headlines)


# Construct Email Body

Just make it text only for now

In [207]:
response_body = ""

for filename in os.listdir('./outputs/v2_headings/'): 
    with open(f'./outputs/v2_headings/{filename}', 'r') as f: 
        text = f.read()
        if "No valid" in text: 
            continue

        response_body += os.path.splitext(filename)[0] + "\n" +  text.strip() + "\n\n" 

with open('summary/v1/response.txt', 'w') as f: 
    f.write(response_body)

pprint(response_body)

('Morning\n'
 '1. Stocks are down—this is up\n'
 '2. Tour de headlines\n'
 '3. Senators grill Norfolk Southern CEO over derailments\n'
 '4. Spotify wants some scroll capital\n'
 '5. Key performance indicators\n'
 '\n'
 'ByteByteGo\n'
 '1. From 0 to Millions: A Guide to Scaling Your App - Final Part\n'
 '2. Scaling Modern Startup Stack\n'
 '3. Monitoring and Observability\n'
 '4. Scaling the Serverless Database Tier\n'
 '5. Database Sharding\n'
 '\n'
 'TLDR\n'
 '1. Spotify’s new design is part TikTok, part Instagram, and part YouTube (3 '
 'minute read)\n'
 '2. Google’s PaLM-E is a generalist robot brain that takes commands (4 minute '
 'read)\n'
 '3. This geothermal startup showed its wells can be used like a giant '
 'underground battery (11 minute read)\n'
 '4. Researchers develop blood test for anxiety (3 minute read)\n'
 '5. AI Looks Like a Bubble (15 minute read)\n'
 '\n'
 'Tech\n'
 '1. Does the 5G Rollout Mean 3G Shutdowns? \n'
 '2. Companies Ordered More Robots in 2022 \n'
 '3. 

# Send basic email

In [218]:
from send_message import gmail_send_message
gmail_send_message("Newsletter Digest :)", "handarishub@gmail.com", response_body)


Message Id: 186d278c38df7daf


{'id': '186d278c38df7daf',
 'threadId': '186d278c38df7daf',
 'labelIds': ['SENT']}