In [77]:
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
import pickle
import os.path
import base64
import email
from bs4 import BeautifulSoup
from datetime import datetime, timedelta, timezone

In [78]:
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']

In [80]:
def getEmails():
    # Variable creds will store the user access token.
    # If no valid token found, we will create one.
    creds = None

    # The file token.pickle contains the user access token.
    # Check if it exists
    if os.path.exists('token.pickle'):

        # Read the token from the file and store it in the variable creds
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)

    # If credentials are not available or are invalid, ask the user to log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)

        # Save the access token in token.pickle file for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    # Connect to the Gmail API
    service = build('gmail', 'v1', credentials=creds)

    # Request a list of all the messages
    # Calculate the timestamp for 24 hours ago
    now = datetime.now(timezone.utc)
    yesterday = now - timedelta(days=1)
    yesterday_timestamp = int(yesterday.timestamp())
    query = f'after:{yesterday_timestamp} (category:primary OR category:updates)'
    result = service.users().messages().list(userId='me', q=query).execute()
    messages = result.get('messages')

    # Check if messages are returned
    if not messages:
        print("No new messages found.")
        return

    # Messages is a list of dictionaries where each dictionary contains a message id.
    # Iterate through all the messages
    for msg in messages:
        # Get the message from its id
        txt = service.users().messages().get(userId='me', id=msg['id']).execute()
        # Use try-except to avoid any Errors
        try:
            # Get value of 'payload' from dictionary 'txt'
            payload = txt['payload']
            headers = payload['headers']

            # Look for Subject and Sender Email in the headers
            subject = sender = None
            for d in headers:
                if d['name'] == 'Subject':
                    subject = d['value']
                elif d['name'] == 'From':
                    sender = d['value']

            # The Body of the message is in Encrypted format. So, we have to decode it.
            # Get the data and decode it with base64 decoder.

            print(payload)

            if 'parts' in txt['payload']:
                for part in txt['payload']['parts']:
                    if part['mimeType'] == 'text/html':
                        data = part['body']['data']
                        data = data.replace("-", "+").replace("_", "/")
                        decoded_data = base64.b64decode(data)
                        soup = BeautifulSoup(decoded_data, "html.parser")

                        # Extract links and append to the text
                        for a in soup.find_all('a', href=True):
                            a_text = f"{a.text} ({a['href']})"
                            a.replace_with(a_text)

                        body = soup.get_text()

                    elif part['mimeType'] == 'text/plain':
                        data = part['body']['data']
                        data = data.replace("-", "+").replace("_", "/")
                        decoded_data = base64.b64decode(data)
                        body = decoded_data.decode('utf-8')
            else:
                data = payload['body'].get('data')
                if data:
                    data = data.replace("-", "+").replace("_", "/")
                    decoded_data = base64.b64decode(data)
                    body = decoded_data.decode('utf-8')

            # Generate a filename
            filename = f"emails/{sender}_{msg['id']}.txt"

            # Write email data to the file
            with open(filename, 'w', encoding='utf-8') as f:
                f.write(f"Subject: {subject}\n")
                f.write(f"From: {sender}\n")
                f.write("Message:\n")
                f.write(body)

        except Exception as e:
            print(f"An error occurred: {e}")
            pass

In [81]:
getEmails()

{'partId': '', 'mimeType': 'multipart/alternative', 'filename': '', 'headers': [{'name': 'Delivered-To', 'value': 'kishanseksaria@gmail.com'}, {'name': 'Received', 'value': 'by 2002:a05:7023:d0c:b0:89:e036:1a49 with SMTP id qs12csp127738dlb;        Sat, 7 Sep 2024 08:06:57 -0700 (PDT)'}, {'name': 'X-Google-Smtp-Source', 'value': 'AGHT+IFCABbdHAX71Mu7laciHnyFTeGDjOdo+Pqhp9gQR8i6qLwHPXOrJGhlS8x8LeOryc8vLVHl'}, {'name': 'X-Received', 'value': 'by 2002:ac8:5812:0:b0:456:7693:8407 with SMTP id d75a77b69052e-4580c7a5947mr84956661cf.56.1725721617249;        Sat, 07 Sep 2024 08:06:57 -0700 (PDT)'}, {'name': 'ARC-Seal', 'value': 'i=1; a=rsa-sha256; t=1725721617; cv=none;        d=google.com; s=arc-20240605;        b=XHiHasbYmTMa0/gAvAVtrtG9si7lqd2nEGUibZHEzcJIxtm4iCZItPFCb1k1GwURPP         W3BD2K8lGkIQbJZjC06sE/qsKrTSBmaodVoTDOHvTIOo6ftSNT6WTGSZY85NhSr3TsYl         J5XwTBy4BHc9bYQ6ozL4olbJLmoYIaiTMhmvzLQgdAH4zl0IaJ7zRqcIDUminbICK6EQ         8jAGAeV7iQ9QA+iXHSV/XoLwAGgcoMGg5ol7uJH8vvYmgEaplXDZWc

In [83]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()
pinecone_api_key = os.getenv("PINECONE_API_KEY")
openai.api_key = os.getenv("OPENAI_API_KEY")

In [84]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

# Prompt
prompt_text = """You are an assistant tasked with summarizing and classifying emails. 

First, summarize the email content concisely and only include the links present corresponding to jobs in the email. Then, classify the email as one of the following categories based on its content. Use the criteria provided for each category:

1. **Job Applied Confirmation**: Emails confirming that a job application has been successfully received or submitted. These emails often thank the applicant for their interest in the position and provide further instructions or timelines for the next steps.

2. **Job Rejection**: Emails that inform the recipient that their application has not been successful. These emails usually contain phrases like "we regret to inform you," "we have decided to move forward with other candidates," or similar language indicating a rejection.

3. **Job Offered**: Emails that offer a job to the recipient. They often include details about the job offer, such as position title, salary, benefits, and instructions for how to accept the offer.

4. **Completed Task Corresponding to a Job**: Emails that confirm the completion of a task related to a job application or interview process, such as the submission of a coding test, assessment, or any required documents. If an email mentions that a test has been submitted or a task is completed for a job application, classify it as this.

5. **New Job Notification**: Emails that notify the recipient about new job openings or opportunities. These might come from job boards, recruiters, or companies and often include descriptions of available positions.

6. **Job Task or To Do**: Emails that include tasks or action items related to a job, such as follow-up actions after an interview, requests for additional information, or other job-related tasks that need to be completed by the recipient.

7. **Not Job Specific Email**: Emails that do not pertain to a specific job application, job offer, or job-related task. These could be newsletters, general information, or any emails not directly related to a job process.

Email: {element}

Output:
Summary: [Provide the summary here]
Classification: [Provide the classification here]
"""
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
model = ChatOpenAI(temperature=0, model="gpt-4o")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()
summarize_chain

{
  element: RunnableLambda(lambda x: x)
}
| ChatPromptTemplate(input_variables=['element'], messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['element'], template='You are an assistant tasked with summarizing and classifying emails. \n\nFirst, summarize the email content concisely and only include the links present corresponding to jobs in the email. Then, classify the email as one of the following categories based on its content. Use the criteria provided for each category:\n\n1. **Job Applied Confirmation**: Emails confirming that a job application has been successfully received or submitted. These emails often thank the applicant for their interest in the position and provide further instructions or timelines for the next steps.\n\n2. **Job Rejection**: Emails that inform the recipient that their application has not been successful. These emails usually contain phrases like "we regret to inform you," "we have decided to move forward with other candidates,"

In [85]:
import classification as const

def classify_email(email_content):
    prompt_text1 = f"""You are a job email classifier. I want you to classify this email as one of the following: {', '.join(const.Classifications)}, just return the classification values. Here is the email: {email_content}. Use the criteria provided for each category:

    1. **Job Applied Confirmation**: Emails confirming that a job application has been successfully received or submitted. These emails often thank the applicant for their interest in the position and provide further instructions or timelines for the next steps.

    2. **Job Rejection**: Emails that inform the recipient that their application has not been successful. These emails usually contain phrases like "we regret to inform you," "we have decided to move forward with other candidates," or similar language indicating a rejection.

    3. **Job Offered**: Emails that offer a job to the recipient. They often include details about the job offer, such as position title, salary, benefits, and instructions for how to accept the offer.

    4. **Completed Task Corresponding to a Job**: Emails that confirm the completion of a task related to a job application or interview process, such as the submission of a coding test, assessment, or any required documents. If an email mentions that a test has been submitted or a task is completed for a job application, classify it as this.

    5. **New Job Notification**: Emails that notify the recipient about new job openings or opportunities. These might come from job boards, recruiters, or companies and often include descriptions of available positions.

    6. **Job Task or To Do**: Emails that include tasks or action items related to a job, such as follow-up actions after an interview, requests for additional information, or other job-related tasks that need to be completed by the recipient.

    7. **Not Job Specific Email**: Emails that do not pertain to a specific job application, job offer, or job-related task. These could be newsletters, general information, or any emails not directly related to a job process."""
    
    response = model(prompt_text1)
    return response.content

texts=[]
email_summaries = []
email_classifications = []
for filename in os.listdir("emails"):
		if filename.endswith(".txt"):
			filepath = os.path.join("emails", filename)
			with open(filepath, 'r', encoding='utf-8') as file:
				email_content = file.read()
				# Get the summary
				texts.append(email_content)
				input_data = {"element": email_content}
				summary = summarize_chain.invoke(input_data)
				email_summaries.append(summary)
				classification = classify_email(email_content)
				email_classifications.append(classification)

				

In [86]:
email_summaries

['Summary: This email is a receipt from Lyft for a ride taken with driver Budi on September 6, 2024. It includes details about the ride, fare breakdown, and options to add a tip, favorite the driver, find a lost item, or request a review.\n\nClassification: Not Job Specific Email',
 'Summary: Gayatri Deokar added you to the group "Beach bitches" on Splitwise. View the group here: [https://www.splitwise.com/optional_applinks/groups/70369310?tr=qx9t2cy1nctq00b]\n\nClassification: Not Job Specific Email',
 'Summary: The email from Oracle Talent Acquisition instructs the recipient, Kishan, to confirm their identity using a one-time pass code (304704) that will expire in 10 minutes.\n\nClassification: Job Task or To Do',
 'Summary: The email informs recipients about a power outage affecting neighborhoods around the University Park Campus (UPC) while the campus itself remains operational. It provides safety instructions, emergency contact numbers, and advice on handling equipment and lab ref

In [87]:
import uuid
import pinecone
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryStore
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import VectorStore, VectorStoreRetriever



import os
from pinecone import Pinecone, ServerlessSpec, PodSpec

pc = Pinecone(
    api_key= pinecone_api_key
)

# Create the index and populate it
om_index = 'om-test-1'
if om_index not in pc.list_indexes().names():
    pc.create_index(
        name=om_index,
        dimension=1536,
        metric='cosine',
        spec=ServerlessSpec(
          cloud = 'aws',
          region = "us-east-1")
    )

## Uploading the documents to the vector database
vectorstore = PineconeVectorStore(
    embedding=OpenAIEmbeddings(),
    index_name=om_index
)

doc_ids = [str(uuid.uuid4()) for _ in texts]
metadata = [{"classification": x} for x in email_classifications]
summary_texts = [
    Document(page_content=s, metadata={"doc_id": doc_ids[i], "classification": email_classifications[i]})
    for i, s in enumerate(email_summaries)
]

# Add documents to the vector store
vectorstore.add_documents(documents=summary_texts)

['6fec4d8d-66da-45ba-8662-88da6f11e36e',
 '3b521137-b91e-41da-a525-e1581f78bb3f',
 'ea67ae1f-280b-46f4-880d-dfb0d8ff7280',
 'ac2f016f-2400-44f2-8d30-97c14425fa1b',
 'dd9b7e3c-86d1-4da3-bdc7-e6a9ace335a9',
 '58b52e90-2b51-4a36-b29b-eb762228ac5f',
 '3a62ada4-5361-4a3a-8cda-1c72e414495c',
 '85ea9bc0-910f-4680-a48b-1571173d815f',
 'f0fccd34-54a8-4b8f-88e5-9e64d552c12f',
 '10031c1d-e4d9-4781-b3d3-e96f383ba4a5',
 'f3386c5a-18fc-4112-b597-d9785aa801d3',
 '286a0b79-9e9f-4922-8455-82c7e3c9ef7f',
 'ca0c1ed0-9344-4b9e-bf72-c9e86fc534df',
 '6cf700bd-6a36-48ba-8ce5-e88b5b4f9d36']

In [88]:
retriever = vectorstore.as_retriever(search_kwargs={"k": 15, "filter": {"classification": "Job Task or To Do"}})

In [89]:
def retrieve_context_for_classification(classification):
    retriever = vectorstore.as_retriever(
        search_kwargs={"k": 100, "filter": {"classification": classification}}
    )
    retrieved_docs = retriever.get_relevant_documents(classification)
    context = "\n".join([doc.page_content for doc in retrieved_docs])
    return context


from langchain_core.runnables import RunnablePassthrough
from operator import itemgetter

# Prompt template
template = """Answer the question based only on the following context, which include email contents:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
model = ChatOpenAI(temperature=0, model="gpt-4o")

# RAG pipeline
chain = (
    {
        "context": lambda inputs: retrieve_context_for_classification(inputs["classification"]),
        "question": RunnablePassthrough()
    }
    | prompt
    | model
    | StrOutputParser()
)

In [90]:
print(chain.invoke({
    "classification": "New Job Notification",
    "question": "You are a email monitor agent, and you need to make a list of all the job roles with the company name and if same job is found in multiple emails just put it once in the list."
}))

Based on the provided context, here is a consolidated list of job roles with their respective company names, ensuring that each job is listed only once:

1. **SNOWFLAKE**
   - Software Engineer

2. **Trade Desk**
   - Data Scientist I-Value Algorithms

3. **Michaels Arts and Crafts**
   - Software Developer III
   - Software Developer

4. **Cboe**
   - Software Engineer Intern
   - Software Engineer Intern in Kansas
   - Linux Engineer
   - Senior Software Engineer - NYC
   - Senior Software Engineer (C++) - Lenexa, KS
   - Web QA Software Engineer Intern

5. **Vanguard**
   - Application Engineering Technical Lead II
   - Technical Product Owner, Workplace Services SRE

6. **Russell Tobin**
   - SQL Data Analyst
   - Data Analyst
   - Applications Developer 1 (Software Developer)

7. **Dexian - Signature Consultants**
   - Data Analyst

8. **Scale AI**
   - Machine Learning Research Engineering Intern (Summer 2025)

9. **AvidXchange, Inc.**
   - 2025 Undergraduate Summer Intern: Softw

In [91]:
classification_questions = {
    "Job Applied Confirmation": "You are an email monitor agent, and you need to make a list of all companies where a job application has been confirmed in the format [Company Name][Job Title] and if the Job Title is not mentioned put SWE role.",
    "Job Rejection": "You are an email monitor agent, and you need to make a list of all companies that have sent job rejection emails in the format [Company Name][Job Title] and if the Job Title is not mentioned put SWE role.",
    "Job Offered": "You are an email monitor agent, and you need to make a list of all companies that have a sent a job offer email in the format [Company Name][Job Title] and if the Job Title is not mentioned put SWE role.",
    "Completed Task Corresponding to a Job": "You are an email monitor agent, and you need to make a list of all completed tasks corresponding to a job application emails in the format [Company Name][Job Title] and if the Job Title is not mentioned put SWE role.",
    "New Job Notification": "You are an email monitor agent, and you need to make a list of all new job notifications received in the format [Company Name][Job Title][Link] and if the same job is found in multiple emails just put it once in the list.",
    "Job Task or To Do": "You are an email monitor agent, and you need to make a list of all tasks or to-dos related to job applications emails in the format [Company Name][Job Title] and if the Job Title is not mentioned put SWE role.",
    "Not Job Specific Email": "List all emails not specific to any job application."
}

In [92]:
results = {}
for classification, question in classification_questions.items():
    print(f"Processing classification: {classification}")
    context = retrieve_context_for_classification(classification)
    if context.strip():  # Check if context is not empty
        result = chain.invoke({
            "classification": classification,
            "question": question
        })
    else:
        result = "No emails associated with this classification category."
    results[classification] = result
    print(f"Result for {classification}: {result}\n")

# Output all results
for classification, result in results.items():
    print(f"Classification: {classification}\nResult: {result}\n")

Processing classification: Job Applied Confirmation
Result for Job Applied Confirmation: - [Oracle][Software Engineer - NetSuite]

Processing classification: Job Rejection
Result for Job Rejection: No emails associated with this classification category.

Processing classification: Job Offered
Result for Job Offered: No emails associated with this classification category.

Processing classification: Completed Task Corresponding to a Job
Result for Completed Task Corresponding to a Job: No emails associated with this classification category.

Processing classification: New Job Notification
Result for New Job Notification: Based on the provided context, here is the list of all new job notifications received:

1. [SNOWFLAKE][Software Engineer][https://substack.com/redirect/7521321d-edc3-4931-a2e6-6ec1a559df17?j=eyJ1IjoiNGFkZGQzIn0.d0munqD0mxOYfs1httGXqT_Ki9jmMmWVuFlFmgEm9k0]
2. [Trade Desk][Data Scientist I-Value Algorithms][https://substack.com/redirect/8beedadc-663b-4b16-bad2-307abaa1172

In [None]:
#TODO: Delete all the email files from emails folder after the job is done
#TODO: Integrate Twilio
#TODO: Filter out the not job specific emails and do not send them to the database
#TODO: Fix classification for the emails/ add priority to the classifications
#TODO: Segregate the summarize and classification prompts and functions
#TODO: Run every 24 hours using a cron job/kafka
#TODO: Update tags based on existing companies and roles
#TODO: Optimise the code
#TODO: Try to add the data to a spreadsheet, and then use the spreadsheet as the source for the data
#TODO: Use twilio to send whatsapp notifications based on the classification