In [2]:
from google import genai
from google.genai import types
from typing import List
import sys
from dotenv import load_dotenv
import os
import json
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
import string

In [3]:
# Load variables from a local .env file (if present) into the process environment,
# so the os.getenv lookups for GOOGLE_API_KEY and USER_KEYWORDS below can find them.
load_dotenv()

def setup_client():
    """
    Build a google genai client from the GOOGLE_API_KEY environment variable.

    Returns:
        A genai.Client constructed with that API key.

    If the variable is missing or empty, a message is printed and the
    process exits with status 1 (SystemExit).
    """
    api_key = os.getenv("GOOGLE_API_KEY")
    if api_key:
        return genai.Client(api_key=api_key)

    # No usable key: report the problem and stop.
    print("GOOGLE_API_KEY is not set. Please set it in your environment.")
    sys.exit(1)

In [4]:
# Create the google genai client once; it is passed to classify_email / call_llm below.
# setup_client() exits the process if GOOGLE_API_KEY is not set.
client = setup_client()

In [5]:
def call_llm(client, content, instruction, model="gemini-2.0-flash",
             max_output_tokens=1024, temperature=0.1):
    """
    Send one piece of content to a model and return the text of its reply.

    Args:
        client: google genai client exposing `models.generate_content`.
        content: The user content to send (here: the email text).
        instruction: System instruction that tells the model how to respond.
        model: Name of the model to call.
        max_output_tokens: Upper bound on the length of the generated reply.
        temperature: Sampling temperature; the low default keeps the
            classification output close to deterministic.

    Returns:
        `response.text` of the generation response.
        NOTE(review): this may be None when the response carries no text
        (e.g. a blocked or empty candidate) - confirm against the SDK docs.
    """
    generation_config = types.GenerateContentConfig(
        max_output_tokens=max_output_tokens,
        temperature=temperature,
        system_instruction=instruction,
    )
    response = client.models.generate_content(
        model=model,
        contents=[content],  # the email body is sent as the single content item
        config=generation_config,
    )
    return response.text

In [None]:

def keyword_preprocessing(keywords: str) -> List[str]:
    """
    Preprocess the user-defined keywords into a list.

    The input is lower-cased, split on commas, and each piece is stripped of
    surrounding whitespace. Empty pieces (from trailing, leading or doubled
    commas such as "a,,b,") are dropped so they never become keywords.

    Args:
        keywords: Comma-separated keywords, e.g. "Networking, Job Fair".

    Returns:
        The cleaned keywords in their original order; an empty list when the
        input contains no non-blank keyword.
    """
    stripped = (keyword.strip() for keyword in keywords.lower().split(","))
    return [keyword for keyword in stripped if keyword]

def retrieve_user_keywords():
    """
    Read the USER_KEYWORDS environment variable and return it as a keyword list.

    The raw value is handed to keyword_preprocessing. If the variable is
    missing or empty, a message is printed and the process exits with
    status 1 (SystemExit).
    """
    raw_keywords = os.getenv("USER_KEYWORDS")
    if raw_keywords:
        return keyword_preprocessing(raw_keywords)

    # Nothing configured: report the problem and stop.
    print("USER_KEYWORDS is not set. Please set it in your environment.")
    sys.exit(1)


def classify_email(email_content: str, keywords: List[str], client):
    """
    Ask the LLM to categorize one email against the user-defined keywords.

    The keywords are joined with ", " and embedded in a system prompt that
    describes the task and the expected "CATEGORY: ... / CONFIDENCE: ..."
    reply format; the email text itself is sent as the content.

    Args:
        email_content: Text of the email to classify.
        keywords: User-defined keywords to classify against.
        client: Client object forwarded to call_llm.

    Returns:
        Whatever call_llm returns for the "gemini-2.0-flash" model.
    """
    keyword_list = ", ".join(keywords)

    # The prompt text is passed to the model verbatim, including its indentation.
    system_prompt = f"""
    You are an email classification assistant. Your task is to analyze the content of emails and categorize them based on the following user-defined keywords:

    [{keyword_list}] 
    
    Example:
    - Urgent: emergency, asap, immediate, critical, deadline
    - Marketing: promotion, discount, offer, campaign, subscribe
    - Technical: error, bug, issue, troubleshoot, support
    - Personal: family, holiday, birthday, congratulations, weekend

    Instructions:
    1. Analyze the full email content provided
    2. Identify any keywords that match the predefined categories
    3. Classify the email into the most appropriate category
    4. If multiple categories apply, report them all
    5. If no keywords match, classify as "General" 

    Return your classification in this format:
    CATEGORY: <determined categories (seperated by commas)>
    CONFIDENCE: <high/medium/low based on keyword density>
    
    Example:
    
    User Keywords: Urgent, Marketing, Technical, Personal
    
    Email Content:
    Greetings,
    
    This is the newsletter for this week. We have a special promotion for our loyal customers. Don't miss out on the discount offer!
    We also have a emergency announcement regarding a critical issue with our service.
    Please check your inbox for more details.
    Thank you!
    
    Output:
    CATEGORY: Urgent, Marketing
    CONFIDENCE: high
    

    Do not include any additional explanation or analysis in your response.
    """
    return call_llm(
        client,
        content=email_content,
        instruction=system_prompt,
        model="gemini-2.0-flash",
    )

In [7]:
# Load the email corpus; the entries are later indexed by position and read via their
# 'content' field.
# JSON text is UTF-8, so decode it explicitly instead of relying on the platform's
# locale-dependent default encoding.
# NOTE(review): this is an absolute, machine-specific path - adjust it when running elsewhere.
with open('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/data/qtm_emails.json', 'r', encoding='utf-8') as file:
    email_data = json.load(file)



# Load spaCy's small English pipeline ("en_core_web_sm") once at module level;
# preprocess_email below reuses it for tokenization and lemmatization.
nlp = spacy.load("en_core_web_sm")

def _is_noise_token(token):
    """Return True for tokens with no content: stop words, punctuation/symbols, whitespace."""
    text = token.text
    return (
        text in STOP_WORDS
        or token.is_punct  # punctuation as classified by spaCy (e.g. dashes, ellipses)
        or token.is_space  # whitespace-only tokens such as runs of newlines
        # Keeps the original ASCII punctuation/symbol filter, extended to
        # multi-character runs such as "--".
        or all(char in string.punctuation for char in text)
    )


def preprocess_email(email_text):
    """
    Normalize an email into a space-separated string of lemmas.

    The text is lower-cased, run through the module-level spaCy pipeline
    `nlp`, and every token that is a stop word, punctuation/symbol, or pure
    whitespace is dropped; the lemmas of the remaining tokens are joined
    with single spaces.

    Args:
        email_text: Raw email text.

    Returns:
        The processed text as a single string (empty if nothing remains).
    """
    # Lower-case first so tokens compare equal to the lower-case STOP_WORDS entries.
    doc = nlp(email_text.lower())

    lemmatized_tokens = [token.lemma_ for token in doc if not _is_noise_token(token)]

    return " ".join(lemmatized_tokens)

In [8]:
# Read the comma-separated USER_KEYWORDS environment variable as a list of
# lower-case keywords, then display the list.
user_keyword = retrieve_user_keywords()
user_keyword

['networking', 'internship', 'job fair', 'interview']

In [9]:
# Positions in email_data of the hand-picked emails to classify.
index = [0, 24, 44, 65, 123]

# One raw LLM reply per sampled email, in the same order as `index`.
output = []
for i in index:
    # Clean the email text, then classify it against the user's keywords.
    email = preprocess_email(email_data[i]['content'])
    result = classify_email(email, user_keyword, client)
    output += [result]

In [10]:
# Display the raw classification strings returned for the five sampled emails.
output

['CATEGORY: networking, internship\nCONFIDENCE: high\n',
 'CATEGORY: internship\nCONFIDENCE: high\n',
 'CATEGORY: networking\nCONFIDENCE: medium\n',
 'CATEGORY: internship\nCONFIDENCE: high\n',
 'CATEGORY: networking\nCONFIDENCE: medium\n']

In [11]:
# Print the original (un-preprocessed) text of email 44, the third sampled email,
# for manual comparison with the label the model assigned to it.
print(email_data[44]['content'])

SPECIAL ANNOUNCMENT FROM THE BIOLOGY DEPARTMENT
 
 
Are you a QSS-BIO major, or considering the QSS-BIO track?
 
Come this Friday afternoon (3:30 PM)  in the QTM Department – room PAIS 561 conference room  on the 5th floor 
 
- to meet fellow QSS-BIO majors over pizza!
 
This student-led event is a great opportunity to meet others in your track and hear about their experiences and plans. 
 
More details, GroupMe and RSVP links in the attached flyer.
 
Hope to see you there,
The QSS-BIO club
 
See attached flyer for details 
 
Sadie Hannans
Undergraduate Program Coordinator
Department of Quantitative Theory & Methods
Emory University
Email: shanna9@emory.edu | 470-620-7981
(she/her/hers)


In [12]:
import json

In [14]:
# Total number of emails loaded from qtm_emails.json.
len(email_data)

139

In [15]:
# Keep only the first 130 emails
# (presumably so they split evenly into the two groups of 65 written below - confirm).
email_filtered_130 = email_data[:130]

In [18]:
# Split the retained emails at position 65 into two groups and save each group
# as its own indented JSON file.
emailGroup1, emailGroup2 = email_filtered_130[:65], email_filtered_130[65:]

for group_path, group_emails in (
    ('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/data/emailGroup1.json', emailGroup1),
    ('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/data/emailGroup2.json', emailGroup2),
):
    with open(group_path, 'w') as f:
        json.dump(group_emails, f, indent=4)