In [1]:
from google import genai
from google.genai import types
from typing import List
import sys
from dotenv import load_dotenv
import os
import json
from typing import Dict, Any   
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from token_tracker import record_token_usage, save_token_usage, display_token_usage_summary
import string

In [97]:
load_dotenv()

def setup_client():
    """
    setting up the client for google genai
    """
    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
    GOOGLE_API_KEY = ''
    if not GOOGLE_API_KEY:
        print("GOOGLE_API_KEY is not set. Please set it in your environment.")
        sys.exit(1)
    client = genai.Client(api_key=GOOGLE_API_KEY)
    return client


try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    print("Downloading spaCy model...")
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

In [227]:
def preprocess_email(email_text: str) -> str:
    """
    Preprocess email text by tokenizing, lemmatizing, and removing stopwords.
    
    Args:
        email_text: The raw email text
        
    Returns:
        Preprocessed email text
    """
    # Convert emails to lower case
    email_text = email_text.lower()
    
    # Tokenize the email text
    doc = nlp(email_text)
    
    # Lemmatize, remove stopwords, and punctuation
    lemmatized_tokens = [token.lemma_ for token in doc if token.text not in STOP_WORDS and token.text not in string.punctuation]
    
    # Join the lemmatized tokens back into a string
    processed_text = " ".join(lemmatized_tokens)
    
    return processed_text

def keyword_preprocessing(keywords: str) -> List[str]:
    """
    Preprocess the user-defined keywords into a list.
    
    Args:
        keywords: Comma-separated string of keywords
        
    Returns:
        List of cleaned keywords
    """
    return [keyword.strip() for keyword in keywords.lower().split(",")]

def call_llm(client, content: str, instruction: str, model: str = "gemini-2.0-flash") -> str:
    """
    Call the language model to generate a response based on the content and instruction.
    
    Args:
        client: The generative AI client
        content: The email content to be classified
        instruction: The system instruction for the model
        model: The model to use for generation
        
    Returns:
        The model's response text
    """
    try:
        print(f"Calling LLM with {len(content)} characters of content")
        response = client.models.generate_content(
        model=model,
        contents=[content], # here should the content of email be
        config=types.GenerateContentConfig(
            max_output_tokens=1024,
            temperature=0.1,
            system_instruction= instruction,
        )
        )
        record_token_usage(content, instruction, model, response)
        if hasattr(response, 'text'):
            return response.text
        elif hasattr(response, 'candidates') and response.candidates:
            return response.candidates[0].content.parts[0].text
        else:
            print("Unexpected response format:", response)
            return str(response)
    except Exception as e:
        print(f"Error calling LLM: {str(e)}")
        return f"Error: {str(e)}"

def classify_email(email_content: str, keywords: List[str], client, number_of_keywords: int) -> Dict[str, Any]:
    """
    Classify an email based on its content and user-defined keywords.
    
    Args:
        email_content: The preprocessed email content
        keywords: List of user-defined keywords
        client: The generative AI client
        
    Returns:
        Classification result dictionary
    """
    KEYWORDS = keywords
    print(KEYWORDS)

    PROMPT_CLASSIFICATION = f"""
    You are an email classification assistant. Your task is to analyze the content of emails and identify which of the following keywords are relevant to the email:

    [{KEYWORDS}] 

    
    Instructions:
    1. Analyze the full email content provided
    2. Identify any keywords from the list that are relevant to the email, and only return one keyword that is most closely related to the email
    3. Return ONLY one relevant keyword
    4. If no keywords match, return "Non"
    5. The keywords should be assigned only if the cotent of email is closely related to the keywords!
    
    
    
    Return your classification in this format:
    KEYWORDS: <relevant keywords (separated by commas) or Non>
    
    
    
    Do not include any additional explanation or analysis in your response.
    """
    response = call_llm(client, content=email_content, instruction=PROMPT_CLASSIFICATION)
    
    # Parse the result
    try:
        keywords_line = next((line for line in response.strip().split('\n') if line.startswith('KEYWORDS:')), '')
        found_keywords = keywords_line.replace('KEYWORDS:', '').strip()
        
        if found_keywords.upper() == 'NONE':
            return {
                'relevant_keywords': [],
                'raw_result': response
            }
        else:
            return {
                'relevant_keywords': [kw.strip().lower() for kw in found_keywords.split(',')],
                'raw_result': response
            }
    except Exception as e:
        print(f"Error parsing classification result: {str(e)}")
        return {
            'relevant_keywords': [],
            'raw_result': response,
            'error': str(e)
        }


def classify_email_controlled(email_content: str, keywords: List[str], client, number_of_keywords: int) -> Dict[str, Any]:
    """
    Classify an email based on its content and user-defined keywords.
    
    Args:
        email_content: The preprocessed email content
        keywords: List of user-defined keywords
        client: The generative AI client
        
    Returns:
        Classification result dictionary
    """
    KEYWORDS = keywords
    print(KEYWORDS)

    PROMPT_CLASSIFICATION = f"""
    You are an email classification assistant. Your task is to analyze the content of emails and identify which of the following keywords are relevant to the email:

    [{KEYWORDS}] 

    
    Instructions:
    1. Analyze the full email content provided
    2. Identify any keywords from the list that are relevant to the email, and only return one keyword that is most closely related to the email
    3. Return ONLY one relevant keyword
    4. If no keywords match, return "Non"
    5. The keywords should be assigned only if the cotent of email is closely related to the keywords!
    
    
    
    Return your classification in this format:
    KEYWORDS: <relevant keywords (separated by commas) or Non>
    
    
    
    Do not include any additional explanation or analysis in your response.
    """
    response = call_llm(client, content=email_content, instruction=PROMPT_CLASSIFICATION)
    
    # Parse the result
    try:
        keywords_line = next((line for line in response.strip().split('\n') if line.startswith('KEYWORDS:')), '')
        found_keywords = keywords_line.replace('KEYWORDS:', '').strip()
        
        if found_keywords.upper() == 'NONE':
            return {
                'relevant_keywords': [],
                'raw_result': response
            }
        else:
            return {
                'relevant_keywords': [kw.strip().lower() for kw in found_keywords.split(',')],
                'raw_result': response
            }
    except Exception as e:
        print(f"Error parsing classification result: {str(e)}")
        return {
            'relevant_keywords': [],
            'raw_result': response,
            'error': str(e)
        }

In [213]:
client = setup_client()

In [105]:
with open('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/data/emailGroup1.json', 'r') as f:
    email1 = json.load(f)
with open('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/data/emailGroup2_labelled.json', 'r') as f:
    email2 = json.load(f)
    
len(email2), len(email1)

(65, 65)

In [106]:
email1[0]

{'sender_name': 'Hannans, Sadie Marie',
 'sender_email': 'sadie.marie.hannans@EMORY.EDU',
 'subject': 'More Announcements to consider - time sensitive',
 'date': 'March 28, 2025 06:12:21 PM GMT',
 'recipients': 'QSSMAJORS@LISTSERV.CC.EMORY.EDU',
 'reply_to': 'sadie.marie.hannans@EMORY.EDU',
 'content': 'On behalf of Emory’s Women in STEM organization, You are invited to their annual Networking Night!\n Join us on Wednesday, April 2nd, from 6-8 PMin the Math and Science Center (MSC) E208. This event will feature a panel of women professionals from STEM fields, who will share their experiences and discuss what it’s like to work in these roles, as well as their journeys as women in STEM. The event will be informal and relaxed, with pizza and refreshments provided. It’s a fantastic opportunity to meet professors, graduate students, and network with others in our community.\nIf you’re interested in attending, please RSVP here: https://tr.ee/v9QfhkjLRQ\n       _______________________________

In [102]:
all_emails = email1 + email2

cleaned_emails = []
for email in all_emails:
    cat = [item.strip().lower() for item in email['category'].split(',')]
    # print(cat)
    cats = []
    for c in cat:
        if c == 'admin':
            c = 'administration'
        if c == '':
            c = 'Non' 
        cats.append(c)
    
    cleaned_emails.append({
        'sender': email['sender_name'],
        'sender_email': email['sender_email'],
        'date': email['date'],
        'subject': email['subject'],
        'content': email['content'],
        'category': cats
    })

In [90]:
# save cleaned_emails to a json file
with open('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/data/qtm_emails_final_version.json', 'w') as f:
    json.dump(cleaned_emails, f, indent=4)


    


130

In [2]:
with open('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/data/qtm_emails_final_version.json', 'r') as f:
    all_qtm_emails = json.load(f)
    
len(all_qtm_emails)

130

In [91]:
import tqdm

In [117]:
keywords = []
for email in all_qtm_emails:
    # print(email['category'])
    keywords.extend(email['category'])
keyword = list(set(keywords))
keyword.remove('Non')
keyword

finalkeywords = ", ".join(keyword)
finalkeywords

'careers, administration, research, training, events, academics'

In [228]:
output = []

for email in tqdm.tqdm(all_qtm_emails): 
    content = email['subject'] + ' ' + email['content']
    len_keywords = len(set(email['category']))
    classification = classify_email(content, finalkeywords, client, len_keywords)
    output.append({
        'predicted_classification': classification
    })

  0%|          | 0/130 [00:00<?, ?it/s]

careers, administration, research, training, events, academics
Calling LLM with 1927 characters of content


  1%|          | 1/130 [00:00<01:15,  1.72it/s]

careers, administration, research, training, events, academics
Calling LLM with 1508 characters of content


  2%|▏         | 2/130 [00:01<01:02,  2.03it/s]

careers, administration, research, training, events, academics
Calling LLM with 985 characters of content


  2%|▏         | 3/130 [00:01<00:53,  2.40it/s]

careers, administration, research, training, events, academics
Calling LLM with 1403 characters of content


  3%|▎         | 4/130 [00:01<00:49,  2.52it/s]

careers, administration, research, training, events, academics
Calling LLM with 8260 characters of content


  4%|▍         | 5/130 [00:02<00:48,  2.56it/s]

careers, administration, research, training, events, academics
Calling LLM with 1505 characters of content


  5%|▍         | 6/130 [00:02<00:51,  2.42it/s]

careers, administration, research, training, events, academics
Calling LLM with 913 characters of content


  5%|▌         | 7/130 [00:02<00:52,  2.35it/s]

careers, administration, research, training, events, academics
Calling LLM with 959 characters of content


  6%|▌         | 8/130 [00:03<00:48,  2.53it/s]

careers, administration, research, training, events, academics
Calling LLM with 1180 characters of content


  7%|▋         | 9/130 [00:03<00:49,  2.44it/s]

careers, administration, research, training, events, academics
Calling LLM with 568 characters of content


  8%|▊         | 10/130 [00:04<00:49,  2.43it/s]

careers, administration, research, training, events, academics
Calling LLM with 1595 characters of content


  8%|▊         | 11/130 [00:04<00:49,  2.39it/s]

careers, administration, research, training, events, academics
Calling LLM with 509 characters of content


  9%|▉         | 12/130 [00:04<00:47,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 5233 characters of content


 10%|█         | 13/130 [00:05<00:47,  2.45it/s]

careers, administration, research, training, events, academics
Calling LLM with 1329 characters of content


 11%|█         | 14/130 [00:05<00:47,  2.44it/s]

careers, administration, research, training, events, academics
Calling LLM with 4279 characters of content


 12%|█▏        | 15/130 [00:06<00:46,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 6628 characters of content


 12%|█▏        | 16/130 [00:06<00:47,  2.42it/s]

careers, administration, research, training, events, academics
Calling LLM with 1044 characters of content


 13%|█▎        | 17/130 [00:07<00:47,  2.39it/s]

careers, administration, research, training, events, academics
Calling LLM with 1002 characters of content


 14%|█▍        | 18/130 [00:07<00:47,  2.34it/s]

careers, administration, research, training, events, academics
Calling LLM with 8862 characters of content


 15%|█▍        | 19/130 [00:07<00:47,  2.33it/s]

careers, administration, research, training, events, academics
Calling LLM with 3765 characters of content


 15%|█▌        | 20/130 [00:08<00:48,  2.27it/s]

careers, administration, research, training, events, academics
Calling LLM with 1124 characters of content


 16%|█▌        | 21/130 [00:08<00:45,  2.40it/s]

careers, administration, research, training, events, academics
Calling LLM with 1097 characters of content


 17%|█▋        | 22/130 [00:09<00:43,  2.47it/s]

careers, administration, research, training, events, academics
Calling LLM with 1201 characters of content


 18%|█▊        | 23/130 [00:09<00:47,  2.26it/s]

careers, administration, research, training, events, academics
Calling LLM with 356 characters of content


 18%|█▊        | 24/130 [00:10<00:42,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 5182 characters of content


 19%|█▉        | 25/130 [00:10<00:42,  2.47it/s]

careers, administration, research, training, events, academics
Calling LLM with 2428 characters of content


 20%|██        | 26/130 [00:10<00:41,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 545 characters of content


 21%|██        | 27/130 [00:11<00:41,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 800 characters of content


 22%|██▏       | 28/130 [00:11<00:38,  2.62it/s]

careers, administration, research, training, events, academics
Calling LLM with 3800 characters of content


 22%|██▏       | 29/130 [00:11<00:39,  2.59it/s]

careers, administration, research, training, events, academics
Calling LLM with 7002 characters of content


 23%|██▎       | 30/130 [00:12<00:38,  2.56it/s]

careers, administration, research, training, events, academics
Calling LLM with 2421 characters of content


 24%|██▍       | 31/130 [00:12<00:36,  2.71it/s]

careers, administration, research, training, events, academics
Calling LLM with 2168 characters of content


 25%|██▍       | 32/130 [00:13<00:37,  2.59it/s]

careers, administration, research, training, events, academics
Calling LLM with 5524 characters of content


 25%|██▌       | 33/130 [00:13<00:38,  2.49it/s]

careers, administration, research, training, events, academics
Calling LLM with 717 characters of content


 26%|██▌       | 34/130 [00:13<00:39,  2.45it/s]

careers, administration, research, training, events, academics
Calling LLM with 860 characters of content


 27%|██▋       | 35/130 [00:14<00:38,  2.45it/s]

careers, administration, research, training, events, academics
Calling LLM with 4256 characters of content


 28%|██▊       | 36/130 [00:14<00:39,  2.37it/s]

careers, administration, research, training, events, academics
Calling LLM with 157 characters of content


 28%|██▊       | 37/130 [00:15<00:36,  2.54it/s]

careers, administration, research, training, events, academics
Calling LLM with 1860 characters of content


 29%|██▉       | 38/130 [00:15<00:36,  2.54it/s]

careers, administration, research, training, events, academics
Calling LLM with 1018 characters of content


 30%|███       | 39/130 [00:15<00:36,  2.49it/s]

careers, administration, research, training, events, academics
Calling LLM with 983 characters of content


 31%|███       | 40/130 [00:16<00:35,  2.56it/s]

careers, administration, research, training, events, academics
Calling LLM with 6846 characters of content


 32%|███▏      | 41/130 [00:16<00:35,  2.53it/s]

careers, administration, research, training, events, academics
Calling LLM with 774 characters of content


 32%|███▏      | 42/130 [00:17<00:35,  2.51it/s]

careers, administration, research, training, events, academics
Calling LLM with 653 characters of content


 33%|███▎      | 43/130 [00:17<00:35,  2.47it/s]

careers, administration, research, training, events, academics
Calling LLM with 791 characters of content


 34%|███▍      | 44/130 [00:17<00:33,  2.60it/s]

careers, administration, research, training, events, academics
Calling LLM with 705 characters of content


 35%|███▍      | 45/130 [00:18<00:31,  2.66it/s]

careers, administration, research, training, events, academics
Calling LLM with 512 characters of content


 35%|███▌      | 46/130 [00:18<00:31,  2.65it/s]

careers, administration, research, training, events, academics
Calling LLM with 569 characters of content


 36%|███▌      | 47/130 [00:19<00:32,  2.59it/s]

careers, administration, research, training, events, academics
Calling LLM with 3850 characters of content


 37%|███▋      | 48/130 [00:19<00:31,  2.58it/s]

careers, administration, research, training, events, academics
Calling LLM with 1220 characters of content


 38%|███▊      | 49/130 [00:19<00:30,  2.66it/s]

careers, administration, research, training, events, academics
Calling LLM with 330 characters of content


 38%|███▊      | 50/130 [00:20<00:30,  2.64it/s]

careers, administration, research, training, events, academics
Calling LLM with 378 characters of content


 39%|███▉      | 51/130 [00:20<00:31,  2.51it/s]

careers, administration, research, training, events, academics
Calling LLM with 1825 characters of content


 40%|████      | 52/130 [00:20<00:29,  2.61it/s]

careers, administration, research, training, events, academics
Calling LLM with 456 characters of content


 41%|████      | 53/130 [00:21<00:30,  2.49it/s]

careers, administration, research, training, events, academics
Calling LLM with 2187 characters of content


 42%|████▏     | 54/130 [00:21<00:29,  2.55it/s]

careers, administration, research, training, events, academics
Calling LLM with 419 characters of content


 42%|████▏     | 55/130 [00:22<00:29,  2.57it/s]

careers, administration, research, training, events, academics
Calling LLM with 1844 characters of content


 43%|████▎     | 56/130 [00:22<00:30,  2.42it/s]

careers, administration, research, training, events, academics
Calling LLM with 5503 characters of content


 44%|████▍     | 57/130 [00:22<00:29,  2.51it/s]

careers, administration, research, training, events, academics
Calling LLM with 252 characters of content


 45%|████▍     | 58/130 [00:23<00:29,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 357 characters of content


 45%|████▌     | 59/130 [00:23<00:28,  2.53it/s]

careers, administration, research, training, events, academics
Calling LLM with 210 characters of content


 46%|████▌     | 60/130 [00:24<00:26,  2.66it/s]

careers, administration, research, training, events, academics
Calling LLM with 664 characters of content


 47%|████▋     | 61/130 [00:24<00:24,  2.76it/s]

careers, administration, research, training, events, academics
Calling LLM with 249 characters of content


 48%|████▊     | 62/130 [00:24<00:25,  2.65it/s]

careers, administration, research, training, events, academics
Calling LLM with 850 characters of content


 48%|████▊     | 63/130 [00:25<00:25,  2.63it/s]

careers, administration, research, training, events, academics
Calling LLM with 2106 characters of content


 49%|████▉     | 64/130 [00:25<00:24,  2.69it/s]

careers, administration, research, training, events, academics
Calling LLM with 1118 characters of content


 50%|█████     | 65/130 [00:25<00:24,  2.66it/s]

careers, administration, research, training, events, academics
Calling LLM with 1979 characters of content


 51%|█████     | 66/130 [00:26<00:24,  2.59it/s]

careers, administration, research, training, events, academics
Calling LLM with 1397 characters of content


 52%|█████▏    | 67/130 [00:26<00:25,  2.50it/s]

careers, administration, research, training, events, academics
Calling LLM with 1187 characters of content


 52%|█████▏    | 68/130 [00:27<00:25,  2.45it/s]

careers, administration, research, training, events, academics
Calling LLM with 963 characters of content


 53%|█████▎    | 69/130 [00:27<00:23,  2.62it/s]

careers, administration, research, training, events, academics
Calling LLM with 1763 characters of content


 54%|█████▍    | 70/130 [00:27<00:23,  2.59it/s]

careers, administration, research, training, events, academics
Calling LLM with 1048 characters of content


 55%|█████▍    | 71/130 [00:28<00:22,  2.58it/s]

careers, administration, research, training, events, academics
Calling LLM with 757 characters of content


 55%|█████▌    | 72/130 [00:28<00:22,  2.61it/s]

careers, administration, research, training, events, academics
Calling LLM with 1196 characters of content


 56%|█████▌    | 73/130 [00:29<00:20,  2.80it/s]

careers, administration, research, training, events, academics
Calling LLM with 1719 characters of content


 57%|█████▋    | 74/130 [00:29<00:20,  2.76it/s]

careers, administration, research, training, events, academics
Calling LLM with 2125 characters of content


 58%|█████▊    | 75/130 [00:29<00:19,  2.79it/s]

careers, administration, research, training, events, academics
Calling LLM with 885 characters of content


 58%|█████▊    | 76/130 [00:30<00:19,  2.79it/s]

careers, administration, research, training, events, academics
Calling LLM with 1005 characters of content


 59%|█████▉    | 77/130 [00:30<00:19,  2.69it/s]

careers, administration, research, training, events, academics
Calling LLM with 1360 characters of content


 60%|██████    | 78/130 [00:30<00:18,  2.74it/s]

careers, administration, research, training, events, academics
Calling LLM with 1516 characters of content


 61%|██████    | 79/130 [00:31<00:22,  2.27it/s]

careers, administration, research, training, events, academics
Calling LLM with 910 characters of content


 62%|██████▏   | 80/130 [00:31<00:20,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 1342 characters of content


 62%|██████▏   | 81/130 [00:32<00:19,  2.48it/s]

careers, administration, research, training, events, academics
Calling LLM with 1975 characters of content


 63%|██████▎   | 82/130 [00:32<00:18,  2.55it/s]

careers, administration, research, training, events, academics
Calling LLM with 3745 characters of content


 64%|██████▍   | 83/130 [00:32<00:18,  2.59it/s]

careers, administration, research, training, events, academics
Calling LLM with 212 characters of content


 65%|██████▍   | 84/130 [00:33<00:18,  2.54it/s]

careers, administration, research, training, events, academics
Calling LLM with 516 characters of content


 65%|██████▌   | 85/130 [00:33<00:17,  2.52it/s]

careers, administration, research, training, events, academics
Calling LLM with 7516 characters of content


 66%|██████▌   | 86/130 [00:34<00:18,  2.40it/s]

careers, administration, research, training, events, academics
Calling LLM with 835 characters of content


 67%|██████▋   | 87/130 [00:34<00:17,  2.39it/s]

careers, administration, research, training, events, academics
Calling LLM with 562 characters of content


 68%|██████▊   | 88/130 [00:35<00:17,  2.40it/s]

careers, administration, research, training, events, academics
Calling LLM with 1047 characters of content


 68%|██████▊   | 89/130 [00:35<00:16,  2.42it/s]

careers, administration, research, training, events, academics
Calling LLM with 233 characters of content


 69%|██████▉   | 90/130 [00:35<00:16,  2.36it/s]

careers, administration, research, training, events, academics
Calling LLM with 903 characters of content


 70%|███████   | 91/130 [00:36<00:16,  2.35it/s]

careers, administration, research, training, events, academics
Calling LLM with 727 characters of content


 71%|███████   | 92/130 [00:36<00:15,  2.46it/s]

careers, administration, research, training, events, academics
Calling LLM with 8669 characters of content


 72%|███████▏  | 93/130 [00:37<00:14,  2.57it/s]

careers, administration, research, training, events, academics
Calling LLM with 2341 characters of content


 72%|███████▏  | 94/130 [00:37<00:14,  2.51it/s]

careers, administration, research, training, events, academics
Calling LLM with 2065 characters of content


 73%|███████▎  | 95/130 [00:37<00:13,  2.62it/s]

careers, administration, research, training, events, academics
Calling LLM with 5526 characters of content


 74%|███████▍  | 96/130 [00:38<00:12,  2.62it/s]

careers, administration, research, training, events, academics
Calling LLM with 622 characters of content


 75%|███████▍  | 97/130 [00:38<00:12,  2.72it/s]

careers, administration, research, training, events, academics
Calling LLM with 3868 characters of content


 75%|███████▌  | 98/130 [00:38<00:11,  2.69it/s]

careers, administration, research, training, events, academics
Calling LLM with 1483 characters of content


 76%|███████▌  | 99/130 [00:39<00:11,  2.59it/s]

careers, administration, research, training, events, academics
Calling LLM with 418 characters of content


 77%|███████▋  | 100/130 [00:39<00:11,  2.61it/s]

careers, administration, research, training, events, academics
Calling LLM with 3888 characters of content


 78%|███████▊  | 101/130 [00:40<00:11,  2.45it/s]

careers, administration, research, training, events, academics
Calling LLM with 679 characters of content


 78%|███████▊  | 102/130 [00:40<00:11,  2.49it/s]

careers, administration, research, training, events, academics
Calling LLM with 7816 characters of content


 79%|███████▉  | 103/130 [00:40<00:10,  2.46it/s]

careers, administration, research, training, events, academics
Calling LLM with 676 characters of content


 80%|████████  | 104/130 [00:41<00:10,  2.55it/s]

careers, administration, research, training, events, academics
Calling LLM with 1129 characters of content


 81%|████████  | 105/130 [00:41<00:09,  2.64it/s]

careers, administration, research, training, events, academics
Calling LLM with 562 characters of content


 82%|████████▏ | 106/130 [00:42<00:09,  2.60it/s]

careers, administration, research, training, events, academics
Calling LLM with 405 characters of content


 82%|████████▏ | 107/130 [00:42<00:08,  2.72it/s]

careers, administration, research, training, events, academics
Calling LLM with 5920 characters of content


 83%|████████▎ | 108/130 [00:42<00:08,  2.57it/s]

careers, administration, research, training, events, academics
Calling LLM with 960 characters of content


 84%|████████▍ | 109/130 [00:43<00:08,  2.49it/s]

careers, administration, research, training, events, academics
Calling LLM with 210 characters of content


 85%|████████▍ | 110/130 [00:43<00:08,  2.49it/s]

careers, administration, research, training, events, academics
Calling LLM with 5417 characters of content


 85%|████████▌ | 111/130 [00:44<00:07,  2.46it/s]

careers, administration, research, training, events, academics
Calling LLM with 3911 characters of content


 86%|████████▌ | 112/130 [00:44<00:07,  2.56it/s]

careers, administration, research, training, events, academics
Calling LLM with 1015 characters of content


 87%|████████▋ | 113/130 [00:44<00:06,  2.73it/s]

careers, administration, research, training, events, academics
Calling LLM with 6255 characters of content


 88%|████████▊ | 114/130 [00:45<00:05,  2.71it/s]

careers, administration, research, training, events, academics
Calling LLM with 6500 characters of content


 88%|████████▊ | 115/130 [00:45<00:06,  2.49it/s]

careers, administration, research, training, events, academics
Calling LLM with 11670 characters of content


 89%|████████▉ | 116/130 [00:46<00:05,  2.44it/s]

careers, administration, research, training, events, academics
Calling LLM with 5894 characters of content


 90%|█████████ | 117/130 [00:46<00:05,  2.46it/s]

careers, administration, research, training, events, academics
Calling LLM with 1793 characters of content


 91%|█████████ | 118/130 [00:46<00:05,  2.30it/s]

careers, administration, research, training, events, academics
Calling LLM with 7351 characters of content


 92%|█████████▏| 119/130 [00:47<00:04,  2.41it/s]

careers, administration, research, training, events, academics
Calling LLM with 6745 characters of content


 92%|█████████▏| 120/130 [00:47<00:03,  2.60it/s]

careers, administration, research, training, events, academics
Calling LLM with 357 characters of content


 93%|█████████▎| 121/130 [00:47<00:03,  2.71it/s]

careers, administration, research, training, events, academics
Calling LLM with 292 characters of content


 94%|█████████▍| 122/130 [00:48<00:03,  2.62it/s]

careers, administration, research, training, events, academics
Calling LLM with 5359 characters of content


 95%|█████████▍| 123/130 [00:48<00:02,  2.47it/s]

careers, administration, research, training, events, academics
Calling LLM with 5359 characters of content


 95%|█████████▌| 124/130 [00:49<00:02,  2.45it/s]

careers, administration, research, training, events, academics
Calling LLM with 322 characters of content


 96%|█████████▌| 125/130 [00:49<00:01,  2.51it/s]

careers, administration, research, training, events, academics
Calling LLM with 503 characters of content


 97%|█████████▋| 126/130 [00:50<00:01,  2.47it/s]

careers, administration, research, training, events, academics
Calling LLM with 14660 characters of content


 98%|█████████▊| 127/130 [00:50<00:01,  2.36it/s]

careers, administration, research, training, events, academics
Calling LLM with 574 characters of content


 98%|█████████▊| 128/130 [00:50<00:00,  2.43it/s]

careers, administration, research, training, events, academics
Calling LLM with 1206 characters of content


 99%|█████████▉| 129/130 [00:51<00:00,  2.58it/s]

careers, administration, research, training, events, academics
Calling LLM with 9289 characters of content


100%|██████████| 130/130 [00:51<00:00,  2.52it/s]


In [215]:
display_token_usage_summary()

Total API calls: 1046
Total tokens used: 2387757
Total prompt tokens: 2379500
Total completion tokens: 8257


In [229]:
thetype = '0shot_onekeyword_generated'
with open(f'/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/qtm_output_{thetype}.json', 'w') as f:
    json.dump(output, f, indent=4)

In [27]:
with open('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/qtm_output_0shot_onekeyword_generated.json', 'r') as f:
    output = json.load(f)

In [28]:
length = len(all_qtm_emails)
def calcualte_acc(predicted_keywords, true_keywords):
    intersection = len(set(predicted_keywords) & set(true_keywords))
    return 1 if intersection > 0 else 0
total_acc = 0
for i in range(length):
    predicted = output[i]["predicted_classification"]["relevant_keywords"]
    
    true = all_qtm_emails[i]["category"]  
    total_acc += calcualte_acc(predicted, true)

average_acc = total_acc / length
print(average_acc)

0.5153846153846153


In [29]:
length = int(len(all_qtm_emails))

def calculate_jaccard(predicted_keywords, true_keywords):
    
    intersection = len(set(predicted_keywords) & set(true_keywords))
    union = len(set(predicted_keywords) | set(true_keywords))
    
    return intersection / union if union > 0 else 1.0  # If both sets empty, similarity is 1
    return intersection / union if union > 0 else 1.0  # If both sets empty, similarity is 1

# Calculate average Jaccard similarity across all examples
total_jaccard = 0
for i in range(65, length):
    predicted = output[i]["predicted_classification"]["relevant_keywords"]
    
    true = all_qtm_emails[i]["category"]  
    total_jaccard += calculate_jaccard(predicted, true)

average_jaccard = total_jaccard / (length/2)

def calculate_f1(predicted_keywords, true_keywords):
    # Calculate precision and recall
    true_set = set(true_keywords)
    pred_set = set(predicted_keywords)
    
    intersection = len(true_set & pred_set)
    
    precision = intersection / len(pred_set) if len(pred_set) > 0 else 0
    recall = intersection / len(true_set) if len(true_set) > 0 else 0
    
    # Calculate F1
    if precision + recall == 0:
        return 0
    return 2 * (precision * recall) / (precision + recall)

# Calculate average F1 across all examples
total_f1 = 0
for i in range(65, length):
    predicted = output[i]["predicted_classification"]["relevant_keywords"]
    true = all_qtm_emails[i]["category"]
    total_f1 += calculate_f1(predicted, true)

average_f1 = total_f1 / (length/2)

def calculate_precision(predicted_keywords, true_keywords):
    true_set = set(true_keywords)
    pred_set = set(predicted_keywords)
    
    intersection = len(true_set & pred_set)
    
    return intersection / len(pred_set) if len(pred_set) > 0 else 1.0  # If no predictions, precision is 1

# Calculate average precision
total_precision = 0
for i in range(65, length):
    predicted = output[i]["predicted_classification"]["relevant_keywords"]
    true = all_qtm_emails[i]["category"]
    total_precision += calculate_precision(predicted, true)

average_precision = total_precision / (length/2)


def calculate_recall(predicted_keywords, true_keywords):
    true_set = set(true_keywords)
    pred_set = set(predicted_keywords)
    
    intersection = len(true_set & pred_set)
    
    return intersection / len(true_set) if len(true_set) > 0 else 1.0  # If no true labels, recall is 1

# Calculate average recall
total_recall = 0
for i in range(65, length):
    predicted = output[i]["predicted_classification"]["relevant_keywords"]
    true = all_qtm_emails[i]["category"]
    total_recall += calculate_recall(predicted, true)

average_recall = total_recall / (length/2)

def calculate_custom_score(predicted_keywords, true_keywords):
    true_set = set(true_keywords)
    pred_set = set(predicted_keywords)
    
    # Calculate matches and mismatches
    matches = len(true_set & pred_set)
    mismatches = len(pred_set - true_set)  # Predicted but not true
    
    # Calculate score
    points = matches - mismatches
    total_possible = len(true_set)
    
    return points / total_possible if total_possible > 0 else 0  # Avoid division by zero

# Calculate average custom score
total_custom = 0
for i in range(65, length):
    predicted = output[i]["predicted_classification"]["relevant_keywords"]
    true = all_qtm_emails[i]["category"]
    total_custom += calculate_custom_score(predicted, true)

average_custom = total_custom / (length/2)

In [30]:
# print the average precision, recall, f1, and custom score
print(f"Average precision: {average_precision}")
print(f"Average jaccard: {average_jaccard}")
print(f"Average recall: {average_recall}")
print(f"Average F1: {average_f1}")
print(f"Average custom score: {average_custom}")

Average precision: 0.38461538461538464
Average jaccard: 0.38461538461538464
Average recall: 0.38461538461538464
Average F1: 0.38461538461538464
Average custom score: -0.23076923076923078


In [209]:
idx = 102
print(output[idx]['predicted_classification']['relevant_keywords'])
print(all_qtm_emails[idx]['category'])

['careers']
['events']


In [210]:
"For QTM Undergraduates\n\nFriday, \nFebruary 23, 2024\n\n\n\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\n\nQTM INFORMATION SESSION – MARCH 1ST at 12:30 PM -2:00 PM at PAIS 290 \nSEE SPECIAL ANNOUNCEMENT EMAIL FOR DETAILS\n\xa0\n\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\nDear Students,\nWe would like to encourage you to apply now to become an Emory Writing Center (EWC) tutor in the academic year 2024-2025. The EWC is recruiting students who are strong writers, listeners, and collaborators. We also believe that we are best able to serve the Emory community when our staff includes students with diverse backgrounds and with competencies in a range of languages and discourses.\xa0\nThe application deadline is March 1 for positions that will formally begin in August 2024. You can find application instructions on our website under “For Students”/“Work for Us”: https://writingcenter.emory.edu/students/work-for-us/index.html.\nThe EWC Team\n See attached flyer for details\Submitted by Jeff Mullis\n\xa0\n\xa0\n\xa0\n\xa0\n\nWant to participate?\nParticipant Applications: https://forms.office.com/r/bdBXrdEqU5\n\xa0\x \nSee flyer for details including QR code\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\nSubmitted Isabela Galoustian\n\xa0\n\xa0\n\xa0\nOpportunity\n\xa0\nSTATISTICAL STAFF ANALYST POSITION\n\xa0\nSee attached flyer for details \n\xa0\n\xa0\n\xa0\n\n\xa0\n\xa0\n\xa0\nMessage from\nMargaret Corcoran Recruitment\xa0 Director\xa0 with Green Corps. \nField School for Environmental Organizing \n\xa0\nWe are looking for people who are concerned by the state of the environment and who want to do something about it. Our yearlong program has prepared recent college graduates for 30 years to launch a career in environmental \xa0organizing and advocacy by providing hands-on experience running campaigns with groups like the Wilderness Society, Endangered Species Coalition, Mighty Earth and more. We then connect our graduates to jobs in the environmental and social change movement.\n\xa0\nGreen Corps Organizer Program - The Field School for Environmental Organizing\nGreen Corps is looking for college graduates who are ready to take on the biggest environmental challenges of our day.\nLearn more and apply on our website!\n\xa0\nThe planet needs all the help it can get. To win now and build a strong foundation for lasting progress, we need people who know how to organize: to run organizations and campaigns that will inspire the support and action we need to save our environment. Our program starts with intensive classroom training. You learn about issues and campaigns, organizing theory and principles, and skills from recruiting volunteers to staging a media event. Then, you move to a more hands-on experience. You make a difference on important campaigns to transition our country to clean energy, protect wildlife, and more. For example, Lauren Karpinski, an organizer from the Green Corps Class of 2019, brought together a coalition of 40 groups that helped win a landmark solar energy bill in Arkansas.\n\xa0\nWe’re accepting the top 20 candidates for our 2024-2025 program. Graduates will join our more than 400 alumni who are leading environmental campaigns and organizations across the country. If you’re passionate about the environment and ready to learn and practice the craft of organizing after graduation, apply today!\n\xa0\nNot a graduating senior? Add your name to our list to receive updates about Green Corps including summer internships and jobs in the environmental and broader social change field!\n\xa0\nGreen Corps’ year-long program begins in August 2024 with introductory classroom training, and continues with field placements in multiple locations across the U.S. Candidates must be willing to relocate. Please contact Margaret Corcoran at margaret@greencorps.org with additional questions.\n\xa0\nGreen Corps is part of The Public Interest Network, which operates and supports organizations committed to a shared vision of a better world and a\xa0 strategic approach to social change. Visit publicinterestnetwork.org to learn more. Green Corps is an equal opportunity employer. The target annual compensation for this position is $32,500 (but compensation may range between $32,500 and $39,000 depending on location). Green Corps' benefits package includes medical insurance for employees and dependents, needs-based student loan assistance, commuter benefit program, sick pay (60-72 hours/year depending on location), and 80 hours (2 weeks) of accrued vacation. We also offer short-term positions, an excellent training program, and opportunities for advancement.\n\xa0\nNeed more information contact her at (617) 901-6114.\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\nLUNCH & LEARN\n\xa0\nThe Emory Center for Digital Scholarship (ECDS) and the Envisioning Baroque Rome digital humanities project invite you for a one-hour round-table Lunch & Learn at noon\xa0on Thursday, March 21st in Woodruff Library.\nEnjoy a provided lunch while the project team discusses the process of building a 3D model of the city of Rome. Envisioning Baroque Rome began in 2013 with the goal of rebuilding Rome in 1676 according to the map of that year, Nuova piante et alzata della città di Roma, and other prints by artist Giovanni Battista Falda. Led by Sarah McPhee, Samuel Candler Dobbs Professor of Art History, the round-table will include ECDS staff Joanna Mundy, Ian Burr, and John Halbert, and artist Nicole Costello Matthews. The team will discuss the research, modeling, texturing, and animation of Baroque Rome, and you will have an opportunity to explore the newest version of the 3D city yourself!\n\xa0\nLearn about the recent research in the project and stroll the streets of Rome with Giovanni Battista Falda.\n\xa0\nWe hope that you are able to attend. To RSVP, fill out this form\xa0by Monday, March 18th. If you have any questions, please email Joanna Mundy (jcmundy@emory.edu).\n\xa0\nSubmitted by Joanna Mundy\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\n\n\xa0\nPrerequisites\nYou must complete ALL prerequisites listed in order to enroll in these courses.\nQTM 210\n\t•\tEither QTM 120 or MATH 210 or MATH 211\nQTM 220\n\t•\tQTM 110\n\t•\tQTM 150\n\t•\tQTM 210 or ECON 220 or MATH 361 with a plan to co-enroll in MATH 362\n\t•\tMATH 210 or MATH 211\n\t•\tMATH 221\nElective Prerequisites\n\t•\tQTM 385 prerequisites will be listed in the course description!!!\n\xa0\nEquivalents\nBelow are courses that act as equivalents to the respective QTM courses\nQTM 210:\n\t•\tECON 220 and [MATH 210 or MATH 211]\n\t•\tOR - MATH 362\nQTM 220:\n\t•\tECON 320 plus QTM 110, QTM 150, MATH 221 and [MATH 210 or MATH 211]\n\xa0\nFor General Academic Advising\nSchedule an appointment at\xa0https://calendly.com/emoryqtm.\n\xa0\nIf you have any questions regarding the above information, please do not hesitate to reach out.\n\xa0\nBest,\nSadie Hannans\nQTM Program Coordinator\nShanna9@emory.edu\xa0\xa0 470-620-7981\n\n\n\xa0\n\n\xa0\n\xa0\n\n\xa0\xa0\xa0\n\xa0\n\xa0\xa0\xa0\xa0\xa0\n\n\xa0\xa0\xa0\n\n\n\n\nPROFESSIONAL DEVELOPMENT\n\nQTM Preparation: A Pathway to Professionalism\n\xa0\nNew Resources Available\n\n\xa0\n\xa0Getting Started with SQL\n\n\xa0\nGit and GitHub\n\xa0\n\n\xa0\n\xa0\n\xa0\n\xa0\n\nQTM DataCamp Access\nQTM Access to DataCamp is now back in business! Sign up here.\n\n\n\xa0\n\n\xa0\n\xa0\n\xa0\n\n\n\xa0\n\n\xa0\xa0\n\xa0\n\n\n\xa0\nConnect with Emory Alumni\n\nEmory Connects is a platform sponsored by Emory Alumni and Engagement and is a space for current students and alumni to connect and network. Log in to\xa0Emory Connects\xa0today to see what is available!\n\xa0\n\nQTM DataCamp Access\n\nQTM Access to DataCamp is now back in business! Sign up here.\n\xa0\n\xa0\n\n\xa0\nLooking for a spring or summer internship or full-time job?\n\nCheck out the attached document for tips on internship/job searches. Positions are still being posted on Handshake and many are still accepting applications.\xa0Log on to Handshake\xa0to check out some of the roles!\n\n\xa0\xa0\n\xa0\n\xa0\n\xa0\nSadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"

"For QTM Undergraduates\n\nFriday, \nFebruary 23, 2024\n\n\n\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\n\n\nQTM INFORMATION SESSION – MARCH 1ST at 12:30 PM -2:00 PM at PAIS 290 \nSEE SPECIAL ANNOUNCEMENT EMAIL FOR DETAILS\n\xa0\n\n\xa0\n\xa0\n\xa0\n\xa0\n\xa0\nDear Students,\nWe would like to encourage you to apply now to become an Emory Writing Center (EWC) tutor in the academic year 2024-2025. The EWC is recruiting students who are strong writers, listeners, and collaborators. We also believe that we are best able to serve the Emory community when our staff includes students with diverse backgrounds and with competencies in a range of languages and discourses.\xa0\nThe application deadline is March 1 for positions that will formally begin in August 2024. You can find application instructions on our website under “For Students”/“Work for Us”: https://writingcenter.emory.edu/students/work-for-us/index.html.\nThe EWC Team\n\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\x

In [None]:
""" 

    Examples:
    
    Example 1:
    
    Content: "On behalf of Emory\u2019s Women in STEM organization, You are invited to their annual Networking Night!\n Join us on Wednesday, April 2nd, from 6-8 PMin the Math and Science Center (MSC) E208. This event will feature a panel of women professionals from STEM fields, who will share their experiences and discuss what it\u2019s like to work in these roles, as well as their journeys as women in STEM. The event will be informal and relaxed, with pizza and refreshments provided. It\u2019s a fantastic opportunity to meet professors, graduate students, and network with others in our community.\nIf you\u2019re interested in attending, please RSVP here: https://tr.ee/v9QfhkjLRQ\n       ____________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________\n  SUMMER RESEARCH PROGRAM: AI Xperience \nEmory\u2019s Center for AI Learning invites students toapply to AI.Xperience, its summer applied research program.In this program, students will have the opportunity to grow their data science and programming skills with hands-on learning. To be selected, students need to:\u00b7\n\t\u2022\tHave been enrolled in classes in the spring 2025 semester and be enrolled at Emory in the fall 2024 semester\n\t\u2022\tAttend 3 or 4 team meetings per week\n\t\u2022\tBe able to commit roughly 20 hours per week to the project over the 6-week period\n\t\u2022\tHave knowledge of common statistical analysis and machine learning methods\n\t\u2022\tHave experience programming in R and/or PythonAll interested students should applyby 11:59 PM on March 29, 2025.We look forward to reviewing your applications!\nAPPLY\n \n             Sadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: events, research
    
    Example 2:
    
    Content: "APPLICATION DEADLINE EXTENDED - MARCH 31ST IT IS NOT TOO LATE TO JOIN!!!\n LEARN MORE ABOUT THE QTM AMBASSADORS COHORT ON THE QTM WEBSITE\n https://quantitative.emory.edu/opportunities/ambassadors.html\n  APPLICATIONS ARE OPEN TO JOIN THE QTM DEPARTMENT AMBASSADORS TEAM FOR AY 2025 - 26. \n QTM Ambassadors are a selective cohort of sophomores, juniors, and seniors who are committed to service and leadership in the QTM community. As the department's student representatives, Ambassadors will engage with our external advisory board, assist with programming, andcultivate professional skills and relationships that will prove to be valuable long after graduation.\n Here are a few requirements and opportunities:\n \t\u2022\tMust be a QTM major or QTM minor\n\t\u2022\tMust have a strong sense of responsibility\n\t\u2022\tMust be available to meet on Fridays for 1 hour at least twice a month between the hours of 3:00 PM \u2013 4:30 PM\n\t\u2022\tNetwork with external partners\n\t\u2022\tSupport faculty and staff with QTM events\n\t\u2022\tBuild your professional portfolio \n\t\u2022\tFor more requirements and opportunities, see QTM website : https://quantitative.emory.edu/opportunities/ambassadors.html\n Application link is available below and on the QTM website. Applications are due March 31st \n  QTM Ambassador Application (2025 - 2026)\n We love for you to join us!\n  Sadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: careers
    
    
    Example 3:
    
    Content: "SPECIAL ANNOUNCEMENT\n   \n\nSadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: Non
    
    Example 4:
    
    "content": "SPECIAL OPPORTUNITY\n\u00a0\n\u00a0\n\u00a0\n\nKey Highlights:\n\t\u2022\tDates: Saturday, October 12th - Wednesday, October 17th\u00a0(*students are usually provided a letter\u00a0 Associate Dean of\u00a0 the Pathways Center\u00a0excused from classes on Wednesday as this will be a travel day)\u00a0\n\t\u2022\tLocation: San Francisco, California\n\t\u2022\tFocus: STEM Careers\n\t\u2022\tApplication Open: August 29th, 2024 (LIVE NOW!)\n\t\u2022\tApplication Deadline: Sunday, September 8th, 2024, at 11:59 PM\u00a0\nEncourage your students to apply!\u00a0This program provides them with:\n\t\u2022\tExposure to diverse STEM fields:\u00a0Explore various career paths and gain real-world knowledge from industry professionals.\n\t\u2022\tNetworking opportunities:\u00a0Connect with successful Emory alumni working in STEM fields.\n\t\u2022\tProfessional development:\u00a0Build valuable networking and presentation skills.\n\t\u2022\tImmersive experience:\u00a0Explore San Francisco and engage in unique activities.\nLearn More:\n\t\u2022\tFull program details and application information: https://pathways.emory.edu/opportunities/career-trek/index.html\nPlease encourage students to apply, and share this information across your Canvas courses, listservs, etc. If students have any questions, please feel free to reach out\u00a0to the Pathways Center at cpd@emory.edu.\u00a0\u00a0\n\u00a0\n\u00a0\nSadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: events
    
    
    Example 5:
    
    "content": "NEW CLASS ANNOUNCEMENTS\n\u00a0\nThere is a new section of QTM 302W section 3 open and there is only 15 seats.\u00a0 The course (class #6445) is on MW at 1:00 PM -2:15 PM taught by Dr. Ben Miller.\u00a0\u00a0 This course is permission code only and prefer graduating senior (who need this course to graduate).\u00a0\u00a0 This is a first come, first serve situation. \u00a0If there are seats left, we will take juniors.\u00a0 \u00a0\n\u00a0\nNew section \u00a0\nQTM 151- section 2 -\u00a0 \u00a0Introduction to Statistical Computing II - MW 4pm-4:50pm\u00a0in\u00a0Anthropology Building 303- have plenty of seats (103) \u2013 3 credit hour\nQTM 350- section 2\u00a0 - Data Science Computing - MW 2:30pm-3:45pm\u00a0in\u00a0Math & Science Center - E208\u00a0 - have plenty of seats (41) \u2013 3 credit hours\n\u00a0\nQTM 185 \u2013 Section 1 - Applied Topics in QTM: Data Science for Social Good \u2013 Fridays - 1pm-1:50pm\u00a0in ONLINE\u00a0 - 1 credit hour\nQTM 185 \u2013 section 2 - \u00a0Applied Topics in QTM: Generative AI/Real-World Appl\u00a0 - M 6pm-8pm\u00a0in\u00a0New Psyc Bldg 250 (36 Eagle Row) \u2013 2 credit hours \n\u00a0\nNew course\nQTM 185 \u2013 section 3 - Applied Topics in QTM: Ethical Emerging Technologies - Th 5pm-7pm\u00a0in\u00a0New Psyc Bldg 220 (36 Eagle Row) \u2013 2 credit hours\n\u00a0\nNew course\nQTM 285 \u2013 section 1 - Topics in Quantitative Science: Prediction, Inference & Causality - MW 4pm-5:15pm\u00a0in\u00a0New Psyc Bldg 230 (36 Eagle Row)\nLab Friday 2:30pm-3:20pm\u00a0in\u00a0New Psyc Bldg 220 (36 Eagle Row) This class will be accepted in place of QTM 220 as a prerequisite.\u00a0\u00a0 \u2013 4 credit hours\n\u00a0\nNew course\nQTM 285 \u00a0- section 2 - Topics in Quantitative Science: Unlocking the Future with AI \u2013 Wednesdays 4pm-7pm\u00a0in ONLINE \u2013 3 credit hours\n\u00a0\nBest, \n\u00a0\nSadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: administration
    
    
    Example 6:
    
    "content": "SPECIAL ANNOUNCEMENT\n\u00a0\n\u00a0\nAnnouncement from summer programs: \n\u00a0\nNow is a good time to plan the remainder of your academic year. We are offering a\u00a0Maymester course\u00a0this summer. It is a three-week intensive course that may fulfill a GER or graduation requirement.\u00a0\n\u00a0\nSummer School\u00a0will be having an\u00a0Info Stop\u00a0to provide more information about Maymester course offerings. Be sure to stop by their table to learn more along with summer and semester options through\u00a0Education Abroad.\n\u00a0\nDates & deadlines\nWed, Jan 31: Maymester & Study Abroad Info Stop -- 11:00a-1:30p, ESC South Commons\nTue, Feb 13: Summer enrollment opens\nThur, Feb 15: Application deadline for most summer abroad programs\n\u00a0\nFor questions, please send an email to\u00a0Summer Programs\u00a0or\u00a0Education Abroad.\n\u00a0\nBest, \n\u00a0\n\u00a0\nSadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: administration
    
    Example 7:
    
    "content": "SPECIAL ANNOUNCEMENT\n\u00a0\nDr. Kevin McAlister(QTM Director of Research) and Dr. Jin Kim (QTM Director of Undergraduate Studies) will be conducting an advising session for all QTM Majors and Minors on Friday, January 19th at 12:30 PM \u2013 2:00PM at the Psychology Building in PAIS 290 Auditorium. This is a very important meeting for all QTM Students(QSS, AMS, PPA, & QSS Minors)regarding course requirements for your major or minor. Seniors, this is especially important for you to ensure you are taking the correct courses to complete your degree as add/drop/swap ends on January 30th.\n\u00a0\nThose of you who have questions regarding the capstone program, honors program, Internships, overlap courses, course substitutes, etc., please attend as well. \n\u00a0\nIf you have questions regarding this meeting, please contact me. Thanks. \n\u00a0\nBest, \u00a0\n\u00a0\nSadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: events
    
    Example 8:
    
    "content": "For QTM Undergraduates\n\nWednesday, \nNovember 22,\u00a0 2023Add/Drop/Swap starts \u2013 Monday, November 20th nOPPORTUNITY\u00a0\nSpring 2024 QTM Undergraduate Teaching & Mentorship Application\n\u00a0\nThe Spring 2024 QTM Undergraduate Teaching & Mentorship Application is now open. We are seeking TAs, Lab Assistants, and Graders for QTM 100, 110, 150, 151, 200, 210, 220, and 350. Please complete the application at https://forms.office.com/r/vR2YxeY4gp. You must be signed in with your Emory student credentials to access the application form. If you have questions about spring positions or the application form, please contact Lora McDonald (lora.mcdonald@emory.edu). \n\u00a0\nSee attached for more details \n\u00a0\nSubmitted by Lora McDonald\\nOpportunityTHE MARCUS AUTISM CENTER\nThe Marcus Autism Center, in conjunction with the Emory University School of Medicine and Children's Healthcare of Atlanta, is offering five fellowships: the Cohen Fellowship in Developmental Social Neuroscience, the Simons Fellowship in Computational Neuroscience, the Louise and Brett Samsky Fellowship in Educational Science and Practice, the Sally Provence Fellowship in Clinical Research, and the ACCESS Fellowship in Implementation Science. \nStudents who will receive a bachelor's degree by June 2024 will be eligible for the positions. The fellowships will commence in July 2024, and they are 2 years in duration. Students can find further details here. \nAttached, please find a brochure describing the fellowships. Please feel free to print the brochure and post it in your department. I ask that you let us know that you have received this e-mail and that you forward it, along with the associated brochure, to students in the Department of Quantitative Theory and Methods. \nThe Cohen Fellowship in Developmental Social Neuroscience will involve cutting-edge social neuroscience and/or neuroimaging research in infants, toddlers and adolescents. Fellows will work to further the understanding of autism through eye-tracking research, guiding a project from the point of data collection to publication of results. \nThe Simons Fellowship in Computational Neuroscience will involve integrating computational strategies with clinical research goals. Fellows will develop methods for the analysis of visual scanning and eye-tracking data, computational models of visual salience, and data visualization techniques, all with the aim of advancing the understanding of autism and efforts at early diagnosis. \nThe Louise and Brett Samsky Fellowship in Educational Science and Practice will involve research in educational innovations in autism. Fellows will learn about classroom-based interventions to increase social emotional engagement and inclusion, gaining experiences with observational research methods, practical experience through direct classroom responsibilities, cutting edge intervention research, and implementation science approaches. \nThe Sally Provence Fellowship in Clinical Research will select fellows for a two-year training in clinical assessment measures and research methodologies to better understand ASD and related disabilities. \nThe ACCESS Fellowship in Implementation Science will select fellows for a two-year training in research focused on community engagement participatory methods, translating evidence-based treatments for autism into community settings, as well as the processes and partnerships that support these efforts. \nThank you for your help! We look forward to hearing from you. \nSincerely, Marcus Predoctoral Fellowship Committee See attached flyer\n\nPrerequisites\nYou must complete ALL prerequisites listed in order to enroll in these courses.\nQTM 210\n\t\u2022\tEither QTM 120 or MATH 210 or MATH 211\nQTM 220\n\t\u2022\tQTM 110\n\t\u2022\tQTM 150\n\t\u2022\tQTM 210 or ECON 220 or MATH 361 with a plan to co-enroll in MATH 362\n\t\u2022\tMATH 210 or MATH 211\n\t\u2022\tMATH 221\nElective Prerequisites\n\t\u2022\tQTM 385 prerequisites will be listed in the course description!!!\n\u00a0\nEquivalents\nBelow are courses that act as equivalents to the respective QTM courses\nQTM 210:\n\t\u2022\tECON 220 and [MATH 210 or MATH 211]\n\t\u2022\tOR - MATH 362\nQTM 220:\n\t\u2022\tECON 320 plus QTM 110, QTM 150, MATH 221 and [MATH 210 or MATH 211]\n\u00a0\nFor General Academic Advising\nSchedule an appointment at\u00a0https://calendly.com/emoryqtm.\n\u00a0\nIf you have any questions regarding the above information, please do not hesitate to reach out.\n\u00a0\nBest,\nSadie Hannans\nQTM Program Coordinator\nShanna9@emory.edu\u00a0\u00a0 470-620-7981\n\n\n\u00a0\n\n\u00a0\n\u00a0\n\n\u00a0\u00a0\u00a0\n\u00a0\n\u00a0\u00a0\u00a0\u00a0\u00a0\n\n\u00a0\u00a0\u00a0\n\n\n\n\nPROFESSIONAL DEVELOPMENT\n\nQTM Preparation: A Pathway to Professionalism\n\u00a0\nNew Resources Available\n\n\u00a0\n\u00a0Getting Started with SQL\n\n\u00a0\nGit and GitHub\n\u00a0\n\n\u00a0\n\u00a0\n\u00a0\n\u00a0\n\nQTM DataCamp Access\nQTM Access to DataCamp is now back in business! Sign up here.\nConnect with Emory Alumni\n\nEmory Connects is a platform sponsored by Emory Alumni and Engagement and is a space for current students and alumni to connect and network. Log in to\u00a0Emory Connects\u00a0today to see what is available!\n\u00a0\n\nQTM DataCamp Access\n\nQTM Access to DataCamp is now back in business! Sign up here.\n\u00a0\n\u00a0\n\n\u00a0\nLooking for a spring or summer internship or full-time job?\n\nCheck out the attached document for tips on internship/job searches. Positions are still being posted on Handshake and many are still accepting applications.\u00a0Log on to Handshake\u00a0to check out some of the roles!\n\n\u00a0\u00a0\n\u00a0\n\u00a0\nSadie Hannans\nUndergraduate Program Coordinator\nDepartment of Quantitative Theory & Methods\nEmory University\nEmail: shanna9@emory.edu | 470-620-7981\n(she/her/hers)"
    
    correct keywords: careers


"""

In [41]:
def evaluate_email_classification(all_qtm_emails, output):
    """
    Evaluate email classification performance with comprehensive metrics
    for multi-label (first half) and single-label (second half) data.
    
    Parameters:
    - all_qtm_emails: List of email data with ground truth labels in 'category' field
    - output: List of model prediction data with 'predicted_classification' field
    
    Returns:
    - Dictionary containing all metrics for different segments of the dataset
    """
    # Get total length of dataset
    length = len(all_qtm_emails)
    first_half_end = 65  # Index where first half ends
    
    # Define metrics calculation functions
    def calculate_jaccard(predicted_keywords, true_keywords):
        pred_set = set(predicted_keywords)
        true_set = set(true_keywords)
        
        intersection = len(pred_set & true_set)
        union = len(pred_set | true_set)
        
        return intersection / union if union > 0 else 1.0  # If both sets empty, similarity is 1
    
    def calculate_precision(predicted_keywords, true_keywords):
        pred_set = set(predicted_keywords)
        true_set = set(true_keywords)
        
        intersection = len(pred_set & true_set)
        
        return intersection / len(pred_set) if len(pred_set) > 0 else 1.0  # If no predictions, precision is 1
    
    def calculate_recall(predicted_keywords, true_keywords):
        pred_set = set(predicted_keywords)
        true_set = set(true_keywords)
        
        intersection = len(pred_set & true_set)
        
        return intersection / len(true_set) if len(true_set) > 0 else 1.0  # If no true labels, recall is 1
    
    def calculate_f1(predicted_keywords, true_keywords):
        precision = calculate_precision(predicted_keywords, true_keywords)
        recall = calculate_recall(predicted_keywords, true_keywords)
        
        return 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    def calculate_accuracy(predicted_keywords, true_keywords):
        """For single-label classification, checks if the prediction contains the true label"""
        return 1.0 if set(true_keywords) & set(predicted_keywords) else 0.0
    
    # Calculate per-example metrics for different segments
    def calculate_segment_metrics(start_idx, end_idx):
        total_precision = 0
        total_recall = 0
        total_f1 = 0
        total_jaccard = 0
        total_accuracy = 0  # Only meaningful for single-label
        
        for i in range(start_idx, end_idx):
            predicted = output[i]["predicted_classification"]["relevant_keywords"]
            true = all_qtm_emails[i]["category"]
            
            total_precision += calculate_precision(predicted, true)
            total_recall += calculate_recall(predicted, true)
            total_f1 += calculate_f1(predicted, true)
            total_jaccard += calculate_jaccard(predicted, true)
            total_accuracy += calculate_accuracy(predicted, true)
        
        num_examples = end_idx - start_idx
        return {
            "precision": total_precision / num_examples if num_examples > 0 else 0,
            "recall": total_recall / num_examples if num_examples > 0 else 0,
            "f1": total_f1 / num_examples if num_examples > 0 else 0,
            "jaccard": total_jaccard / num_examples if num_examples > 0 else 0,
            "accuracy": total_accuracy / num_examples if num_examples > 0 else 0
        }
    
    # Calculate global metrics for second half (single-label)
    def calculate_global_metrics_second_half():
        total_tp, total_fp, total_fn = 0, 0, 0
        total_correct = 0
        
        for i in range(first_half_end, length):
            predicted = set(output[i]["predicted_classification"]["relevant_keywords"])
            true = set(all_qtm_emails[i]["category"])
            
            # Calculate TP, FP, FN for global metrics
            tp = len(predicted & true)
            fp = len(predicted - true)
            fn = len(true - predicted)
            
            total_tp += tp
            total_fp += fp
            total_fn += fn
            
            # For accuracy
            if tp > 0:
                total_correct += 1
        
        # Calculate global metrics
        precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
        recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        accuracy = total_correct / (length - first_half_end) if (length - first_half_end) > 0 else 0
        
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "accuracy": accuracy
        }
    
    # Calculate confusion matrix for second half (single-label)
    def calculate_confusion_matrix():
        # Get all unique categories
        all_categories = set()
        for email in all_qtm_emails:
            all_categories.update(email["category"])
        all_categories = sorted(list(all_categories))
        
        # Initialize confusion matrix
        confusion_matrix = {category: {pred: 0 for pred in all_categories} for category in all_categories}
        
        # Fill confusion matrix
        for i in range(first_half_end, length):
            true_category = all_qtm_emails[i]["category"][0]  # Assuming single label
            
            # Find predicted category
            predicted = output[i]["predicted_classification"]["relevant_keywords"]
            predicted_category = predicted[0] if predicted else None  # Take first prediction
            
            # Update confusion matrix
            if predicted_category in all_categories:
                confusion_matrix[true_category][predicted_category] += 1
        
        return confusion_matrix
    
    # Calculate all metrics
    first_half_metrics = calculate_segment_metrics(0, first_half_end)
    second_half_metrics = calculate_segment_metrics(first_half_end, length)
    whole_dataset_metrics = calculate_segment_metrics(0, length)
    second_half_global_metrics = calculate_global_metrics_second_half()
    confusion_matrix = calculate_confusion_matrix()
    
    # Compile results
    results = {
        "first_half": first_half_metrics,
        "second_half": {
            "per_example": second_half_metrics,
            "global": second_half_global_metrics
        },
        "whole_dataset": whole_dataset_metrics,
        "confusion_matrix": confusion_matrix
    }
    
    return results

# Example usage
def print_evaluation_report(all_qtm_emails, output):
    """
    Generate and print a comprehensive evaluation report
    """
    results = evaluate_email_classification(all_qtm_emails, output)
    
    print("=" * 80)
    print("EMAIL CLASSIFICATION EVALUATION REPORT")
    print("=" * 80)
    
    print("\nPER-EXAMPLE METRICS:")
    print("-" * 80)
    print(f"{'Metric':<15} {'First Half':<15} {'Second Half':<15} {'Whole Dataset':<15}")
    print("-" * 80)
    metrics = ["precision", "recall", "f1", "jaccard", "accuracy"]
    for metric in metrics:
        first_half_val = results["first_half"].get(metric, "N/A")
        second_half_val = results["second_half"]["per_example"].get(metric, "N/A")
        whole_val = results["whole_dataset"].get(metric, "N/A")
        
        if first_half_val != "N/A":
            first_half_val = f"{first_half_val:.2f}"
        if second_half_val != "N/A":
            second_half_val = f"{second_half_val:.2f}"
        if whole_val != "N/A":
            whole_val = f"{whole_val:.2f}"
        
        print(f"{metric.capitalize():<15} {first_half_val:<15} {second_half_val:<15} {whole_val:<15}")
    
    print("\nGLOBAL METRICS FOR SECOND HALF:")
    print("-" * 80)
    for metric, value in results["second_half"]["global"].items():
        print(f"{metric.capitalize():<15} {value:.2f}")
    
    print("\nCONFUSION MATRIX SUMMARY (TOP MISCLASSIFICATIONS):")
    print("-" * 80)
    # Find top confused category pairs
    top_confusions = []
    for true_cat, predictions in results["confusion_matrix"].items():
        for pred_cat, count in predictions.items():
            if true_cat != pred_cat and count > 0:
                top_confusions.append((true_cat, pred_cat, count))
    
    # Sort by count descending and print top 5
    top_confusions.sort(key=lambda x: x[2], reverse=True)
    for true_cat, pred_cat, count in top_confusions[:5]:
        print(f"True: {true_cat}, Predicted: {pred_cat}, Count: {count}")
    
    print("=" * 80)


with open('/Users/natehu/Desktop/QTM 329 Comp Ling/EmaiLLM/qtm_output_threeshot.json', 'r') as f:
    output = json.load(f)
# Example call
print_evaluation_report(all_qtm_emails, output)

EMAIL CLASSIFICATION EVALUATION REPORT

PER-EXAMPLE METRICS:
--------------------------------------------------------------------------------
Metric          First Half      Second Half     Whole Dataset  
--------------------------------------------------------------------------------
Precision       0.58            0.23            0.41           
Recall          0.62            0.49            0.55           
F1              0.58            0.30            0.44           
Jaccard         0.50            0.23            0.36           
Accuracy        0.72            0.49            0.61           

GLOBAL METRICS FOR SECOND HALF:
--------------------------------------------------------------------------------
Precision       0.22
Recall          0.49
F1              0.30
Accuracy        0.49

CONFUSION MATRIX SUMMARY (TOP MISCLASSIFICATIONS):
--------------------------------------------------------------------------------
True: administration, Predicted: academics, Count: 11
True: ev

zeroshot: pure zero-shot

threeshot: pure 3-shot

0shot : Zero-shot + keyword count constraint

5shot : 8 shot + keyword count constraint
