# Text Classification with GPT-4o
## ABB #1 - Session 3

Code authored by: Shaw Talebi

### imports

In [1]:
import pandas as pd

from openai import OpenAI
from top_secret import my_sk

In [2]:
# set up the OpenAI API client that generate_label() sends its requests through;
# the secret key is imported from the local top_secret module instead of being written in the notebook
client = OpenAI(api_key=my_sk)

### functions

In [3]:
def manual_feature_engineering(df):
    """
        Generate a suite of manually defined features.

        The features are added to a copy of `df`; the DataFrame passed in is
        left untouched, so re-running the call on the same input is safe.

        Parameters
        ----------
        df : pd.DataFrame
            Emails with string-valued "body", "from" and "label" columns.

        Returns
        -------
        pd.DataFrame
            Copy of `df` with these columns appended, in this order:
            body_length, contains_lol, contains_omg, contains_attached,
            contains_attachment, contains_order_confirmation,
            contains_payment_summary, sender_has_common_domain, is_personal.
    """
    # work on a copy so the caller's frame is never mutated
    df = df.copy()

    # length of the body
    df["body_length"] = df["body"].apply(len)

    # lower-case every body once and reuse it for all keyword checks
    body_lower = df["body"].apply(lambda x: x.lower())

    # case-insensitive substring flags: output column -> phrase looked for in the body
    # (dict order is the column order of the result)
    body_keywords = {
        "contains_lol": "lol",
        "contains_omg": "omg",
        "contains_attached": "attached",
        "contains_attachment": "attachment",
        "contains_order_confirmation": "order confirmation",
        "contains_payment_summary": "payment summary",
    }
    for column, phrase in body_keywords.items():
        # bind phrase as a default so each lambda keeps its own value
        df[column] = body_lower.apply(lambda text, phrase=phrase: phrase in text)

    # sender is common person domain (gmail, yahoo, hotmail);
    # this is a substring match anywhere in the "from" value, not a parsed domain
    common_domains = ("gmail", "yahoo", "hotmail")
    df["sender_has_common_domain"] = df["from"].apply(
        lambda x: any(domain in x.lower() for domain in common_domains)
    )

    # is personal email
    df["is_personal"] = df["label"] == "personal"

    return df

In [4]:
def generate_label(prompt_template, subject, sender, body):
    """
        Generate a label for an email based on subject, sender, and body.

        The prompt text comes entirely from `prompt_template`, so the same
        function serves both the 0-shot and the few-shot templates.

        Parameters
        ----------
        prompt_template : callable
            Called as prompt_template(subject, sender, body); must return the
            user prompt as a string.
        subject, sender, body : str
            Fields of the email to classify.

        Returns
        -------
        str
            The model's reply with surrounding whitespace removed, or "" when
            the response carries no text content.
    """
    prompt = prompt_template(subject, sender, body)

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Email Classifer"},
            {"role": "user", "content": prompt}
        ],
        temperature=0.25,
        max_completion_tokens=25,
    )

    # the reply's content may be None; callers test `"1" in label`, which would
    # raise TypeError on None, so always hand back a string
    content = response.choices[0].message.content
    return content.strip() if content else ""

### Load Data

In [5]:
# load the raw emails and cast every column to str; the feature functions below
# call len() and .lower() on the "body" and "from" values
# NOTE(review): astype(str) presumably turns missing cells into the literal text "nan" - confirm the CSV has none
df = pd.read_csv("data/emails.csv").astype(str)

### Feature Engineering (Manual)

In [6]:
# add the hand-crafted feature columns, then preview the first rows
df = manual_feature_engineering(df)
df.head()

Unnamed: 0,subject,from,body,label,body_length,contains_lol,contains_omg,contains_attached,contains_attachment,contains_order_confirmation,contains_payment_summary,sender_has_common_domain,is_personal
0,Thank you Shawhin for your RSVP,Evite <info@mailva.evite.com>,Thank you for your RSVP\n\n\nYou replied Yes f...,personal,1830,False,False,False,False,False,False,False,True
1,Find Date for Knocking at Door,ifyahuna@gmail.com,So you're gonna talk to your dad about this ri...,personal,1522,False,False,False,False,False,False,True,True
2,Thank you Shawhin for your RSVP,Evite <info@mailva.evite.com>,Thank you for your RSVP\n\n\nYou replied Yes f...,personal,1838,False,False,False,False,False,False,False,True
3,"Folder shared with you: ""Knocking at The Door""","""Ifeoma Ahuna (via Google Drive)"" <drive-share...",I've shared an item with you:\r\n\r\nKnocking ...,personal,251,False,False,False,True,False,False,False,True
4,The Colony Shoreline Trail 5K and 15K Registra...,RunSignup <info+auto@runsignup.com>,[1]The Colony Shoreline Trail 5K and 15K\r\n\r...,personal,3170,False,False,False,False,False,False,False,True


### Text Classification with GPT-4o

#### 0-shot

In [7]:
# prompt
def prompt_template(subject, sender, body):
    """
        Build the 0-shot classification prompt for one email.

        The three arguments are interpolated into the "Input Email" section;
        the rest of the text is fixed instructions asking for 1 (personal)
        or 0 (not personal).
    """
    return f"""You are an intelligent assistant that classifies emails based on whether they are personal or not. \
Given an email's subject, sender, and body, determine if the email is personal (indicated by 1) or not personal (indicated by 0). A personal email typically \
includes messages from friends, family, or individuals addressing personal topics. Non-personal emails include promotional content, work-related messages, \
newsletters, or automated notifications.

Input Email:
Subject: {subject}
Sender: {sender}
Body: {body}

Instructions:
Carefully analyze the subject, sender, and body to understand the context and tone of the email.
Return:
1 if the email is personal.
0 if the email is not personal.

Output: [Your classification: 1 or 0]
"""

In [8]:
%%time
# intialize list to store labels
label_0shot_list = []

# generate labels for each row
for index, row in df.iterrows():
    label_0shot_list.append(generate_label(prompt_template, row['subject'], row['from'], row['body']))

CPU times: user 3.56 s, sys: 176 ms, total: 3.74 s
Wall time: 2min 38s


In [9]:
# inspect the raw replies: most are a bare '0' or '1', but a few come back wrapped in
# extra text (e.g. '[Your classification: 0]', 'Output: 1')
print(label_0shot_list)

['0', '1', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '0', '0', '0', '0', '1', '0', '0', '1', '1', '0', '0', '0', '0', '1', '0', '0', '0', '0', '[Your classification: 0]', '1', '0', '0', '0', '[Your classification: 1]', '1', '1', '0', '0', '0', '0', '1', '1', '0', '0', '0', '0', '0', '0', '1', 'Output: 1', '1', '0', '0', '1', '1', '1', '1', '1', '0', '0', '0', '1', '0', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', 

In [10]:
# turn each raw reply into a boolean prediction: True when the reply text contains "1"
df['label_0shot'] = ["1" in reply for reply in label_0shot_list]
# mark the rows where the prediction agrees with the ground-truth is_personal flag
df['correct_0shot'] = df['is_personal'].eq(df['label_0shot'])

In [11]:
# compare ground truth to 0-shot label:
# fraction of rows whose 0-shot prediction equals is_personal
print(df['correct_0shot'].sum()/len(df))

0.63


#### few-shot

In [12]:
# prompt
def prompt_template_fewshot(subject, sender, body):
    """
        Build the few-shot classification prompt for one email.

        Three worked examples are taken from the module-level `df` (rows
        labelled 0, 1 and 150) each time the function is called, followed by
        the email to classify.

        NOTE(review): rows 0, 1 and 150 are also among the emails that get
        labelled with this prompt, so those three see their own answer.
    """
    return f"""You are an intelligent assistant that classifies emails as personal or not personal. Given an email's subject, sender, and body, determine if the email is personal (indicated by 1) or not personal (indicated by 0). A personal email typically includes messages from friends, family, or individuals addressing personal topics. Non-personal emails include promotional content, work-related messages, newsletters, or automated notifications.

##Instructions:
Carefully analyze the subject, sender, and body to understand the context and tone of the email.
Return:
1 if the email is personal.
0 if the email is not personal.

##Examples:
**Example 1:**

Subject: {df['subject'][0]}
Sender: {df['from'][0]}
Body: {df['body'][0]}
Output: {int(df['is_personal'][0])}

**Example 2:**

Subject: {df['subject'][1]}
Sender: {df['from'][1]}
Body: {df['body'][1]}
Output: {int(df['is_personal'][1])}

**Example 3:**

Subject: {df['subject'][150]}
Sender: {df['from'][150]}
Body: {df['body'][150]}
Output: {int(df['is_personal'][150])}

**Input Email:**

Subject: {subject}
Sender: {sender}
Body: {body}
Output: [Your classification: 1 or 0]
"""

In [13]:
%%time
# intialize list to store labels
label_fewshot_list = []

# generate labels for each row
for index, row in df.iterrows():
    label_fewshot_list.append(generate_label(prompt_template_fewshot, row['subject'], row['from'], row['body']))

CPU times: user 4.87 s, sys: 251 ms, total: 5.12 s
Wall time: 4min 21s


In [14]:
# inspect the raw few-shot replies before converting them to booleans
print(label_fewshot_list)

['0', '1', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '1', '1', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '1', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '0', '1', '1', '1', '1', '1', '1', '1', '1', '0', '0', '1', '1', '1', '1', '0', '1', '1', '1', '1', '0', '0', '0', '1', '0', '0', '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '1', '1', '1', '0', '0', '1', '1', '1', '1', '1', '1', '0', '1', '1', '1', '0', '1', '1', '1', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '1', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',

In [15]:
# turn each raw reply into a boolean prediction: True when the reply text contains "1"
df['label_fewshot'] = ["1" in reply for reply in label_fewshot_list]
# mark the rows where the prediction agrees with the ground-truth is_personal flag
df['correct_fewshot'] = df['is_personal'].eq(df['label_fewshot'])

In [16]:
# compare ground truth to few-shot label:
# fraction of rows whose few-shot prediction equals is_personal
print(df['correct_fewshot'].sum()/len(df))

0.7433333333333333


In [17]:
# save data to file
# keep everything from column position 4 onward; going by the df.head() output earlier in
# the notebook, that drops the unnamed index column, subject, from and body, and keeps
# label, the engineered features and the 0-shot / few-shot columns added since
df_transformed = df.iloc[:, 4:]
# index=False: do not write the row index as an extra column
df_transformed.to_csv("data/transformed_data.csv", index=False)

**Bonus:** train a classifier using noisy labels from GPT-4o-mini using [Example 2](https://github.com/ShawhinT/AI-Builders-Bootcamp-1/blob/main/session-2/example_2-email_classifier.ipynb) from Session 2