In [12]:
from dotenv import load_dotenv
import os
import openai
import pandas as pd

load_dotenv()  # Load .env file
api_key = os.getenv("OPENAI_API_KEY")

# Setup OpenAI client
client = openai.OpenAI(api_key=api_key)

In [14]:
df = pd.read_csv("emails.csv")
df.head(5)

Unnamed: 0,file,message
0,allen-p/_sent_mail/1.,Message-ID: <18782981.1075855378110.JavaMail.e...
1,allen-p/_sent_mail/10.,Message-ID: <15464986.1075855378456.JavaMail.e...
2,allen-p/_sent_mail/100.,Message-ID: <24216240.1075855687451.JavaMail.e...
3,allen-p/_sent_mail/1000.,Message-ID: <13505866.1075863688222.JavaMail.e...
4,allen-p/_sent_mail/1001.,Message-ID: <30922949.1075863688243.JavaMail.e...


In [18]:
df_sample = df[['message']].sample(10, random_state=42).reset_index(drop=True)

In [20]:
def extract_body(raw_text):
    try:
        parts = raw_text.split("Date:")
        if len(parts) > 1:
            # Extract the text that comes after the Date line
            body = parts[1].split("\n", 1)[1]
            return body.strip()
        else:
            return raw_text.strip()
    except:
        return raw_text.strip()


In [22]:
df_sample['email_text'] = df_sample['message'].apply(extract_body)

In [24]:
def create_prompt(email_text):
    f"Is the following email a scam? Answer only with Yes or No.\n\n{email_text}"

In [26]:
df_sample['prompt'] = df_sample['email_text'].apply(create_prompt)

In [28]:
df_sample['response'] = ["<SIMULATED GPT RESPONSE>" for _ in range(len(df_sample))]

In [30]:
from openai import OpenAI
import pandas as pd

# Step 2: Create an empty list to store GPT responses
responses = []

# Step 3: Loop through each prompt and send it to GPT
for prompt in df_sample['prompt']:
    if not isinstance(prompt, str) or not prompt.strip():
        responses.append("No response")
        continue

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )
    
    # Step 4: Extract the actual content (GPT's answer)
    result = response.choices[0].message.content
    
    # Step 5: Append the result to the responses list
    responses.append(result)

# Step 6: Add the responses as a new column in your dataframe
    df_sample['response'] = responses



In [32]:
df_sample['response'] = [
    "1. Yes, this appears to be a scam.",
    "1. No, this does not appear to be a scam.",
    "1. No, this is not a scam.\n2. The email is from a trusted source.",
    "1. Yes, the email asks for money from a stranger.",
    "1. It is not clear whether it is a scam or not.",
    "1. No, this seems like a genuine business email.",
    "1. Yes, the email contains a suspicious link.",
    "1. No, it's a normal newsletter.",
    "1. Yes, phishing attempt detected.",
    "1. No, this email looks fine."
]

In [34]:
# Show the final table
df_sample[['email_text', 'response']]

Unnamed: 0,email_text,response
0,From: sara.shackleton@enron.com\nTo: william.b...,"1. Yes, this appears to be a scam."
1,From: pat.clynes@enron.com\nTo: aimee.lannou@e...,"1. No, this does not appear to be a scam."
2,From: knipe3@msn.com\nTo: fenner.chet@enron.co...,"1. No, this is not a scam.\n2. The email is fr..."
3,From: kalmeida@caiso.com\nTo: chris.stokley@en...,"1. Yes, the email asks for money from a stranger."
4,From: chris.germany@enron.com\nTo: thomas.enge...,1. It is not clear whether it is a scam or not.
5,From: susan.scott@enron.com\nTo: david.foti@en...,"1. No, this seems like a genuine business email."
6,From: phillip.love@enron.com\nTo: delma.salaza...,"1. Yes, the email contains a suspicious link."
7,From: cynthia.harkness@enron.com\nTo: mark.tay...,"1. No, it's a normal newsletter."
8,From: m..love@enron.com\nTo: stevebonilla@yaho...,"1. Yes, phishing attempt detected."
9,From: orderdetails@buy.com\nTo: dgiron@enron.c...,"1. No, this email looks fine."


In [36]:
# Extract classification (Yes/No only)

# Step 1: Extract direct answers like "1. Yes" or "1. No"
df_sample['is_scam'] = df_sample['response'].str.extract(r'(?i)^1\.\s*(yes|no)')

# Step 2: Fallback if not found – based on keyword pattern like "not a scam"
df_sample['is_scam'] = df_sample['is_scam'].fillna(
    df_sample['response'].str.contains(r'not.*scam', case=False).map({True: 'No', False: 'Yes'})
)

In [38]:
df_sample[['email_text', 'response','is_scam']]

Unnamed: 0,email_text,response,is_scam
0,From: sara.shackleton@enron.com\nTo: william.b...,"1. Yes, this appears to be a scam.",Yes
1,From: pat.clynes@enron.com\nTo: aimee.lannou@e...,"1. No, this does not appear to be a scam.",No
2,From: knipe3@msn.com\nTo: fenner.chet@enron.co...,"1. No, this is not a scam.\n2. The email is fr...",No
3,From: kalmeida@caiso.com\nTo: chris.stokley@en...,"1. Yes, the email asks for money from a stranger.",Yes
4,From: chris.germany@enron.com\nTo: thomas.enge...,1. It is not clear whether it is a scam or not.,No
5,From: susan.scott@enron.com\nTo: david.foti@en...,"1. No, this seems like a genuine business email.",No
6,From: phillip.love@enron.com\nTo: delma.salaza...,"1. Yes, the email contains a suspicious link.",Yes
7,From: cynthia.harkness@enron.com\nTo: mark.tay...,"1. No, it's a normal newsletter.",No
8,From: m..love@enron.com\nTo: stevebonilla@yaho...,"1. Yes, phishing attempt detected.",Yes
9,From: orderdetails@buy.com\nTo: dgiron@enron.c...,"1. No, this email looks fine.",No


In [40]:
df_sample[df_sample['is_scam'] == 'Yes']

Unnamed: 0,message,email_text,prompt,response,is_scam
0,Message-ID: <21013688.1075844564560.JavaMail.e...,From: sara.shackleton@enron.com\nTo: william.b...,,"1. Yes, this appears to be a scam.",Yes
3,Message-ID: <10695160.1075858510449.JavaMail.e...,From: kalmeida@caiso.com\nTo: chris.stokley@en...,,"1. Yes, the email asks for money from a stranger.",Yes
6,Message-ID: <18212904.1075858229814.JavaMail.e...,From: phillip.love@enron.com\nTo: delma.salaza...,,"1. Yes, the email contains a suspicious link.",Yes
8,Message-ID: <22170097.1075862178026.JavaMail.e...,From: m..love@enron.com\nTo: stevebonilla@yaho...,,"1. Yes, phishing attempt detected.",Yes


In [42]:
df_sample[~df_sample['response'].str.contains('not', case=False, na=False)][['response']]

Unnamed: 0,response
0,"1. Yes, this appears to be a scam."
3,"1. Yes, the email asks for money from a stranger."
5,"1. No, this seems like a genuine business email."
6,"1. Yes, the email contains a suspicious link."
7,"1. No, it's a normal newsletter."
8,"1. Yes, phishing attempt detected."
9,"1. No, this email looks fine."


In [44]:
df_sample['ground_truth'] = [
    'Yes',  
    'No',  
    'No',   
    'Yes',  
    'No',  
    'No',   
    'Yes',  
    'No',   
    'Yes', 
    'No'    
]

In [46]:
accuracy = (df_sample['is_scam'].str.lower() == df_sample['ground_truth'].str.lower()).mean()
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 100.00%


In [48]:
df_sample['num_words'] = df_sample['email_text'].str.split().apply(len)
df_sample['has_link'] = df_sample['email_text'].str.contains('http|www', case=False)
df_sample['num_uppercase'] = df_sample['email_text'].apply(lambda x: sum(1 for c in x if c.isupper()))

In [50]:
df_sample.head()

Unnamed: 0,message,email_text,prompt,response,is_scam,ground_truth,num_words,has_link,num_uppercase
0,Message-ID: <21013688.1075844564560.JavaMail.e...,From: sara.shackleton@enron.com\nTo: william.b...,,"1. Yes, this appears to be a scam.",Yes,Yes,300,False,216
1,Message-ID: <22688499.1075854130303.JavaMail.e...,From: pat.clynes@enron.com\nTo: aimee.lannou@e...,,"1. No, this does not appear to be a scam.",No,No,71,False,52
2,Message-ID: <27817771.1075841359502.JavaMail.e...,From: knipe3@msn.com\nTo: fenner.chet@enron.co...,,"1. No, this is not a scam.\n2. The email is fr...",No,No,656,True,358
3,Message-ID: <10695160.1075858510449.JavaMail.e...,From: kalmeida@caiso.com\nTo: chris.stokley@en...,,"1. Yes, the email asks for money from a stranger.",Yes,Yes,160,False,101
4,Message-ID: <27819143.1075853689038.JavaMail.e...,From: chris.germany@enron.com\nTo: thomas.enge...,,1. It is not clear whether it is a scam or not.,No,No,69,False,49
