# Script for phishing email dataset processing and style label extraction

## Prerequisites setup

Put the downloaded dataset CSV files in the `data_phish/raw/` folder.

For this example, we use the [Phishing Email Dataset](https://figshare.com/articles/dataset/Curated_Dataset_-_Phishing_Email/24899952) from Figshare.

In [1]:
!ls ../../data_phish/raw/ | grep .csv

CEAS_08.csv
Enron.csv
Ling.csv
SpamAssasin.csv
TREC_05.csv
TREC_06.csv
TREC_07.csv


In [None]:
import csv
import json
import os
import random
import re
import sys
from pathlib import Path

import pandas as pd

In [10]:
# Notebook-wide configuration: CSV limits, import path, and data locations.

# Large email bodies can exceed the csv module's default field size limit.
csv.field_size_limit(10**7)

# The project root is two directory levels above the current working
# directory; append it to sys.path once so repeated runs do not duplicate it.
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if root_path not in sys.path:
    sys.path.append(root_path)
print(root_path)

# Input CSV files and the directory holding the extracted output.
RAW_DATA_DIRECTORY = Path(root_path, "data_phish", "raw")
EXTRACTED_DATA_DIRECTORY = Path(root_path, "data_phish", "jsonl")

# Set to True to process all datasets.
PROCESS_FLAG = False

/home/yuhao/workspace/data-sythesis-research


## Define label extraction function

Here we select two dimensions of style labels: (1) Style/Tone, (2) Purpose.

In [5]:
def label_email(text: str) -> dict:
    """Assign heuristic ``style_tone`` and ``purpose`` labels to an email.

    The text is lowercased before matching. Within each dimension the rules
    are tried in order and the first matching rule wins, so each returned
    list holds exactly one label; ``"other"`` is used when no rule matches.

    Args:
        text: Email text to classify.

    Returns:
        A dict with the keys ``"style_tone"`` and ``"purpose"``, each mapped
        to a single-element list containing the chosen label.
    """
    # Ordered (label, pattern) pairs; earlier rules take priority.
    style_tone_rules = (
        ("urgent", r"\burgent\b|\bimmediately\b|asap|within \d+ hours"),
        ("formal", r"\bdear\b|sincerely|regards"),
        ("informal", r"\bhey\b|\bhi\b|\bthanks\b"),
        ("marketing", r"sale|discount|offer|promotion"),
    )
    purpose_rules = (
        ("account", r"password|verify|account|login"),
        ("advertisement", r"sale|offer|discount|buy now"),
        ("business", r"meeting|report|project|schedule"),
    )

    lowered = text.lower()

    def first_match(rules) -> str:
        """Return the label of the first rule found in the text, else "other"."""
        return next(
            (label for label, pattern in rules if re.search(pattern, lowered)),
            "other",
        )

    return {
        "style_tone": [first_match(style_tone_rules)],
        "purpose": [first_match(purpose_rules)],
    }

## Process the dataset and extract style labels

In [6]:
def process_dataset(dataset_file: Path) -> None:
    """Label every email in a raw CSV and write the result next to it.

    The subject and body columns are joined (missing values become empty
    strings) into a single ``text`` field, which is passed to
    ``label_email``. One JSON record per line is written to a file with the
    same name as ``dataset_file`` but a ``.json`` suffix, holding the fields
    ``id`` (the row index), ``text``, ``labels`` and ``phish`` (a copy of
    the ``label`` column).

    Args:
        dataset_file: Path to a CSV file with ``subject``, ``body`` and
            ``label`` columns.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    frame = pd.read_csv(dataset_file, engine="python")

    required_columns = ("subject", "body", "label")
    if not all(name in frame.columns for name in required_columns):
        raise ValueError(
            f"Dataset {dataset_file} must contain 'subject', 'body', and 'label' columns."
        )

    subject_text = frame["subject"].fillna("")
    body_text = frame["body"].fillna("")

    frame["phish"] = frame["label"]
    frame["text"] = subject_text + "\n\n" + body_text
    frame["labels"] = frame["text"].apply(label_email)
    frame["id"] = frame.index

    # Keep only the output fields, in a fixed order.
    records = frame[["id", "text", "labels", "phish"]]
    target = dataset_file.with_suffix(".json")
    records.to_json(target, orient="records", lines=True)
    print(f"Processed {dataset_file}, saved to {target}")


# Collect every raw CSV file, then (only when PROCESS_FLAG is set) convert
# each one that does not already have a .json file alongside it.
raw_data_dir = RAW_DATA_DIRECTORY.absolute()
datasets = [path for path in raw_data_dir.iterdir() if path.suffix == ".csv"]
if not PROCESS_FLAG:
    print("PROCESS_FLAG is set to False. No datasets were processed.")
else:
    for dataset in datasets:
        json_file = dataset.with_suffix(".json")
        if json_file.exists():
            # Existing output is never overwritten.
            print(f"Skipped {dataset}, {json_file} already exists.")
        else:
            process_dataset(dataset)

PROCESS_FLAG is set to False. No datasets were processed.


## Examine the processed dataset

In [None]:
# Print one summary line per processed file: sample count, phish/benign
# split, and how many samples got a style_tone label other than "other".
for dataset_file in datasets:
    json_file = dataset_file.with_suffix(".json")
    phish_count = 0
    benign_count = 0
    total = 0
    style_count = 0
    with open(json_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            total += 1
            if entry["phish"]:
                phish_count += 1
            else:
                benign_count += 1
            style_tone = entry.get("labels", {}).get("style_tone", [])
            if style_tone and any(s != "other" for s in style_tone):
                style_count += 1
    # Guard both ratios so an empty file reports 0.00% instead of raising
    # ZeroDivisionError.
    phish_ratio = phish_count / total if total else 0
    style_ratio = style_count / total if total else 0
    print(
        f"{json_file.name:<20} total={total:<6} phish={phish_count:<6} benign={benign_count:<6} phish_ratio={phish_ratio:>6.2%} style={style_count:<6} style_ratio={style_ratio:>6.2%}"
    )

Enron.json           total=29767  phish=13976  benign=15791  phish_ratio=46.95% style=15290  style_ratio=51.37%
SpamAssasin.json     total=5809   phish=1718   benign=4091   phish_ratio=29.57% style=2029   style_ratio=34.93%
TREC_07.json         total=53757  phish=29399  benign=24358  phish_ratio=54.69% style=25066  style_ratio=46.63%
Ling.json            total=2859   phish=458    benign=2401   phish_ratio=16.02% style=1072   style_ratio=37.50%
TREC_05.json         total=55990  phish=22946  benign=33044  phish_ratio=40.98% style=26727  style_ratio=47.74%
TREC_06.json         total=16457  phish=3989   benign=12468  phish_ratio=24.24% style=8235   style_ratio=50.04%
CEAS_08.json         total=39154  phish=21842  benign=17312  phish_ratio=55.78% style=12449  style_ratio=31.79%


In [None]:
# Print up to two random benign and two random phish records per dataset.
for dataset_file in datasets:
    json_file = dataset_file.with_suffix(".json")
    with open(json_file, "r", encoding="utf-8") as f:
        entries = [json.loads(line) for line in f]

    # Split the records by their phish flag in a single pass.
    benign_entries = []
    phish_entries = []
    for entry in entries:
        (phish_entries if entry["phish"] else benign_entries).append(entry)

    # Benign is sampled before phish.
    benign_samples = random.sample(benign_entries, min(2, len(benign_entries)))
    phish_samples = random.sample(phish_entries, min(2, len(phish_entries)))

    print(f"\nDataset: {json_file.name}")
    for heading, samples in (
        ("Benign samples:", benign_samples),
        ("Phish samples:", phish_samples),
    ):
        print(heading)
        for sample in samples:
            print(json.dumps(sample, ensure_ascii=False, indent=2))


Dataset: Enron.json
Benign samples:
{
  "id": 8922,
  "text": "invites for australian energy risk 2000 july 17 - 18\n\ndear lucie ,\nwhen i agreed to speak at the above conference , it was agreed that enron\ncould bring another staff member to attend gratis . however , i noticed that\nenron is actually providing two speakers - dr vince kamainski and myself . it\nwould be appreciated if we could instead of sending two staff members to the\nsame seminar that enron sends one staff member to the aust energy seminar and\none staff member to the risk 2000 - sydney seminar in august 22 - 23 . the\nlength & pricing are similar for both the seminars .\nupon your reply , i will supply the names of the staff members , there is\nstrong internal competition to go .\nthank you , raymond\n715 pm 4 july\n\" lucie deathridge \" on 05 / 25 / 2000 09 : 30 : 25 am\nplease respond to \" lucie deathridge \"\nto :\ncc :\nsubject : australian energy risk 2000\nthank you for agreeing to speak at the australia

## Examine the extracted dataset

In [None]:
# Summarize the extracted datasets: per-file and overall benign/phish counts,
# followed by a couple of random records from one randomly chosen file.
if not EXTRACTED_DATA_DIRECTORY.exists():
    print(f"Extracted data directory {EXTRACTED_DATA_DIRECTORY} does not exist. Please make sure to run the data extraction step in LLM first.")
    sys.exit(1)

datasets = [f for f in EXTRACTED_DATA_DIRECTORY.iterdir() if f.suffix == ".json"]
num_datasets = len(datasets)
print(f"Total datasets extracted: {num_datasets}")

total_benign = 0
total_phish = 0

for dataset_file in datasets:
    benign_count = 0
    phish_count = 0
    with open(dataset_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            if entry["phish"] == 1:
                phish_count += 1
            elif entry["phish"] == 0:
                benign_count += 1
            else:
                # Anything other than 0/1 is a data error; the previous
                # `!= 1` test made this branch unreachable and counted such
                # rows as benign.
                raise ValueError(
                    f"Invalid label {entry['phish']} in dataset {dataset_file}"
                )
    total_benign += benign_count
    total_phish += phish_count
    # Guard the ratio so an empty file reports 0.00% instead of raising.
    dataset_total = benign_count + phish_count
    dataset_ratio = phish_count / dataset_total if dataset_total else 0
    print(
        f"{dataset_file.name:<30} benign={benign_count:<6} phish={phish_count:<6} phish_ratio={dataset_ratio:>6.2%}"
    )

total_samples = total_benign + total_phish
overall_ratio = total_phish / total_samples if total_samples else 0
print(
    f"\nOverall benign={total_benign:<6} phish={total_phish:<6} phish_ratio={overall_ratio:>6.2%} total_samples={total_samples:<6}"
)

if datasets:
    dataset = random.choice(datasets)
    with open(dataset, "r", encoding="utf-8") as f:
        entries = [json.loads(line) for line in f]
    sample_entries = random.sample(entries, min(2, len(entries)))
    print(f"\nRandom samples from dataset: {dataset.name}")
    for sample in sample_entries:
        print(json.dumps(sample, ensure_ascii=False, indent=2))
else:
    # With no files there is nothing to choose from.
    print("No extracted datasets found; nothing to sample.")


Total datasets extracted: 7
TREC_06_with_captions.json     benign=12468  phish=3989   phish_ratio=24.24%
TREC_05_with_captions.json     benign=33044  phish=22946  phish_ratio=40.98%
CEAS_08_with_captions.json     benign=17312  phish=21842  phish_ratio=55.78%
Enron_with_captions.json       benign=15791  phish=13976  phish_ratio=46.95%
Ling_with_captions.json        benign=2401   phish=458    phish_ratio=16.02%
SpamAssasin_with_captions.json benign=4091   phish=1718   phish_ratio=29.57%
TREC_07_with_captions.json     benign=24358  phish=29399  phish_ratio=54.69%

Overall benign=109465 phish=94328  phish_ratio=46.29% total_samples=203793

Random samples from dataset: SpamAssasin_with_captions.json
{
  "id": 1564,
  "text": "Re: [Razor-users] Problem with Razor 2.14 and Spamassassin 2.41\n\nThis is due to insufficient write privileges to the \"razor-agent.log\"  file. A quick work-around is to do a \"chmod go+rx\" on that file (of  course, it's better to restrict the access as much as poss