# Script for phishing email dataset processing and style label extraction

## Prerequisites setup

Put the downloaded dataset in the `data_phish/raw/` folder.

For the example, we use the [Phishing Email Dataset](https://figshare.com/articles/dataset/Curated_Dataset_-_Phishing_Email/24899952) from figshare.

In [70]:
# List the raw dataset files; the dot is escaped and the pattern anchored so
# only names that actually end in ".csv" are shown.
!ls ../../data_phish/raw/ | grep '\.csv$'

CEAS_08.csv
Enron.csv
Ling.csv
SpamAssasin.csv
TREC_05.csv
TREC_06.csv
TREC_07.csv


In [71]:
import csv
import os
import re
import sys
from pathlib import Path

import pandas as pd

In [72]:
# Large email bodies can exceed the csv module's per-field size limit, so
# raise it up front to avoid parse errors.
csv.field_size_limit(10**7)

# Resolve the directory two levels above the current working directory
# (presumably the project root) and make sure it is on sys.path.
root_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if root_path not in sys.path:
    sys.path.append(root_path)
print(root_path)

# Folder that holds the raw dataset CSV files.
RAW_DATA_DIRECTORY = Path(root_path, "data_phish", "raw")

/home/yuhao/workspace/data-sythesis-research


## Define label extraction function

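Here we use two label dimensions: (1) Style/Tone and (2) Purpose. Each email receives exactly one label per dimension: the first keyword rule that matches, or `other` when none does.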

In [73]:
def label_email(text: str) -> dict:
    """Assign keyword-based ``style_tone`` and ``purpose`` labels to an email.

    The text is lower-cased and tested against an ordered list of regular
    expressions for each dimension. The first pattern that matches decides
    the label; when nothing matches the label is ``"other"``.

    Args:
        text: The email text to label.

    Returns:
        A dict with keys ``"style_tone"`` and ``"purpose"``, each mapped to a
        one-element list holding the chosen label.
    """
    # Rules are ordered by priority: earlier entries win over later ones.
    style_rules = (
        ("urgent", r"\burgent\b|\bimmediately\b|asap|within \d+ hours"),
        ("formal", r"\bdear\b|sincerely|regards"),
        ("informal", r"\bhey\b|\bhi\b|\bthanks\b"),
        ("marketing", r"sale|discount|offer|promotion"),
    )
    purpose_rules = (
        ("account", r"password|verify|account|login"),
        ("advertisement", r"sale|offer|discount|buy now"),
        ("business", r"meeting|report|project|schedule"),
    )

    lowered = text.lower()

    def first_match(rules: tuple) -> str:
        """Return the name of the first rule whose pattern occurs in the text."""
        for name, pattern in rules:
            if re.search(pattern, lowered):
                return name
        return "other"

    return {
        "style_tone": [first_match(style_rules)],
        "purpose": [first_match(purpose_rules)],
    }

## Process the dataset and extract style labels

In [74]:
def process_dataset(dataset_file: Path) -> None:
    """Label every email in a CSV dataset and write the result beside it.

    The CSV must provide ``subject``, ``body`` and ``label`` columns. Subject
    and body are joined with a blank line in between (missing values become
    empty strings) to form ``text``, which is passed to ``label_email``. The
    ``text``, ``labels`` and ``phish`` (a copy of ``label``) columns are then
    written, one JSON record per line, to the same path with a ``.json``
    suffix.

    Args:
        dataset_file: Path of the CSV file to process.

    Raises:
        ValueError: If any of the required columns is absent.
    """
    frame = pd.read_csv(dataset_file, engine="python")

    required_columns = {"subject", "body", "label"}
    if not required_columns.issubset(frame.columns):
        raise ValueError(
            f"Dataset {dataset_file} must contain 'subject', 'body', and 'label' columns."
        )

    subject = frame["subject"].fillna("")
    body = frame["body"].fillna("")
    frame["text"] = subject + "\n\n" + body
    frame["labels"] = frame["text"].apply(label_email)
    frame["phish"] = frame["label"]

    output_file = dataset_file.with_suffix(".json")
    selected = frame[["text", "labels", "phish"]]
    # lines=True writes one record per line (JSON Lines), despite the suffix.
    selected.to_json(output_file, orient="records", lines=True)
    print(f"Processed {dataset_file}, saved to {output_file}")


# Collect every CSV file in the raw data folder.
raw_data_dir = RAW_DATA_DIRECTORY.absolute()
datasets = [path for path in raw_data_dir.iterdir() if path.suffix == ".csv"]

# Process only the datasets whose JSON output does not exist yet.
for dataset in datasets:
    json_file = dataset.with_suffix(".json")
    if not json_file.exists():
        process_dataset(dataset)
    else:
        print(f"Skipped {dataset}, {json_file} already exists.")

Skipped /home/yuhao/workspace/data-sythesis-research/data_phish/raw/Enron.csv, /home/yuhao/workspace/data-sythesis-research/data_phish/raw/Enron.json already exists.
Skipped /home/yuhao/workspace/data-sythesis-research/data_phish/raw/SpamAssasin.csv, /home/yuhao/workspace/data-sythesis-research/data_phish/raw/SpamAssasin.json already exists.
Skipped /home/yuhao/workspace/data-sythesis-research/data_phish/raw/TREC_07.csv, /home/yuhao/workspace/data-sythesis-research/data_phish/raw/TREC_07.json already exists.
Skipped /home/yuhao/workspace/data-sythesis-research/data_phish/raw/Ling.csv, /home/yuhao/workspace/data-sythesis-research/data_phish/raw/Ling.json already exists.
Skipped /home/yuhao/workspace/data-sythesis-research/data_phish/raw/TREC_05.csv, /home/yuhao/workspace/data-sythesis-research/data_phish/raw/TREC_05.json already exists.
Skipped /home/yuhao/workspace/data-sythesis-research/data_phish/raw/TREC_06.csv, /home/yuhao/workspace/data-sythesis-research/data_phish/raw/TREC_06.jso

## Examine the processed dataset

In [75]:
import json


# Print one summary line per processed dataset: record count, phishing and
# benign counts, the phishing ratio, and how many records received a
# style/tone label other than "other".
for dataset_file in datasets:
    json_file = dataset_file.with_suffix(".json")
    phish_count = 0
    benign_count = 0
    total = 0
    style_count = 0
    with open(json_file, "r", encoding="utf-8") as f:
        for line in f:
            # Skip blank lines so they are not fed to json.loads.
            if not line.strip():
                continue
            entry = json.loads(line)
            total += 1
            if entry["phish"]:
                phish_count += 1
            else:
                benign_count += 1
            style_tone = entry.get("labels", {}).get("style_tone", [])
            if style_tone and any(s != "other" for s in style_tone):
                style_count += 1
    # Guard both ratios so an empty file reports 0% instead of raising
    # ZeroDivisionError.
    phish_ratio = phish_count / total if total else 0.0
    style_ratio = style_count / total if total else 0.0
    print(
        f"{json_file.name:<20} total={total:<6} phish={phish_count:<6} benign={benign_count:<6} phish_ratio={phish_ratio:>6.2%} style={style_count:<6} style_ratio={style_ratio:>6.2%}"
    )

Enron.json           total=29767  phish=13976  benign=15791  phish_ratio=46.95% style=15290  style_ratio=51.37%
SpamAssasin.json     total=5809   phish=1718   benign=4091   phish_ratio=29.57% style=2029   style_ratio=34.93%
TREC_07.json         total=53757  phish=29399  benign=24358  phish_ratio=54.69% style=25066  style_ratio=46.63%
Ling.json            total=2859   phish=458    benign=2401   phish_ratio=16.02% style=1072   style_ratio=37.50%
TREC_05.json         total=55990  phish=22946  benign=33044  phish_ratio=40.98% style=26727  style_ratio=47.74%
TREC_06.json         total=16457  phish=3989   benign=12468  phish_ratio=24.24% style=8235   style_ratio=50.04%
CEAS_08.json         total=39154  phish=21842  benign=17312  phish_ratio=55.78% style=12449  style_ratio=31.79%


In [76]:
import random


# Show up to two random benign and two random phishing records per dataset.
for dataset_file in datasets:
    json_file = dataset_file.with_suffix(".json")
    with open(json_file, "r", encoding="utf-8") as f:
        entries = list(map(json.loads, f))

    # Split the records by the truthiness of their "phish" flag.
    benign_entries = []
    phish_entries = []
    for record in entries:
        bucket = phish_entries if record["phish"] else benign_entries
        bucket.append(record)

    # Benign is sampled before phish, so the random draws happen in a fixed order.
    benign_samples = random.sample(benign_entries, min(2, len(benign_entries)))
    phish_samples = random.sample(phish_entries, min(2, len(phish_entries)))

    print(f"\nDataset: {json_file.name}")
    sections = (
        ("Benign samples:", benign_samples),
        ("Phish samples:", phish_samples),
    )
    for heading, picked in sections:
        print(heading)
        for sample in picked:
            print(json.dumps(sample, ensure_ascii=False, indent=2))


Dataset: Enron.json
Benign samples:
{
  "text": "research programs and analytics\n\nunder the terms of the agreements with ubs , all of the research software programs are being licensed to the new trading company and are listed on the schedule of proprietary software . in addition , under the terms of the license , enron must provide the new trading company with copies of the source code and object code as well as all associated documentation for these applications and all databases and other data relating to the gas and power trading business .\nplease call if you have any questions",
  "labels": {
    "style_tone": [
      "other"
    ],
    "purpose": [
      "business"
    ]
  },
  "phish": 0
}
{
  "text": "start date : 1 / 30 / 02 ; hourahead hour : 10 ;\n\nstart date : 1 / 30 / 02 ; hourahead hour : 10 ; no ancillary schedules awarded . no variances detected .\nlog messages :\nparsing file - - > > o : \\\\ portland \\\\ westdesk \\\\ california scheduling \\\\ iso final schedule