In [None]:
# === 01_data_preprocessing.ipynb ===

# Imports
import pandas as pd
import numpy as np
import re
import os
import spacy
from tqdm import tqdm

# Hook tqdm into pandas so `progress_apply` is available on Series/DataFrames
tqdm.pandas()

# Read the raw corpus, keep only the two columns used downstream, and drop
# any row where either of them is missing
df = pd.read_csv("../data/raw/emails.csv")[["file", "message"]].dropna()
print(f"📥 Loaded {len(df)} emails.")

# Extract main email body (remove headers & long quotes)
def extract_body_from_message(message):
    """Return the top-level body text of a raw email message.

    The text after the first blank line is treated as the body (everything
    before it is assumed to be the header block), whitespace runs are
    collapsed to single spaces, and the body is cut at the first marker of a
    quoted reply or forward.

    Args:
        message: Raw message text. Any non-string value (e.g. a NaN cell)
            yields an empty string instead of raising.

    Returns:
        The cleaned body with surrounding whitespace stripped, or "" when
        ``message`` is not a string.
    """
    # Explicit type guard instead of a bare ``except:``: the regex calls below
    # cannot fail on a str, and a catch-all would also swallow
    # KeyboardInterrupt/MemoryError during a long ``progress_apply`` run.
    if not isinstance(message, str):
        return ""

    # Headers end at the first blank line; if there is none, keep everything.
    body = re.split(r'\n\s*\n', message, maxsplit=1)[-1]
    # Collapse all whitespace (including newlines) so the body is one line.
    body = re.sub(r'\s+', ' ', body)
    # Keep only the text before the first reply/forward marker.
    # NOTE(review): forward markers written with dashes ("---- Forwarded by")
    # are not matched by `_+Forwarded by` -- confirm whether they should be.
    body = re.split(r'(-----Original Message-----|_+Forwarded by|From:.*@)', body)[0]
    return body.strip()

df["clean_body"] = df["message"].progress_apply(extract_body_from_message)

# Keep only bodies whose length is strictly between 100 and 100000 characters:
# anything shorter is too small to be useful, anything longer risks a spaCy crash
body_lengths = df["clean_body"].str.len()
df = df[(body_lengths > 100) & (body_lengths < 100000)]

# (Removed sampling step to use full dataset)

# Heuristic intent labeling using keywords
INTENT_KEYWORDS = {
    "Meeting Request": ["schedule", "meeting", "calendar", "call", "appointment"],
    "Job Inquiry": ["job", "resume", "position", "career", "apply"],
    "Finance": ["invoice", "payment", "amount", "fund", "salary", "finance"],
    "Legal": ["contract", "agreement", "terms", "clause", "lawyer"],
    "Appreciation": ["thank", "thanks", "grateful", "appreciate", "gratitude"],
    "Complaint": ["issue", "problem", "complaint", "error", "concern"],
    "Technical Support": ["bug", "support", "crash", "error", "fix", "install"],
    "Data Request": ["send", "forward", "email", "attach", "request"],
    "Greeting": ["hello", "hi", "greetings", "good morning"],
    "Farewell": ["regards", "bye", "sincerely", "take care"],
    "Sales Inquiry": ["quote", "pricing", "discount", "offer", "deal"],
    "Project Update": ["progress", "update", "status", "report"],
    "Reminder": ["remind", "deadline", "follow up", "due"],
    "Event Planning": ["venue", "event", "conference", "webinar", "party"],
    "Personal": ["family", "friend", "wedding", "vacation", "holiday"]
}

# One compiled alternation per label, in the same priority order as
# INTENT_KEYWORDS. Each keyword is anchored with `\b` at a word START only, so
# stems still match their inflections ("attach" -> "attachment") while hits in
# the middle of unrelated words ("call" in "basically", "fix" in "prefix",
# "hi" in "this") are rejected.
# Built once at definition time: edits to INTENT_KEYWORDS made afterwards are
# not picked up unless this list is rebuilt.
_INTENT_PATTERNS = [
    (label, re.compile(r"\b(?:" + "|".join(map(re.escape, keywords)) + r")"))
    for label, keywords in INTENT_KEYWORDS.items()
]


def assign_label(text):
    """Return the first intent label whose keywords occur in ``text``.

    Matching is case-insensitive (the text is lower-cased first) and a keyword
    only counts when it begins at a word boundary. Labels are tried in the
    order they appear in ``INTENT_KEYWORDS``, so earlier labels win when
    several would match.

    Args:
        text: Email body to classify.

    Returns:
        The matching label, or "General Inquiry" when no keyword is found.
    """
    lowered = text.lower()
    for label, pattern in _INTENT_PATTERNS:
        if pattern.search(lowered):
            return label
    return "General Inquiry"

# Label every cleaned body with the keyword heuristic (first matching intent wins)
df["label"] = df["clean_body"].progress_apply(assign_label)

# Load spaCy model and set max length
nlp = spacy.load("en_core_web_sm")
# NOTE(review): spaCy documents a default max_length of 1,000,000 characters,
# so 300000 appears to lower the cap rather than raise it -- confirm intent.
# Bodies are already limited to < 100000 characters by the length filter above.
nlp.max_length = 300000  # character cap for a single nlp(text) call

# 🔎 Extract named entities: PERSON, DATE, ORG, GPE
def extract_entities(text):
    """Run the spaCy pipeline on ``text`` and collect selected entity texts.

    Args:
        text: Text to analyse with the module-level ``nlp`` pipeline.

    Returns:
        A dict with the keys "PERSON", "DATE", "ORG" and "GPE" (in that
        order), each mapping to a list of the distinct entity strings found
        for that label, in order of first appearance.
    """
    # Dicts are used as insertion-ordered sets: `list(set(...))` would return
    # the strings in hash order, which varies between interpreter runs and
    # makes the saved dataset non-reproducible.
    found = {"PERSON": {}, "DATE": {}, "ORG": {}, "GPE": {}}

    # Single pass over the entities instead of one scan per label.
    for ent in nlp(text).ents:
        bucket = found.get(ent.label_)
        if bucket is not None:  # ignore entity types we do not track
            bucket[ent.text] = None

    return {label: list(texts) for label, texts in found.items()}

# Run entity extraction on every cleaned body; each cell holds the dict
# returned by extract_entities
df["entities"] = df["clean_body"].progress_apply(extract_entities)

# Save cleaned dataset
# NOTE(review): the "entities" dicts are presumably written to the CSV in
# their string form, so readers would need to parse them back -- confirm.
os.makedirs("../data/processed/", exist_ok=True)
df.to_csv("../data/processed/clean_emails.csv", index=False)
# NOTE(review): the message omits the leading "../" of the path written above
print("✅ Cleaned data saved to: data/processed/clean_emails.csv")

# Preview
# Bare trailing expression: shown as the cell's output when run in a notebook
df[["file", "label", "clean_body", "entities"]].head(5)

📥 Loaded 517401 emails.


100%|██████████| 517401/517401 [00:41<00:00, 12443.62it/s]


🔍 Using 85110 emails after filtering and sampling.


100%|██████████| 85110/85110 [00:01<00:00, 81713.95it/s]
100%|██████████| 85110/85110 [1:07:23<00:00, 21.05it/s]


✅ Cleaned data saved to: data/processed/clean_emails.csv


Unnamed: 0,file,label,clean_body,entities
0,giron-d/_sent_mail/576.,Data Request,---------------------- Forwarded by Darron C G...,"{'PERSON': [], 'DATE': [], 'ORG': ['Enron Nort..."
1,watson-k/e_mail_bin/562.,General Inquiry,"RIGZONE DAILY NEWS -- TUESDAY, MARCH 12, 2002 ...","{'PERSON': [], 'DATE': ['MARCH 12, 2002', 'TUE..."
2,keavey-p/all_documents/409.,Meeting Request,The information contained herein is based on s...,"{'PERSON': ['Carr Futures'], 'DATE': ['2001'],..."
3,lokay-m/all_documents/633.,Job Inquiry,Commercialize your intelligence on the Edge th...,"{'PERSON': ['Cindy Olson's', 'Steve Hotte', 'S..."
4,meyers-a/deleted_items/1113.,Finance,To Whom this may concern: Please note that Col...,"{'PERSON': ['Bert Meyers'], 'DATE': ['711291',..."
