In [127]:
import os
import re
from urllib.parse import urlparse

import pandas as pd

# Source dataset and destination of the cleaned export (relative paths).
csv_path = "../email-dataset-figshare/Assassin.csv"
output_path = "../email-dataset-figshare/Cleaned/Assassin_cleaned.csv"

# Matches an email address; group 1 is the local part, group 2 the domain.
EMAIL_RE = re.compile(r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+\.[A-Z]{2,})', re.IGNORECASE)
# Matches http(s):// URLs and bare www. URLs, stopping at whitespace, <, > or ".
URL_RE = re.compile(r'(https?://[^\s<>"]+|www\.[^\s<>"]+)', re.IGNORECASE)

# Load only the first 2000 rows as a quick preview sample for the test cells below.
df = pd.read_csv(csv_path, nrows=2000)

# Sanity-check the configured paths and report any previously written export.
print("csv_path =", csv_path)
print("output_path =", output_path)
output_exists = os.path.exists(output_path)
print("output exists?", output_exists)
print("output size (bytes) =", os.path.getsize(output_path) if output_exists else None)

csv_path = ../email-dataset-figshare/Assassin.csv
output_path = ../email-dataset-figshare/Cleaned/Assassin_cleaned.csv
output exists? True
output size (bytes) = 30745873


Steps for cleaning the data:
1. Normalize all text fields (sender, receiver, subject, body, date)
2. Parse sender email and domain
3. Parse receiver email and domain
4. Extract URLs from the email body
5. Parse dates
6. Normalize labels

In [128]:
# Function to Normalize text data

def normalize_text(x):
    """
    Return a cleaned string version of a raw field value.

    Missing values (as detected by pd.isna) become "". Otherwise the value is
    converted with str(), CRLF and lone CR line endings are rewritten to LF,
    NUL characters are removed, and surrounding whitespace is stripped.
    """
    if pd.isna(x):
        return ""
    # CRLF must be handled before lone CR so "\r\n" does not become "\n\n".
    unified = str(x).replace("\r\n", "\n").replace("\r", "\n")
    return unified.replace("\x00", "").strip()


In [129]:
# Function to extract an email address and its domain (used for both sender and receiver fields)

def extract_email_and_domain(sender_value):
    """
    Pull the first email address out of a header-style field.

    Returns a tuple (email, domain), both lowercased, taken from the first
    EMAIL_RE match in the value. Returns (None, None) when the value is
    missing or contains no match.
    """
    not_found = (None, None)
    if pd.isna(sender_value):
        return not_found

    # Only the first address in the field is considered.
    match = EMAIL_RE.search(str(sender_value).strip())
    if match is None:
        return not_found

    address = match.group(0).lower()
    # Defensive cleanup of the domain part: whitespace, a closing ">" and
    # trailing punctuation are stripped in case any were captured.
    host = match.group(2).lower().strip().strip(">").strip()
    return (address, host.rstrip(".,;:"))


In [130]:
# Try the extractor on the preview sample: each (email, domain) tuple is
# expanded into a two-element Series so it can fill two columns at once.
sender_parts = df["sender"].apply(
    lambda raw: pd.Series(extract_email_and_domain(raw))
)
df[["sender_email", "sender_domain"]] = sender_parts

# Show the raw sender next to the parsed pieces for a visual check.
df[["sender", "sender_email", "sender_domain"]].head(10)


Unnamed: 0,sender,sender_email,sender_domain
0,Robert Elz <kre@munnari.OZ.AU>,kre@munnari.oz.au,munnari.oz.au
1,Steve Burt <Steve_Burt@cursor-system.com>,steve_burt@cursor-system.com,cursor-system.com
2,"""Tim Chapman"" <timc@2ubh.com>",timc@2ubh.com,2ubh.com
3,Monty Solomon <monty@roscom.com>,monty@roscom.com,roscom.com
4,Stewart Smith <Stewart.Smith@ee.ed.ac.uk>,stewart.smith@ee.ed.ac.uk,ee.ed.ac.uk
5,"""Martin Adamson"" <martin@srv0.ems.ed.ac.uk>",martin@srv0.ems.ed.ac.uk,srv0.ems.ed.ac.uk
6,"""Martin Adamson"" <martin@srv0.ems.ed.ac.uk>",martin@srv0.ems.ed.ac.uk,srv0.ems.ed.ac.uk
7,Stewart Smith <Stewart.Smith@ee.ed.ac.uk>,stewart.smith@ee.ed.ac.uk,ee.ed.ac.uk
8,"""Martin Adamson"" <martin@srv0.ems.ed.ac.uk>",martin@srv0.ems.ed.ac.uk,srv0.ems.ed.ac.uk
9,"""NOI Administrator"" <admin@networksonline.com>",admin@networksonline.com,networksonline.com


In [131]:
# Function to Find URLs in the body text

def extract_urls(text):
    """
    Return every URL found in a piece of text, in order of appearance.

    A URL is anything matched by URL_RE. Trailing punctuation (".,;:!") that
    was captured with the URL is removed. Missing values (as detected by
    pd.isna) yield an empty list.
    """
    if pd.isna(text):
        return []

    # str.rstrip returns a new string (strings are immutable), so its result
    # must be kept; calling it without using the return value strips nothing.
    return [url.rstrip(".,;:!") for url in URL_RE.findall(str(text))]
    
    

In [132]:
# Smoke test: run URL extraction over the 'body' column of the preview sample
body_urls = df["body"].apply(extract_urls)
df["urls"] = body_urls

# Show each body next to the URLs extracted from it.
df[["body", "urls"]].head(20)

Unnamed: 0,body,urls
0,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ...",[https://listman.redhat.com/mailman/listinfo/e...
1,"Martin A posted:\nTassos Papadopoulos, the Gre...",[http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HA...
2,Man Threatens Explosion In Moscow \n\nThursday...,[http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HA...
3,Klez: The Virus That Won't Die\n \nAlready the...,"[http://www.pcworld.com/news/article/0,aid,103..."
4,"> in adding cream to spaghetti carbonara, whi...","[http://www.ee.ed.ac.uk/~sxs/, http://us.click..."
5,> I just had to jump in here as Carbonara is o...,[http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HA...
6,The Scotsman - 22 August 2002\n\n Playboy want...,[http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HA...
7,Martin Adamson wrote: > > Isn't it just basic...,"[http://www.ee.ed.ac.uk/~sxs/, http://us.click..."
8,The Scotsman\n\n Thu 22 Aug 2002 \n\n Meaningf...,[http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HA...
9,I have been trying to research via SA mirrors ...,[https://www.inphonic.com/r.asp?r=sourceforge1...


In [133]:
# Reload the CSV without the nrows limit. This rebinds df, so the columns
# added to the 2000-row preview frame above are discarded.
df = pd.read_csv(csv_path)

In [134]:
# Create a normalized "<column>_raw" copy of every text column
# (missing values become "", line endings unified, NULs removed, stripped).

for column in ("sender", "receiver", "subject", "body", "date"):
    df[f"{column}_raw"] = df[column].apply(normalize_text)


In [135]:
# Derive "<role>_email" and "<role>_domain" from the normalized sender and
# receiver fields; each (email, domain) tuple is expanded into two columns.

for role in ("sender", "receiver"):
    df[[f"{role}_email", f"{role}_domain"]] = df[f"{role}_raw"].apply(
        lambda raw: pd.Series(extract_email_and_domain(raw))
    )


In [136]:
# Lowercased copies of subject and body for case-insensitive text analysis

df["subject_clean"] = df["subject_raw"].str.lower()
df["body_clean"] = df["body_raw"].str.lower()

# Extract URLs from the original-case body, not the lowercased one: URL paths
# and query strings are case-sensitive, so lowercasing first would alter them
# (e.g. tracking tokens). URL_RE is case-insensitive, so it matches either way.
df["urls"] = df["body_raw"].apply(extract_urls)
df["url_count"] = df["urls"].apply(len)


In [137]:
# Parse the date strings into timestamps. errors="coerce" turns unparseable
# values (including the "" produced for missing dates) into NaT.
# utc=True converts every value to UTC so that inputs carrying different UTC
# offsets end up in one timezone-aware datetime column instead of a
# mixed-offset result; values without an offset are treated as UTC.
df["date_parsed"] = pd.to_datetime(df["date_raw"], errors="coerce", utc=True)


  df["date_parsed"] = pd.to_datetime(df["date_raw"], errors="coerce")


In [138]:
# Cast the label column to plain integers.
# NOTE(review): astype(int) raises if a label is missing or non-numeric —
# confirm the dataset has no such rows.
df["label"] = df["label"].astype(int)


In [139]:
# Write the cleaned columns to email-dataset-figshare/Cleaned/Assassin_cleaned.csv

# Column order of the exported file, grouped by the field each column derives from.
final_cols = (
    ["sender_raw", "sender_email", "sender_domain"]
    + ["receiver_raw", "receiver_email", "receiver_domain"]
    + ["date_raw", "date_parsed"]
    + ["subject_raw", "subject_clean"]
    + ["body_raw", "body_clean"]
    + ["urls", "url_count"]
    + ["label"]
)

clean_df = df.loc[:, final_cols]

# index=False keeps the DataFrame index out of the CSV.
clean_df.to_csv(output_path, index=False)



