# Enron (Class: Benign) - 39,472
https://www.kaggle.com/datasets/wcukierski/enron-email-dataset

In [None]:
import kagglehub

# Download latest version of the Kaggle Enron email dataset; the value
# returned by dataset_download is used below as the dataset's root folder.
enronRoot = kagglehub.dataset_download("wcukierski/enron-email-dataset")

# Echo the location so the download can be checked by eye.
print("Path to dataset files:", enronRoot)

In [None]:
import pandas as pd
# Load the raw Enron dump (emails.csv sits directly under the download root).
enronFilePath = f"{enronRoot}/emails.csv"
dfEnron2 = pd.read_csv(filepath_or_buffer=enronFilePath)

# Notebook display of the loaded frame.
dfEnron2

In [None]:
import pandas as pd
# Keep only the first 39,472 rows (positional slice) for the Benign class.
enronSliced = dfEnron2.iloc[:39472]
enronSliced

In [None]:
import re
def parse_email(raw_email: str):
    """Split one raw Enron message into the shared five-field record.

    Header lines are read until the first blank line; everything after that
    is the body. Only the ``From`` and ``Subject`` headers are kept.
    Continuation lines (lines that begin with a space or tab) are appended
    to the header they belong to, so folded subjects are not truncated.

    Args:
        raw_email: Full message text, headers followed by a blank line and
            the body.

    Returns:
        pd.Series of [from, subject, body, links, attachments]. ``links`` is
        every ``http://`` / ``https://`` URL found in the body (scheme
        matched case-insensitively, like the other parsers in this file);
        ``attachments`` is always an empty list.
    """
    headers = {"from": "", "subject": ""}
    current = None  # tracked header that a folded continuation line extends
    body_lines = []
    in_body = False

    for line in raw_email.splitlines():
        if in_body:
            body_lines.append(line)
            continue

        # The first blank line ends the header section.
        if line.strip() == "":
            in_body = True
            continue

        # Folded header: leading whitespace means "continues previous line".
        if line[0] in " \t":
            if current is not None:
                headers[current] = f"{headers[current]} {line.strip()}".strip()
            continue

        name, sep, value = line.partition(":")
        key = name.lower()
        if sep and key in headers:
            headers[key] = value.strip()
            current = key
        else:
            # Some other header; its continuation lines must be ignored.
            current = None

    body_value = "\n".join(body_lines).strip()

    links = re.findall(r"https?://\S+", body_value, re.IGNORECASE)

    # Enron dataset contains no attachment information so empty list
    attachments = []

    return pd.Series(
        [headers["from"], headers["subject"], body_value, links, attachments]
    )

In [None]:
# Parse each raw message into the shared column layout, then tag it Benign.
enronDf = enronSliced['message'].apply(parse_email)
enronDf.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
enronDf = enronDf.assign(Classification="Benign")
enronDf

# Optional export of this source on its own:
#enronDf.to_csv("Enron500k.csv", index=False)

# Enron Filtered (Class: Benign) - 16,545
https://www.kaggle.com/datasets/marcelwiechmann/enron-spam-data/data?select=enron_spam_data.csv

In [None]:
# Enron Filtered Dataset
import pandas as pd

# The filtered Enron spam/ham CSV is read from the notebook's working directory.
enronFilePath = './enron_spam_data.csv'
dfEnron = pd.read_csv(enronFilePath)

# Notebook display of the loaded frame.
dfEnron

In [None]:
# Isolate Enron ham entries by removing every row labelled 'spam'.

indicesToDrop = dfEnron.loc[dfEnron['Spam/Ham'].eq('spam')].index

dfEnronHam = dfEnron.drop(index=indicesToDrop)
dfEnronHam

In [None]:
import re
def parse_email(subject, body):
    """Build the shared five-field record for sources without sender data.

    Used for the Ling and filtered-Enron rows, which carry no sender.

    Args:
        subject: Subject text of the message.
        body: Body text; must be a string because it is scanned with a
            regular expression.

    Returns:
        pd.Series of [sender, subject, body, links, attachments], where
        sender is always "" and attachments is always an empty list.
    """
    # Case-insensitive match of http:// and https:// URLs in the body.
    url_pattern = re.compile(r"https?://\S+", re.IGNORECASE)
    return pd.Series(["", subject, body, url_pattern.findall(body), []])

# Ling and Enron ham seem to have no links.
# Missing subjects/messages become '' so parse_email always gets strings.
dfEnronHam = dfEnronHam.fillna({'Subject': '', 'Message': ''})
enronDf2 = dfEnronHam.apply(
    lambda email: parse_email(email['Subject'], email['Message']),
    axis=1,
)
enronDf2.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
enronDf2 = enronDf2.assign(Classification="Benign")
enronDf2

# Optional export of this source on its own:
#enronDf2.to_csv("Enron_Filtered.csv", index=False)


# Enron Filtered (Class: Spam) - 17,171
https://www.kaggle.com/datasets/marcelwiechmann/enron-spam-data

In [None]:
# Isolate Enron spam entries by removing every row labelled 'ham'.

indicesToDrop = dfEnron.loc[dfEnron['Spam/Ham'].eq('ham')].index

dfEnronSpam = dfEnron.drop(index=indicesToDrop)
dfEnronSpam

In [None]:
# Missing subjects/messages become '' so parse_email always gets strings.
dfEnronSpam = dfEnronSpam.fillna({'Subject': '', 'Message': ''})
enronSpamDf = dfEnronSpam.apply(
    lambda email: parse_email(email['Subject'], email['Message']),
    axis=1,
)
enronSpamDf.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
enronSpamDf = enronSpamDf.assign(Classification="Spam")
enronSpamDf

# Optional export of this source on its own:
#enronSpamDf.to_csv("Enron_Spam.csv", index=False)

# Spam Assassin 2 (Class: Spam) - 1718
https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset?select=SpamAssasin.csv

In [None]:
# Dataset Containing Enron, NigerianFraud, SpamAssassin, Nazario, Ling datasets

import kagglehub

# Download latest version of the combined phishing-email dataset; the
# returned value is used below as the folder that holds SpamAssasin.csv.
spamAssassin2Root = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")

# Echo the location so the download can be checked by eye.
print("Path to dataset files:", spamAssassin2Root)

In [None]:
import pandas as pd
# The file name is spelled "SpamAssasin" (single "s") in the dataset.
spamAssassin2FilePath = f"{spamAssassin2Root}/SpamAssasin.csv"
dfSpamAssassin2 = pd.read_csv(filepath_or_buffer=spamAssassin2FilePath)

# Notebook display of the loaded frame.
dfSpamAssassin2

In [None]:
# Drop SpamAssassin ham entries (rows whose label is 0), leaving spam only.
indicesToDrop = dfSpamAssassin2.loc[dfSpamAssassin2['label'].eq(0)].index
dfSpamAssassinSpam2 = dfSpamAssassin2.drop(index=indicesToDrop)

dfSpamAssassinSpam2

SpamAssassin2 Processing

In [None]:
import re
def parse_email(sender, subject, body):
    """Build the shared five-field record for sources that include a sender.

    Args:
        sender: Sender value, passed through unchanged.
        subject: Subject text, passed through unchanged.
        body: Body text; must be a string because it is scanned with a
            regular expression.

    Returns:
        pd.Series of [sender, subject, body, links, attachments]. ``links``
        holds every http:// or https:// URL in the body (scheme matched
        case-insensitively); ``attachments`` is always an empty list because
        this data carries no attachment information.
    """
    found_urls = [
        match.group(0)
        for match in re.finditer(r"https?://\S+", body, re.IGNORECASE)
    ]
    record = [sender, subject, body, found_urls, []]
    return pd.Series(record)

# Build the Spam rows from the spam-only frame (dfSpamAssassinSpam2).
# Parsing the unfiltered dfSpamAssassin2 would also label every ham
# (label == 0) message as "Spam".
# Missing sender/subject/body values become '' so parse_email always gets
# strings.
dfSpamAssassinSpam2 = dfSpamAssassinSpam2.fillna({'sender': '', 'subject': '', 'body': ''})
spamAssassinDf2 = dfSpamAssassinSpam2.apply(
    lambda row: parse_email(row['sender'], row['subject'], row['body']),
    axis=1,
)
spamAssassinDf2.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
spamAssassinDf2["Classification"] = "Spam"
spamAssassinDf2

# Optional export of this source on its own:
#spamAssassinDf2.to_csv("SpamAssassin2.csv", index=False)

# Nazario (Class: Phishing) - 1565
https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset?select=Nazario.csv

In [None]:
# Dataset Containing Enron, NigerianFraud, SpamAssassin, Nazario, Ling datasets

import kagglehub

# Download latest version of the combined phishing-email dataset; the
# returned value is used below as the folder that holds Nazario.csv.
nazarioRoot = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")

# Echo the location so the download can be checked by eye.
print("Path to dataset files:", nazarioRoot)

In [None]:
import pandas as pd
# Load the Nazario CSV from the downloaded dataset folder.
nazarioFilePath = f"{nazarioRoot}/Nazario.csv"
dfNazario = pd.read_csv(filepath_or_buffer=nazarioFilePath)

# Notebook display of the loaded frame.
dfNazario

In [None]:
# Missing sender/subject/body values become '' so parse_email always gets
# strings; every Nazario row is then tagged Phishing.
dfNazario = dfNazario.fillna({'sender': '', 'subject': '', 'body': ''})
nazarioDf = dfNazario.apply(
    lambda email: parse_email(email['sender'], email['subject'], email['body']),
    axis=1,
)
nazarioDf.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
nazarioDf = nazarioDf.assign(Classification="Phishing")
nazarioDf

# Optional export of this source on its own:
#nazarioDf.to_csv("Nazario.csv", index=False)

# LLM (Class: Phishing) - 1000
https://www.kaggle.com/datasets/francescogreco97/human-llm-generated-phishing-legitimate-emails

In [None]:
# Dataset containing legitimate and phishing emails generated by humans and LLMs
# Label: 0 = Human Generated (From Nazario and Nigerian Fraud)
# Label: 1 = LLM Generated (ChatGPT and WormGPT)

import kagglehub

# Download latest version of the human/LLM-generated email dataset; the
# returned value is used below as the dataset's root folder.
llmGeneratedRoot = kagglehub.dataset_download("francescogreco97/human-llm-generated-phishing-legitimate-emails")

# Echo the location so the download can be checked by eye.
print("Path to dataset files:", llmGeneratedRoot)

In [None]:
import pandas as pd

# NOTE(review): this section is titled "LLM", but the path below reads the
# human-generated folder. Confirm whether the LLM-generated phishing file
# was intended, and that it has the same columns, before changing it.
llmGeneratedFilePath = f"{llmGeneratedRoot}/human-generated/phishing.csv"
dfLLMGenerated = pd.read_csv(filepath_or_buffer=llmGeneratedFilePath)

# Notebook display of the loaded frame.
dfLLMGenerated

In [None]:
# Missing sender/subject/body values become '' so parse_email always gets
# strings; every row from this file is then tagged Phishing.
dfLLMGenerated = dfLLMGenerated.fillna({'sender': '', 'subject': '', 'body': ''})
llmDf = dfLLMGenerated.apply(
    lambda email: parse_email(email['sender'], email['subject'], email['body']),
    axis=1,
)
llmDf.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
llmDf = llmDf.assign(Classification="Phishing")
llmDf

# Optional export of this source on its own:
#llmDf.to_csv("LLM.csv", index=False)

# Nigerian Fraud (Class: Financial Scams) - 3332
https://www.kaggle.com/datasets/naserabdullahalam/phishing-email-dataset?select=Nigerian_Fraud.csv

In [None]:
# Dataset Containing Enron, NigerianFraud, SpamAssassin, Nazario, Ling datasets

import kagglehub

# Download latest version of the combined phishing-email dataset; the
# returned value is used below as the folder that holds Nigerian_Fraud.csv.
nigerianRoot = kagglehub.dataset_download("naserabdullahalam/phishing-email-dataset")

# Echo the location so the download can be checked by eye.
print("Path to dataset files:", nigerianRoot)

In [None]:
import pandas as pd
# Load the Nigerian fraud CSV from the downloaded dataset folder.
nigerianFilePath = f"{nigerianRoot}/Nigerian_Fraud.csv"
dfNigerian = pd.read_csv(filepath_or_buffer=nigerianFilePath)

# Notebook display of the loaded frame.
dfNigerian

In [None]:
# Missing sender/subject/body values become '' so parse_email always gets
# strings; every row is then tagged Financial Scams.
dfNigerian = dfNigerian.fillna({'sender': '', 'subject': '', 'body': ''})
nigerianDf = dfNigerian.apply(
    lambda email: parse_email(email['sender'], email['subject'], email['body']),
    axis=1,
)
nigerianDf.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
nigerianDf = nigerianDf.assign(Classification="Financial Scams")
nigerianDf

# Optional export of this source on its own:
#nigerianDf.to_csv("Nigerian.csv", index=False)

# Ling (Class: Spam) - 481
https://www.kaggle.com/datasets/mandygu/lingspam-dataset

In [None]:
import kagglehub

# Download latest version of the Ling-Spam dataset; the returned value is
# used below as the folder that holds messages.csv.
lingRoot = kagglehub.dataset_download("mandygu/lingspam-dataset")

# Echo the location so the download can be checked by eye.
print("Path to dataset files:", lingRoot)

In [None]:
import pandas as pd
# Load the Ling messages CSV from the downloaded dataset folder.
lingFilePath = f"{lingRoot}/messages.csv"
dfLing = pd.read_csv(filepath_or_buffer=lingFilePath)

# Notebook display of the loaded frame.
dfLing

In [None]:
# Remove every row whose label is 0; the remaining rows are tagged Spam below.
indicesToDrop = dfLing.loc[dfLing['label'].eq(0)].index
dfLingSpam = dfLing.drop(index=indicesToDrop)

dfLingSpam

In [None]:
import re
def parse_email(subject, body):
    """Assemble the shared five-field record when no sender is available.

    Args:
        subject: Subject text of the message.
        body: Body text; must be a string because it is scanned with a
            regular expression.

    Returns:
        pd.Series of [sender, subject, body, links, attachments]; sender is
        always "" (no sender info in Ling or Enron Filtered) and attachments
        is always an empty list.
    """
    fields = {
        "sender": "",
        "subject": subject,
        "body": body,
        # http:// and https:// URLs, scheme matched case-insensitively
        "links": re.findall(r"https?://\S+", body, re.IGNORECASE),
        "attachments": [],
    }
    # Only the values are passed on, in insertion order.
    return pd.Series(list(fields.values()))

# Ling seems to have no links.
# Missing subjects/messages become '' so parse_email always gets strings.
dfLingSpam = dfLingSpam.fillna({'subject': '', 'message': ''})
lingDf = dfLingSpam.apply(
    lambda email: parse_email(email['subject'], email['message']),
    axis=1,
)
lingDf.columns = ['From', 'Subject', 'Body', 'Links', 'Attachments']
lingDf = lingDf.assign(Classification="Spam")
lingDf

# Optional export of this source on its own:
#lingDf.to_csv("Ling.csv", index=False)

# ChatGPT Generated (Class: BEC) - 3000

In [None]:
import pandas as pd

# Despite its name, becRoot is the path of the CSV file itself.
becRoot = './bec_emails.csv'

# Load the BEC emails and tag every row with the BEC class.
becDf = pd.read_csv(becRoot).assign(Classification="BEC")
becDf

# PhishTank (Class: Phishing) URLs only
https://phishtank.org/developer_info.php

# MalwareBazaar (Class: Malspam) 
https://bazaar.abuse.ch/browse/

# Create Training Set

In [None]:
# Stack every per-source frame into a single training table.
# ignore_index=True gives the result a fresh 0..n-1 index: each source frame
# carries its own row labels, so keeping them would leave duplicate labels
# in trainingSet. The exported CSV is unaffected because it is written
# with index=False.
# NOTE(review): becDf comes straight from its CSV; presumably its columns
# match the From/Subject/Body/Links/Attachments layout. Verify.
trainingSet = pd.concat(
    [enronDf, enronDf2, nazarioDf, llmDf, nigerianDf, lingDf, enronSpamDf, spamAssassinDf2, becDf],
    ignore_index=True,
)
trainingSet.to_csv("trainingSet.csv", index=False)

In [None]:
# Notebook display of the combined training set.
trainingSet