In [5]:
import random
import mailbox
import email as eml
import pandas as pd
import re
import os

from bs4 import BeautifulSoup

**Preprocessing Phishing Emails**

In [140]:
def mbox_to_df(mbox, label=1, multipart_only=True):
    """
    Convert the text from emails in a .mbox file to a DataFrame.

    'text/plain' and 'text/html' only.

    Parameters
    ----------
    mbox : mailbox.mbox
        Any object exposing ``iterkeys()`` and ``mbox[key]`` works.
    label : optional
        Value written to the 'Label' column of every row. Defaults to 1,
        the value this function has always used.
    multipart_only : bool, optional
        When True (the default, matching the historical behaviour) only
        multipart messages produce a row and single-part messages are
        skipped. Pass False to also extract single-part messages.

    Returns
    -------
    pandas.DataFrame
        One row per extracted message with the columns 'Body' (cleaned
        text) and 'Label'. The columns are present even when no message
        was extracted.

    Notes
    -----
    * A message whose retrieval raises UnicodeDecodeError is skipped.
    * When a message has several text parts, the last one walked wins.
    * Payload bytes are decoded as latin-1 regardless of the declared charset.
    * 'text/html' parts are converted to text with ``parse_html``.
    """
    # Characters turned into a space, then substrings removed outright.
    # Order matters: "0x" has to be removed before the bare digits.
    replace_space = ["\n", "\t", "©", "-"]
    replace_nothing = ["0x", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ","]

    data = []
    for k in mbox.iterkeys():  # iterating through the mbox file
        try:
            message = mbox[k]
        except UnicodeDecodeError:
            # Without this `continue` the loop would go on with an unbound
            # (or the previous) message and emit a bogus/duplicate row.
            continue

        if multipart_only and not message.is_multipart():
            continue

        # Extracting text
        text = ''
        for part in message.walk():  # iterating through the message parts
            content_type = part.get_content_type()
            if content_type not in ('text/html', 'text/plain'):
                continue
            payload = part.get_payload(decode=True)
            if payload is None:
                # get_payload(decode=True) yields None when the part has
                # no decodable body; there is nothing to extract from it.
                continue
            decoded = payload.decode('latin-1')
            text = parse_html(decoded) if content_type == 'text/html' else decoded

        for char in replace_space:
            text = text.replace(char, " ")
        for char in replace_nothing:
            text = text.replace(char, "")
        # Collapse every run of whitespace into a single space.
        text = re.sub(r'\s+', ' ', text)

        data.append({'Body': text, 'Label': label})

    # Naming the columns keeps the schema stable for an empty mailbox.
    return pd.DataFrame(data, columns=['Body', 'Label'])

def parse_html(text):
    """
    Return the text content of an HTML string.

    The markup is parsed with BeautifulSoup's 'lxml' parser, every inline
    element listed below is unwrapped (the tag is removed, its children
    stay in place), and the remaining text pieces are stripped and joined
    with newlines.
    """
    inline_tags = ['a','abbr','acronym','b','bdo','button','cite','code',
                   'dfn','em','i','kbd','label','output','q','samp','small',
                   'span','strong','sub','sup','time','var']

    document = BeautifulSoup(text, 'lxml')
    for element in document.find_all(inline_tags):
        element.unwrap()

    # Serialise and parse again before extracting text; presumably this merges
    # the text fragments left side by side by unwrap() so that get_text does
    # not put a newline between them - TODO confirm.
    flattened = BeautifulSoup(str(document), 'lxml')
    return flattened.get_text('\n', strip=True)

In [126]:
mbox_file_path = 'private-phishing4.mbox'

# Open the existing mailbox. create=False makes a wrong path raise
# mailbox.NoSuchMailboxError instead of silently creating an empty file
# (and therefore an empty dataset).
mbox = mailbox.mbox(mbox_file_path, create=False)
try:
    dataframe = mbox_to_df(mbox)
finally:
    # The DataFrame holds plain strings, so the mailbox file can be released.
    mbox.close()
dataframe

Unnamed: 0,Body,Label
0,Dear GoDaddy Customer GoDaddy Customer Support...,1
1,Dear Citizens Bank and Charter One Bank custom...,1
2,eBay sent this message Your registered name is...,1
3,Dear Citizens Bank and Charter One Bank custom...,1
4,Dear Citizens Bank and Charter One Bank custom...,1
...,...,...
1405,Dear User Your e mail will expire soon We reco...,1
1406,,1
1407,Your email Address require security updates an...,1
1408,Dear jose@monkey.org Your two incoming mails w...,1


In [127]:
# Write the phishing rows to phishing_dataset.csv without the index column.
# NOTE(review): escapechar="'" is set on write, but the later
# pd.read_csv('phishing_dataset.csv') passes no escapechar - confirm that
# bodies containing apostrophes round-trip unchanged.
dataframe.to_csv("phishing_dataset.csv", index=False, escapechar="'")

**Preprocessing Ham Emails**

In [135]:
# Load the SpamAssassin corpus and narrow it to whichever of the
# Body/Label columns it provides.
df = pd.read_csv('completeSpamAssassin.csv')
ham_only = df.loc[:, df.columns.intersection(["Body", "Label"])]

# Keep the rows labelled 0 whose body is not the literal placeholder "empty".
ham_only = ham_only[(ham_only["Label"] == 0) & (ham_only["Body"] != "empty")]

# Persist the filtered rows, then read the file back for display.
ham_only.to_csv("ham_dataset.csv", index=False)
df = pd.read_csv('ham_dataset.csv')
df

Unnamed: 0,Body,Label
0,"Date: Wed, 21 Aug 2002 10:54:46 -05...",0
1,"Martin A posted:\nTassos Papadopoulos, the Gre...",0
2,Man Threatens Explosion In Moscow Thursday Aug...,0
3,Klez: The Virus That Won't Die\n \nAlready the...,0
4,"> in adding cream to spaghetti carbonara, whi...",0
...,...,...
3947,----------------------------------------------...,0
3948,"EFFector Vol. 15, No. 35 November ...",0
3949,\nWe have extended our Free seat sale until Th...,0
3950,___ ___ ...,0


**Appending Phishing and Ham Datasets**

In [139]:
# Reload both per-class datasets from the CSV files written earlier.
ham_only = pd.read_csv('ham_dataset.csv')
phishing = pd.read_csv('phishing_dataset.csv')

# DataFrame.append is deprecated and removed in pandas 2.0; pd.concat
# stacks the same rows (phishing first, then ham) with a fresh 0..n-1 index.
phishing_ham_dataset = pd.concat([phishing, ham_only], ignore_index=True)
phishing_ham_dataset.to_csv("phishing_ham_dataset.csv", index=False)

# Read the combined file back to check what was written.
df = pd.read_csv('phishing_ham_dataset.csv')
df

  phishing_ham_dataset = phishing.append(ham_only, ignore_index=True)


Unnamed: 0,Body,Label
0,Dear GoDaddy Customer GoDaddy Customer Support...,1
1,Dear Citizens Bank and Charter One Bank custom...,1
2,eBay sent this message Your registered name is...,1
3,Dear Citizens Bank and Charter One Bank custom...,1
4,Dear Citizens Bank and Charter One Bank custom...,1
...,...,...
5357,----------------------------------------------...,0
5358,"EFFector Vol. 15, No. 35 November ...",0
5359,\nWe have extended our Free seat sale until Th...,0
5360,___ ___ ...,0
