## Spam Classifier Preprocess

Write a data preparation pipeline to convert each email into a feature vector. Your preparation pipeline should transform an email into a (sparse) vector indicating the presence or absence of each possible word. For example, if all emails only ever contain four words, “Hello,” “how,” “are,” “you,” then the email “Hello you Hello Hello you” would be converted into a vector [1, 0, 0, 1] (meaning [“Hello” is present, “how” is absent, “are” is absent, “you” is present]), or [3, 0, 0, 2] if you prefer to count the number of occurrences of each word.

In [74]:
# list files
import os
import numpy as np
import operator

test_dir = "./easy_ham/" # "./hard_ham/" #
files = os.listdir(test_dir)
# files = [f for f in os.listdir('./easy_ham/') if os.path.isfile('./easy_ham/' + f)]
test_file_path = test_dir + files[0]
print("Test file path:" + test_file_path)

Test file path:./easy_ham/2170.78c282a5e417d6d231dc75aa8588ebb7


In [75]:
# read files
# test_file_path = "./spam/00500.85b72f09f6778a085dc8b6821965a76f"
# test_file_path = "./hard_ham/0239.34e6b6125909c0d998370aacc82daefe"
test_file = open(test_file_path, "r", errors='ignore')
content = test_file.read()
print(content)

From rssfeeds@jmason.org  Mon Sep 30 13:43:46 2002
Return-Path: <rssfeeds@example.com>
Delivered-To: yyyy@localhost.example.com
Received: from localhost (jalapeno [127.0.0.1])
	by jmason.org (Postfix) with ESMTP id AE79816F16
	for <jm@localhost>; Mon, 30 Sep 2002 13:43:46 +0100 (IST)
Received: from jalapeno [127.0.0.1]
	by localhost with IMAP (fetchmail-5.9.0)
	for jm@localhost (single-drop); Mon, 30 Sep 2002 13:43:46 +0100 (IST)
Received: from dogma.slashnull.org (localhost [127.0.0.1]) by
    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g8U81fg21359 for
    <jm@jmason.org>; Mon, 30 Sep 2002 09:01:41 +0100
Message-Id: <200209300801.g8U81fg21359@dogma.slashnull.org>
To: yyyy@example.com
From: gamasutra <rssfeeds@example.com>
Subject: Priceless Rubens works stolen in raid on mansion
Date: Mon, 30 Sep 2002 08:01:41 -0000
Content-Type: text/plain; encoding=utf-8
Lines: 6
X-Spam-Status: No, hits=-527.4 required=5.0
	tests=AWL,DATE_IN_PAST_03_06,T_URI_COUNT_0_1
	version=2.50-cvs
X-Spam

### TODO:

- **Lower-casing:** The entire email is converted into lower case, so that captialization is ignored (e.g., IndIcaTE is treated the same as Indicate).

- **Stripping HTML:** All HTML tags are removed from the emails. Many emails often come with HTML formatting; we remove all the HTML tags, so that only the content remains.

- **Normalizing URLs:** All URLs are replaced with the text “httpaddr”.

- **Normalizing Email Addresses:** All email addresses are replaced with the text “emailaddr”.

- **Normalizing Numbers:** All numbers are replaced with the text “number”.

- **Normalizing Dollars:** All dollar signs ($) are replaced with the text “dollar”.

- **Word Stemming:** Words are reduced to their stemmed form. For ex- ample, “discount”, “discounts”, “discounted” and “discounting” are all replaced with “discount”. Sometimes, the Stemmer actually strips off additional characters from the end, so “include”, “includes”, “included”, and “including” are all replaced with “includ”.

- **Removal of non-words:** Non-words and punctuation have been re- moved. All white spaces (tabs, newlines, spaces) have all been trimmed to a single space character.

In [76]:
# Tools

import re
import email
from html.parser import HTMLParser

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)
    
def strip_http_tags(html):
    s = MLStripper()
    s.feed(html)
    return s.get_data()

def replace_url(input_str):
    return re.sub(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+", 'httpaddr', input_str)

def replace_email_address(input_str):
    return re.sub(r"[^\s]+@[^\s]+", 'emailaddr', input_str)

def replace_number(input_str):
    return re.sub(r"\d", ' number ', input_str)

def replace_doller(input_str):
    return re.sub(r"\$", 'doller', input_str)

def replace_punctuation(input_str):
    return re.sub("[\s+\.\!\/_,$%^*(+\"\']+|[+——!-=，。？、~@;:#￥%……&*\(\)\[\]]+", ' ', input_str)


In [77]:
def preprocess_email_content(content, ignore_header=True):
    email_content = ""
    if ignore_header:
        msg = email.message_from_string(content)
        if msg.is_multipart():
            for payload in meg.get_payload():
                email_content = email_content + payload.get_payload()
        else:
            email_content = msg.get_payload()
    else:
        email_content = content
    
    email_content = email_content.lower()                # lower case
    email_content = strip_http_tags(email_content)       # strip http tags
    email_content = replace_url(email_content)           # replace url
    email_content = replace_email_address(email_content) # replace email address
    email_content = replace_number(email_content)        # replace number
    email_content = replace_doller(email_content)        # replace doller
    email_content = replace_punctuation(email_content)   # 
    return email_content

In [78]:
# create hashmap for list
from nltk.stem import PorterStemmer
ps = PorterStemmer()
def convert_email_content_to_dict(content, word_dict, enable_word_stemming=True):
    for word in content.split():
        if enable_word_stemming:
            word = ps.stem(word)
            
        if word in word_dict:
            count = word_dict[word]
            word_dict[word] = count+1
        else:
            word_dict[word] = 1

In [79]:
# Save and Load object
import pickle
def save_obj(obj, name ):
    with open('./'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('./' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [80]:
# From text file to word dict
test_file = open(test_file_path, "r")
file_text = test_file.read()
processed_content = preprocess_email_content(file_text, ignore_header=False)
# print(processed_content)
word_dict = {}
convert_email_content_to_dict(processed_content, word_dict)
print(word_dict)
test_file.close()

{'from': 5, 'emailaddr': 5, 'mon': 5, 'sep': 5, 'number': 166, 'return': 1, 'path': 1, 'deliv': 1, 'to': 2, 'receiv': 3, 'localhost': 3, 'jalapeno': 2, 'by': 4, 'jmason': 1, 'org': 3, 'postfix': 1, 'with': 3, 'esmtp': 2, 'id': 3, 'ae': 1, 'f': 1, 'for': 3, 'ist': 2, 'imap': 1, 'fetchmail': 1, 'singl': 1, 'drop': 1, 'dogma': 2, 'slashnul': 2, 'g': 1, 'u': 1, 'fg': 1, 'messag': 1, 'gamasutra': 1, 'subject': 1, 'priceless': 1, 'ruben': 1, 'work': 1, 'stolen': 1, 'in': 2, 'raid': 2, 'on': 2, 'mansion': 1, 'date': 3, 'content': 1, 'type': 1, 'text': 1, 'plain': 1, 'encod': 1, 'utf': 1, 'line': 1, 'x': 2, 'spam': 2, 'statu': 1, 'no': 1, 'hit': 1, 'requir': 1, 'test': 1, 'awl': 1, 'past': 1, 't': 2, 'uri': 1, 'count': 1, 'version': 1, 'cv': 1, 'level': 1, 'url': 1, 'httpaddr': 1, 'click': 1, 'art': 2, 'fourth': 1, 'philanthropist': 1, 's': 1, 'home': 1, 'onc': 1, 'target': 1, 'the': 1, 'ira': 1, 'and': 1, 'dublin': 1, 'gangster': 1, 'martin': 1, 'cahil': 1}


### Create Word List

In [81]:
import operator
from ipywidgets import FloatProgress
from IPython.display import display

# progress bar
progress_bar = FloatProgress(min=0, max=6852, description='Processing:') # instantiate the bar
display(progress_bar) # display the bar

sample_directories = ["./easy_ham/", "./hard_ham/", "./spam/"]
word_dict = {}
for directory in sample_directories:
    files = os.listdir(directory)
    for file in files:
        file_path = directory + file
#         print("file_path:", file_path)
        # process email
        email_file = open(file_path, "r", errors='ignore')
        file_text = email_file.read()
        processed_content = preprocess_email_content(file_text, ignore_header=False)
        convert_email_content_to_dict(processed_content, word_dict)
        email_file.close()
        progress_bar.value += 1
print("Total words:", len(word_dict))

# Sort dict by word count
sorted_result = sorted(word_dict.items(), key=operator.itemgetter(1), reverse=True)
sorted_word_dict = dict(sorted_result)

# Print top ten words
print("Top ten words:")
item_list = list(sorted_word_dict.items())
for i in np.arange(0, 10):
    print(item_list[i])

save_obj(word_dict, "word_dict")
print("Save dict to ./word_dict.pkl")

FloatProgress(value=0.0, description='Processing:', max=6852.0)

Total words: 151385
Top ten words:
('number', 2531063)
('the', 85736)
('to', 85228)
('emailaddr', 65339)
('from', 62193)
('a', 60257)
('for', 51849)
('with', 46690)
('by', 45732)
('and', 44800)
Save dict to ./word_dict.pkl


### Convert email content to word indices table

In [82]:
# prepared vocabulary word list
vocabulary_dict = dict(sorted_result[:2500])
index = 0
for key in vocabulary_dict.keys():
    vocabulary_dict[key] = index;
    index += 1

In [83]:
def convert_email_content_to_word_indices(processed_content, enable_word_stemming=True):
    word_indices = []
    for word in content.split():
        if enable_word_stemming:
            word = ps.stem(word)
            
        if word in vocabulary_dict:
            word_indices.append(vocabulary_dict[word])
    return word_indices

In [84]:
# From text file to word indices
test_file = open(test_file_path, "r")
file_text = test_file.read()
processed_content = preprocess_email_content(file_text, ignore_header=False)
test_indices = convert_email_content_to_word_indices(processed_content)
print(test_indices)
test_file.close()

[4, 53, 40, 4, 31, 8, 7, 25, 13, 6, 40, 4, 115, 8, 31, 7, 108, 6, 40, 4, 8, 7, 25, 13, 6, 40, 148, 15, 2148, 36, 40, 1794, 2148, 36, 247, 540, 915, 8, 1, 9, 984]


In [None]:
# progress bar
progress_bar = FloatProgress(min=0, max=6852, description='Processing:') # instantiate the bar
display(progress_bar) # display the bar

sample_directories = ["./easy_ham/", "./hard_ham/", "./spam/"]
summary_file_names = ["word_indices_easy_ham", "word_indices_hard_ham", "word_indices_spam"]
for directory, file_name in zip(sample_directories, summary_file_names):
    files = os.listdir(directory)
    word_indices_table = []
    for file in files:
        file_path = directory + file
#         print("file_path:", file_path)
        # process email
        email_file = open(file_path, "r", errors='ignore')
        file_text = email_file.read()
        processed_content = preprocess_email_content(file_text, ignore_header=False)
        word_indices = convert_email_content_to_word_indices(processed_content)
        word_indices_table.append(word_indices)
        progress_bar.value += 1
    save_obj(word_indices_table, file_name)


FloatProgress(value=0.0, description='Processing:', max=6852.0)