In [1]:
%matplotlib inline
import os
import tarfile
from six.moves import urllib
import email
import email.policy
import re
from html import unescape
import nltk
import urlextract
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score
from collections import Counter
import numpy as np
from scipy.sparse import csr_matrix
from wordcloud import WordCloud
import matplotlib as plt

Try visualization https://towardsdatascience.com/spam-classifier-in-python-from-scratch-27a98ddd8e73

# About the data

  - spam: 500 spam messages, all received from non-spam-trap sources.

  - easy_ham: 2500 non-spam messages.  These are typically quite easy to
    differentiate from spam, since they frequently do not contain any spammish
    signatures (like HTML etc).
  
Source: https://spamassassin.apache.org/old/publiccorpus/readme.html



files = ["20021010_easy_ham.tar.bz2",
         "20021010_hard_ham.tar.bz2",
         "20021010_spam.tar.bz2",
         "20030228_easy_ham.tar.bz2",
         "20030228_easy_ham_2.tar.bz2",
         "20030228_hard_ham.tar.bz2",
         "20030228_spam.tar.bz2",
         "20030228_spam_2.tar.bz2",
         "20050311_spam_2.tar.bz2"]

In [2]:
DOWNLOAD_ROOT = "https://spamassassin.apache.org/old/publiccorpus/"
DATASETS_PATH = os.path.join("Datasets")
FILES = ["20030228_easy_ham.tar.bz2",
         "20030228_spam.tar.bz2",]
DOWNLOAD_URLS = [DOWNLOAD_ROOT + name for name in FILES]

def fetch_data():
    if not os.path.isdir(DATASETS_PATH):
        os.makedirs(DATASETS_PATH)
        
    for filename, url in list(zip(FILES, DOWNLOAD_URLS)):
        path = os.path.join(DATASETS_PATH, filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url, path)
        current_tar_file = tarfile.open(path)
        current_tar_file.extractall(path=DATASETS_PATH)
        current_tar_file.close()
        

In [3]:
fetch_data()

In [4]:
spam_path = os.path.join(DATASETS_PATH, 'spam')
ham_path = os.path.join(DATASETS_PATH, 'easy_ham')
spam_filenames = [filename for filename in os.listdir(spam_path) if len(filename) > 10]
ham_filenames = [filename for filename in os.listdir(ham_path) if len(filename) > 10]

print("Spam emails:", len(spam_filenames), "\nNon-spam emails:", len(ham_filenames))

Spam emails: 500 
Non-spam emails: 2500


In [5]:
def read_email(is_spam, filename):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(DATASETS_PATH, directory, filename), "rb") as f:        
        return email.parser.BytesParser(policy=email.policy.default).parse(f)

    
spam_emails = [read_email(is_spam=True, filename=filename) for filename in spam_filenames]
ham_emails = [read_email(False, filename) for filename in ham_filenames]

In [6]:
# Spam examle
print(spam_emails[0].get_content())

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML><HEAD>
<META content="text/html; charset=windows-1252" http-equiv=Content-Type>
<META content="MSHTML 5.00.2314.1000" name=GENERATOR></HEAD>
<BODY><!-- Inserted by Calypso -->
<TABLE border=0 cellPadding=0 cellSpacing=2 id=_CalyPrintHeader_ rules=none 
style="COLOR: black; DISPLAY: none" width="100%">
  <TBODY>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TD></TR>
  <TR>
    <TD colSpan=3>
      <HR color=black noShade SIZE=1>
    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso --><FONT 
color=#000000 face=VERDANA,ARIAL,HELVETICA size=-2><BR></FONT></TD></TR></TABLE><!-- End Calypso --><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=#ff0000 
face="Copperplate Gothic Bold" size=5 PTSIZE="10">
<CENTER>Why Spend More Than You Have To?
<CENTER><FONT color=#ff0000 face="Copp

In [7]:
# Nonspam example
print(ham_emails[6].get_content())

The Scotsman - 22 August 2002

 Playboy wants to go out with a bang 
 
 
 AN AGEING Berlin playboy has come up with an unusual offer to lure women into
 his bed - by promising the last woman he sleeps with an inheritance of 250,000
 (£160,000). 
 
 Rolf Eden, 72, a Berlin disco owner famous for his countless sex partners,
 said he could imagine no better way to die than in the arms of an attractive
 young woman - preferably under 30. 
 
 "I put it all in my last will and testament - the last woman who sleeps with
 me gets all the money," Mr Eden told Bild newspaper. 
 
 "I want to pass away in the most beautiful moment of my life. First a lot of
 fun with a beautiful woman, then wild sex, a final orgasm - and it will all
 end with a heart attack and then Im gone." 
 
 Mr Eden, who is selling his nightclub this year, said applications should be
 sent in quickly because of his age. "It could end very soon," he said.


------------------------ Yahoo! Groups Sponsor ---------------------~

# Train and test split

In [8]:
X = ham_emails + spam_emails
y = [0]*len(ham_emails) + [1]*len(spam_emails)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=11)

# Converting emails into text

In [9]:
def clean_html(raw_html):
    cleantext = re.sub(r'<a.+?>', ' href url ', raw_html, flags=re.I | re.S | re.M)
    cleantext = re.sub(r'<.+?>', ' ', cleantext, flags=re.S | re.M)
    cleantext = re.sub(r'\s+', ' ', cleantext) 
    #cleantext = re.sub(r'(\s*\n)+', '\n', cleantext, flags=re.M | re.S)
    return unescape(cleantext)


In [10]:
cleaned_sample = clean_html(spam_emails[0].get_content())
print(cleaned_sample)

 Save up to 70% on Life Insurance. Why Spend More Than You Have To? Life Quote Savings Ensuring your family's financial security is very important. Life Quote Savings makes buying life insurance simple and affordable. We Provide FREE Access to The Very Best Companies and The Lowest Rates. Life Quote Savings is FAST, EASY and SAVES you money! Let us help you get started with the best values in the country on new coverage. You can SAVE hundreds or even thousands of dollars by requesting a FREE quote from Lifequote Savings. Our service will take you less than 5 minutes to complete. Shop and compare. SAVE up to 70% on all types of Life insurance! href url Click Here For Your Free Quote! Protecting your family is the best investment you'll ever make! If you are in receipt of this email in error and/or wish to be removed from our list, href url PLEASE CLICK HERE AND TYPE REMOVE. If you reside in any state which prohibits e-mail solicitations for insurance, please disregard this email. 


In [11]:
def email_to_text(raw_email):
    html = None
    for part in raw_email.walk():
        ctype = part.get_content_type()
        if ctype not in ('text/html', 'text/plain'):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return clean_html(html)
    

In [12]:
print(email_to_text(spam_emails[0])[:100], "...")

 Save up to 70% on Life Insurance. Why Spend More Than You Have To? Life Quote Savings Ensuring your ...


# Counting words in email

In [13]:
stemmer = nltk.PorterStemmer()
for word in ("Universe", "Universal", "University", "Computed", "Compute", "Compulsive"):
    print(word, "=>", stemmer.stem(word))

Universe => univers
Universal => univers
University => univers
Computed => comput
Compute => comput
Compulsive => compuls


Universe, Universal, and University have the same stem but different meanings, however that wont be a huge problem

In [14]:
url_extractor = urlextract.URLExtract()
print(url_extractor.find_urls(email_to_text(spam_emails[1])))

['http://www.adclick.ws/p.cfm?o=315&s=pk007', 'http://www.adclick.ws/p.cfm?o=249&s=pk007', 'http://www.adclick.ws/p.cfm?o=245&s=pk002', 'http://www.adclick.ws/p.cfm?o=259&s=pk007', 'http://www.adclick.ws/p.cfm?o=283&s=pk007', 'http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258', 'http://www.linux.ie/mailman/listinfo/ilug']


In [15]:
class EmailToWordCounterTransformer(BaseEstimator, TransformerMixin):    
    def __init__(self, lower_case=True, remove_punctuation=True,
                 replace_urls=True, replace_numbers=True, stemming=True):
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_numbers = replace_numbers
        self.stemming = stemming
        self.replace_urls = replace_urls
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        X_transformed = []
        if not isinstance(X, list):
            X = [X]
        for email in X:            
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls:
                urls = url_extractor.find_urls(text)
                for url in urls:
                    text = text.replace(url, " URL ")
            if self.replace_numbers:
                text = re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?', ' NUMBER ', text, flags=re.M | re.S)
            if self.remove_punctuation:
                text = re.sub(r'\W+', ' ', text, flags=re.M | re.S)
            word_counts = Counter(text.split())
            if self.stemming:
                stemmed_word_counts = Counter()
                for word, count in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += count
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)    
    

In [16]:
some_email = spam_emails[:3]
some_email_wordcounts = EmailToWordCounterTransformer().fit_transform(some_email)
some_email_wordcounts

array([Counter({'save': 8, 'you': 8, 'to': 6, 'life': 6, 'and': 6, 'quot': 5, 'the': 5, 'insur': 4, 'in': 4, 'number': 3, 'on': 3, 'your': 3, 'is': 3, 'free': 3, 'best': 3, 'of': 3, 'up': 2, 'than': 2, 'famili': 2, 'veri': 2, 'make': 2, 'or': 2, 'from': 2, 'our': 2, 'type': 2, 'href': 2, 'url': 2, 'click': 2, 'here': 2, 'for': 2, 'if': 2, 'thi': 2, 'email': 2, 'remov': 2, 'pleas': 2, 'whi': 1, 'spend': 1, 'more': 1, 'have': 1, 'ensur': 1, 's': 1, 'financi': 1, 'secur': 1, 'import': 1, 'buy': 1, 'simpl': 1, 'afford': 1, 'we': 1, 'provid': 1, 'access': 1, 'compani': 1, 'lowest': 1, 'rate': 1, 'fast': 1, 'easi': 1, 'money': 1, 'let': 1, 'us': 1, 'help': 1, 'get': 1, 'start': 1, 'with': 1, 'valu': 1, 'countri': 1, 'new': 1, 'coverag': 1, 'can': 1, 'hundr': 1, 'even': 1, 'thousand': 1, 'dollar': 1, 'by': 1, 'request': 1, 'a': 1, 'lifequot': 1, 'servic': 1, 'will': 1, 'take': 1, 'less': 1, 'minut': 1, 'complet': 1, 'shop': 1, 'compar': 1, 'all': 1, 'protect': 1, 'invest': 1, 'll': 1, 'ever':

# Vectorization

Now it's time to vectorize all counted words:
 
1. First I will count all words
2. Most common of them I will store in a N=1000 size dictionary 
3. All my vectors I will store in a <a href="https://machinelearningmastery.com/sparse-matrices-for-machine-learning/">sparce matrix</a> to decrease computational and space costs

In [17]:
class WordCounterToVectorTransformer(BaseEstimator, TransformerMixin):    
    def __init__(self, vocabulary_size=1000):
        self.vocabulary_size = vocabulary_size
    
    def fit(self, X, y=None):
        total_count = Counter()  
        for word_count in X:
            for word, count in word_count.items():
                total_count[word] += min(count,10)  # to prevent some word occurrence more than 10 times at one email
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word: index + 1 for index, (word, count) in enumerate(most_common)}  # +1 for top-1 (not top-0)
        return self
    
    def transform(self, X, y=None):
        rows = []
        cols = []
        data = []
        for i, word_count in enumerate(X):
            for word, count in word_count.items():
                rows.append(i)                
                cols.append(self.vocabulary_.get(word, 0))  # Store a word's index. 
                                                            # If a word isn't common then append it to index 0
                data.append(count)
        return csr_matrix((data, (rows, cols)), shape=(len(X), self.vocabulary_size + 1))
    

In [18]:
vocab_transformer = WordCounterToVectorTransformer(vocabulary_size=10)
few_vectors = vocab_transformer.fit_transform(some_email_wordcounts)
few_vectors

<3x11 sparse matrix of type '<class 'numpy.int32'>'
	with 25 stored elements in Compressed Sparse Row format>

In [19]:
# Top N=10 of most common words
vocab_transformer.vocabulary_

{'number': 1,
 'url': 2,
 'the': 3,
 'you': 4,
 'to': 5,
 'save': 6,
 'life': 7,
 'and': 8,
 'in': 9,
 'quot': 10}

In [20]:
few_vectors.toarray()

array([[126,   3,   2,   5,   8,   6,   8,   6,   6,   4,   5],
       [ 62,   8,   7,   4,   2,   2,   0,   0,   0,   1,   0],
       [ 46,   8,   6,   4,   2,   2,   0,   0,   0,   1,   0]],
      dtype=int32)

This array shows that the first email has 126 non common words, 3 'number', 2 'url', ... , 4 'in', 5 'quot'.

# Transformation pipeline 

In [21]:
preprocess_pipeline = Pipeline([
    ('email_to_words', EmailToWordCounterTransformer()),
    ('words_to_vectors', WordCounterToVectorTransformer()),
])

In [22]:
X_train_prepared = preprocess_pipeline.fit_transform(X_train)

# Model training

In [23]:
log_clsf = LogisticRegression(solver='liblinear', random_state=11)

In [24]:
score = cross_val_score(log_clsf, X_train_prepared, y_train, cv=3, verbose=3)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s


[CV]  ................................................................
[CV] .................................... , score=0.980, total=   0.0s
[CV]  ................................................................
[CV] .................................... , score=0.988, total=   0.1s
[CV]  ................................................................
[CV] .................................... , score=0.985, total=   0.1s


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s finished


In [25]:
print("Accuracy: {:0.2f}%".format(100*score.mean()))

Accuracy: 98.42%


In [26]:
log_clsf = LogisticRegression(solver='liblinear', random_state=11)
log_clsf.fit(X_train_prepared, y_train)

X_test_prepared = preprocess_pipeline.transform(X_test)
y_test_predicted = log_clsf.predict(X_test_prepared)

print("Precision: {:.2f}%".format(100*precision_score(y_test, y_test_predicted)))
print("Recall: {:.2f}%".format(100*recall_score(y_test, y_test_predicted)))

Precision: 96.40%
Recall: 95.54%


In [27]:
spam_letter = preprocess_pipeline.transform(spam_emails[400])
hui = log_clsf.predict(spam_letter)
hui

array([1])

# Visualizing results

In [28]:
coefs = log_clsf.coef_[0]
vocabulary = preprocess_pipeline.named_steps['words_to_vectors'].vocabulary_
vocabulary['noncommon'] = 0
words_and_coefs = list(zip(coefs, vocabulary))

In [31]:
spam_words = [(value, word) for value, word in words_and_coefs if value > 0]
ham_words = [(value, word) for value, word in words_and_coefs if value < 0] 

print("Anti spam markers")
ham_words.sort()
ham_words

Anti spam markers


[(-2.2243118301081775, 'wrote'),
 (-1.0232320816484044, 'been'),
 (-0.973389203892379, 'have'),
 (-0.7863059640851457, 'look'),
 (-0.5554670050317184, 'net'),
 (-0.5392427043143603, 'do'),
 (-0.5272890265411504, 'can'),
 (-0.5260879941338577, 'sourc'),
 (-0.5163429078308354, 'becaus'),
 (-0.5066237762248326, 'be'),
 (-0.5051787429633611, 'is'),
 (-0.4982470746974006, 'ie'),
 (-0.4979172605048754, 'our'),
 (-0.485056644814553, 's'),
 (-0.47993575056843324, 'take'),
 (-0.47069445681888683, 'are'),
 (-0.4704395297859096, 'email'),
 (-0.4518127945719671, 'and'),
 (-0.43729147474824137, 'use'),
 (-0.40166494648811085, 'group'),
 (-0.39795745231991847, 'some'),
 (-0.39671030045785055, 'industri'),
 (-0.39192369412847267, '_______________________________________________'),
 (-0.38991912095000647, 'also'),
 (-0.3763136331092111, 'like'),
 (-0.3536347323509027, 'too'),
 (-0.3522206698741136, 'link'),
 (-0.3506148255129047, 'within'),
 (-0.3444201866993814, 'right'),
 (-0.3431566674827539, 'thro

I supposed that 'free' will be an absolute spam marker, but it is not. What if it depends on the context? Say "free" + "url" separately have weights -0.334 and -0.1867, but together some positive weight.

In [32]:
print("Spam markers")
spam_words.sort(reverse=True)
spam_words

Spam markers


[(1.4479740530198808, 'set'),
 (1.3347803360047223, 've'),
 (0.8786941483810145, 'user'),
 (0.7503766541133587, 'thing'),
 (0.7173849687043966, 'googl'),
 (0.6546377353715828, 'm'),
 (0.5956335368539786, 'most'),
 (0.5867489304150983, 'john'),
 (0.5858107006796366, 'question'),
 (0.5717247800787512, 'these'),
 (0.5700211544427483, 'thi'),
 (0.5508061708571551, 'xent'),
 (0.4974372460625462, 'book'),
 (0.4834404347505117, 'move'),
 (0.4807977922969873, 'high'),
 (0.4486995924921731, 'rpm'),
 (0.4465127099301383, 'still'),
 (0.4439558363659042, 'list'),
 (0.4291818951902267, 'would'),
 (0.42848779815639804, 'veri'),
 (0.42824693185029106, 'yahoo'),
 (0.42219100894316064, 'them'),
 (0.4144063686695424, 'un'),
 (0.41363307672400457, 'report'),
 (0.38892010712714126, 'we'),
 (0.37964884487654316, 'send'),
 (0.37455144920025435, 'copyright'),
 (0.3735709794191762, 'support'),
 (0.3730512575243844, 'els'),
 (0.3724295152851689, 'internet'),
 (0.3686767905759167, 'site'),
 (0.35377307098892324

Interesting, what does 've' mean?