In [3]:
import os
import tarfile
import urllib.request

DOWNLOAD_ROOT = "http://spamassassin.apache.org/old/publiccorpus/"
HAM_URL = DOWNLOAD_ROOT + "20030228_easy_ham.tar.bz2"
SPAM_URL = DOWNLOAD_ROOT + "20030228_spam.tar.bz2"
SPAM_PATH = os.path.join("datasets", "spam")

def fetch_spam_data(ham_url=HAM_URL , spam_url=SPAM_URL , spam_path=SPAM_PATH):
    if not os.path.isdir(spam_path):
        os.makedirs(spam_path)
    for filename, url in (("ham.tar.bz2",ham_url) , ("spam.tar.bz2", spam_url)):
        path = os.path.join(spam_path , filename)
        if not os.path.isfile(path):
            urllib.request.urlretrieve(url , path)
        extract_file = tarfile.open(path)
        extract_file.extractall(path = spam_path)
        extract_file.close()
    

In [4]:
fetch_spam_data()

In [5]:
HAM_DIR = os.path.join(SPAM_PATH , 'easy_ham')
SPAM_DIR = os.path.join(SPAM_PATH , 'spam')

spam_filesnames = [name for name in sorted(os.listdir(SPAM_DIR)) if len(name) > 5]
ham_filesnames = [name for name in sorted(os.listdir(HAM_DIR)) if len(name) > 5]

In [6]:
print(len(ham_filesnames))
print(len(spam_filesnames))

2500
500


In [7]:
import email
import email.policy

def load_email(is_spam , filename , spam_path = SPAM_PATH):
    directory = "spam" if is_spam else "easy_ham"
    with open(os.path.join(spam_path , directory, filename) , 'rb') as f:
        return email.parser.BytesParser(policy = email.policy.default).parse(f)

In [8]:
ham_emails = [load_email(is_spam = False , filename =name ) for name in ham_filesnames]
spam_email = [load_email(is_spam = True, filename = name) for name in spam_filesnames]

In [9]:
print(ham_emails[0].get_content().strip())
print("-"*100)
print(spam_email[0].get_content().strip())

Date:        Wed, 21 Aug 2002 10:54:46 -0500
    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>
    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>


  | I can't reproduce this error.

For me it is very repeatable... (like every time, without fail).

This is the debug log of the pick happening ...

18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}
18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury
18:19:04 Ftoc_PickMsgs {{1 hit}}
18:19:04 Marking 1 hits
18:19:04 tkerror: syntax error in expression "int ...

Note, if I run the pick command by hand ...

delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury
1 hit

That's where the "1 hit" comes from (obviously).  The version of nmh I'm
using is ...

delta$ pick -version
pick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 

In [10]:
def get_email_structure(email):
    if isinstance(email, str):
        return email
    
    if isinstance(email , list):
        return "multipart({})".format(','.join([
            get_email_structure(sub_email) for sub_email in payload
        ]))
    else:
        return email.get_content_type()

In [11]:
from collections import Counter

def structures_counter(emails):
    structures = Counter()
    for email in emails:
        structure = get_email_structure(email)
        structures[structure] += 1
    return structures

In [12]:
structures_counter(ham_emails).most_common()

[('text/plain', 2408),
 ('multipart/signed', 68),
 ('multipart/mixed', 10),
 ('multipart/alternative', 9),
 ('multipart/related', 3),
 ('multipart/report', 2)]

In [13]:
structures_counter(spam_email).most_common()

[('text/plain', 218),
 ('text/html', 183),
 ('multipart/alternative', 47),
 ('multipart/mixed', 43),
 ('multipart/related', 9)]

In [14]:
for header,values in spam_email[0].items():
    print(header,':' ,  values)

Return-Path : <12a1mailbot1@web.de>
Delivered-To : zzzz@localhost.spamassassin.taint.org
Received : from localhost (localhost [127.0.0.1])	by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 136B943C32	for <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)
Received : from mail.webnote.net [193.120.211.219]	by localhost with POP3 (fetchmail-5.9.0)	for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)
Received : from dd_it7 ([210.97.77.167])	by webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623	for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:09:41 +0100
From : 12a1mailbot1@web.de
Received : from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);	 Sat, 24 Aug 2002 09:42:10 +0900
To : dcek1a1@netsgo.com
Subject : Life Insurance - Why Pay More?
Date : Wed, 21 Aug 2002 20:31:57 -1600
MIME-Version : 1.0
Message-ID : <0103c1042001882DD_IT7@dd_it7>
Content-Type : text/html; charset="iso-8859-1"
Content-Transfer-Encoding : qu

In [15]:
spam_email[0]['Subject']

'Life Insurance - Why Pay More?'

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split

X = np.array(spam_email + ham_emails , dtype= 'object')
y = np.array(len(spam_email)*[1] + len(ham_emails)*[0])

X_train,X_test , y_train,y_test = train_test_split(X,y,test_size = 0.2 , random_state = 42)

In [17]:
import re
from html import unescape

def html_to_plain_text(html):
    
    text = re.sub("<head.*?>.*?</head>" , "", html, flags=re.M | re.S|re.I)
    text = re.sub("<a/s*?>", "HYPERLINK", text,flags = re.M | re.S|re.I)
    text = re.sub("<.?>", " ", text , flags = re.M|re.S|re.I)
    text = re.sub(r"(\s\n)+" , "\n" , text , flags= re.M | re.S|re.I)
    return unescape(text)

In [18]:
html_spam_emails = [email for email in X_train[y_train==1] if get_email_structure(email)=='text/html']
sample_spam_email = html_spam_emails[0]
print(sample_spam_email.get_content().strip()[:1000] +"\n .....")

<p>We thank you for just a moment of your time.  NextResearch is inviting you to join a panel of consumer electronics users now being created to help manufacturers, network programmers, and entertainment companies shape their future offerings.  In exchange for your willingness to participate, there will be prizes and incentives awarded. ALL CONTACT INFORMATION WILL BE HELD IN STRICTEST CONFIDENCE AND WE WILL NEVER TRY TO SELL YOU ANYTHING.  You will be able to opt-out of the panel at any time.</p> 
					Please click here <a href=http://65.19.137.17/nextresearch/nr.htm>http://65.19.137.17/nextresearch/nr.htm</a> if you would like to participate in your first survey and earn a chance to win one of 25 new Digital Video Recorders being awarded in September!  (You do not have to join the panel to participate in this survey.) This is a national market research program conducted with the highest ethical standards. Feel free to contact program director, Jennifer Choate at 901.491.4995 with any

In [19]:
print(html_to_plain_text(sample_spam_email.get_content())[:1000] + "\n ....")

  We thank you for just a moment of your time.  NextResearch is inviting you to join a panel of consumer electronics users now being created to help manufacturers, network programmers, and entertainment companies shape their future offerings.  In exchange for your willingness to participate, there will be prizes and incentives awarded. ALL CONTACT INFORMATION WILL BE HELD IN STRICTEST CONFIDENCE AND WE WILL NEVER TRY TO SELL YOU ANYTHING.  You will be able to opt-out of the panel at any time.</p>
					Please click here <a href=http://65.19.137.17/nextresearch/nr.htm>http://65.19.137.17/nextresearch/nr.htm</a> if you would like to participate in your first survey and earn a chance to win one of 25 new Digital Video Recorders being awarded in September!  (You do not have to join the panel to participate in this survey.) This is a national market research program conducted with the highest ethical standards. Feel free to contact program director, Jennifer Choate at 901.491.4995 with any q

In [20]:
def email_to_text(email):
    html = None
    for part in email.walk():
        ctype = part.get_content_type()

        if not ctype in ('text/plain' , 'text/html'):
            continue
        try:
            content = part.get_content()
        except:
            content = str(part.get_payload())
        if ctype == 'text/plain':
            return content
        else:
            html = content
    if html:
        return html_to_plain_text(html)

In [21]:
print(email_to_text(sample_spam_email)[:100] + "\n ...")

  We thank you for just a moment of your time.  NextResearch is inviting you to join a panel of cons
 ...


In [22]:
# nltk stemmer module
try:
    import nltk
    
    stemmer = nltk.PorterStemmer()
    for word in ("Computaiotn" , "Computing" ,"Computed" , "Compute" , "Compulusive"):
        print("Stemmer word is ",stemmer.stem(word))
except Exception as e:
    print("Exception " + str(e) + "occurs")


Stemmer word is  computaiotn
Stemmer word is  comput
Stemmer word is  comput
Stemmer word is  comput
Stemmer word is  compulus


In [23]:
# url extractor
try:
    import urlextract
    
    url_extractor = urlextract.URLExtract()
    print(url_extract.find_urls("https://github.com/Rohit-33?tab=repositories and https://github.com/Rohit-33/Data-Science"))
except Exception as e:
    print(e)
    

name 'url_extract' is not defined


In [24]:
from sklearn.base import BaseEstimator , TransformerMixin

class EmailToWordCounter(BaseEstimator , TransformerMixin):
    def __init__(self,strip_headers = True , lower_case = True , remove_punctuation = True,
                replace_urls = True , replace_numbers  = True , stemming = True ):
        self.strip_headres = strip_headers
        self.lower_case = lower_case
        self.remove_punctuation = remove_punctuation
        self.replace_urls = replace_urls
        self.replace_numbers = replace_numbers
        self.stemming = stemming
    def fit(self , X  , y = None):
        return self
    def transform(self , X , y = None):
        X_transformed = []
        for email in X:
            text = email_to_text(email) or ""
            if self.lower_case:
                text = text.lower()
            if self.replace_urls and url_extractor is not None:
                urls = list(url_extractor.find_urls(text))
                urls.sort(key = lambda url : len(url) , reverse = True)
                for url in urls:
                    text.replace(url , " URL ")
            if self.replace_numbers:
                text = re.sub(r"\d+(?:\.\d*)?(?:[eE][+-]\d*)?","NUMBER" , text)
            if self.remove_punctuation:
                text = re.sub(r"\W+" , " " , text, flags = re.S |re.M|re.I)
            word_counts = Counter(text.split())
            if self.stemming and stemmer is not None:
                stemmed_word_counts = Counter()
                for word,counts in word_counts.items():
                    stemmed_word = stemmer.stem(word)
                    stemmed_word_counts[stemmed_word] += counts
                word_counts = stemmed_word_counts
            X_transformed.append(word_counts)
        return np.array(X_transformed)

In [25]:
X_few_train = EmailToWordCounter().fit_transform(X_train[0:3])
X_few_train


array([Counter({'number': 9, 'url': 1, 'http': 1, 'www': 1, 'newsisfre': 1, 'com': 1, 'click': 1, 'date': 1, 'numbertnumb': 1, 'bbc': 1, 'report': 1, 'donal': 1, 'macintyr': 1, 'win': 1, 'high': 1, 'profil': 1, 'libel': 1, 'case': 1, 'against': 1, 'polic': 1}),
       Counter({'ie': 4, 'linux': 4, 'waider': 3, 'i': 3, 'if': 2, 'in': 2, 'befor': 2, 'it': 2, 'that': 2, 'ilug': 2, 'you': 1, 're': 1, 'not': 1, 'doolin': 1, 'beg': 1, 'borrow': 1, 'or': 1, 'steal': 1, 'your': 1, 'way': 1, 'there': 1, 'the': 1, 'lbw': 1, 'folk': 1, 'depart': 1, 's': 1, 'far': 1, 'too': 1, 'much': 1, 'fun': 1, 'cheer': 1, 'just': 1, 'back': 1, 'ye': 1, 'is': 1, 'veri': 1, 'person': 1, 'of': 1, 'me': 1, 'we': 1, 'are': 1, 'fact': 1, 'well': 1, 'and': 1, 'truli': 1, 'doom': 1, 'she': 1, 'say': 1, 'leav': 1, 'now': 1, 'can': 1, 'probabl': 1, 'get': 1, 'a': 1, 'good': 1, 'head': 1, 'start': 1, 'they': 1, 'realiz': 1, 'm': 1, 'gone': 1, 'jami': 1, 'zawinski': 1, 'irish': 1, 'user': 1, 'group': 1, 'http': 1, 'www': 

In [26]:
from scipy.sparse import csr_matrix

class WordCounterToVector(BaseEstimator , TransformerMixin):
    def __init__(self , vocabulary_size = 1000):
        self.vocabulary_size = vocabulary_size
    def fit(self , X , y = None):
        total_count = Counter()
        for word_count in X:
            for word , count in word_count.items():
                total_count[word] += min(count , 10)
        most_common = total_count.most_common()[:self.vocabulary_size]
        self.vocabulary_ = {word : index+1 for index , (word , count) in enumerate(most_common)}
        return self
    def transform(self , X , y =None):
        rows = []
        cols = []
        data = []
        for row , word_count in enumerate(X):
            for word , count in word_count.items():
                rows.append(row)
                cols.append(self.vocabulary_.get(word , 0))
                data.append(count)
        return csr_matrix((data , (rows , cols)) , shape = (len(X) , self.vocabulary_size +1))

In [27]:
vocab_trans = WordCounterToVector(vocabulary_size= 10)
X_few_vocab = vocab_trans.fit_transform(X_few_train)
print(X_few_vocab.toarray())

[[ 19   9   0   0   0   0   0   0   0   0   0]
 [ 84   0   3   2   1   1   1   1   0   0   0]
 [341   5  14  12  10  14  16  18  15  13   9]]


In [28]:
vocab_trans.vocabulary_

{'number': 1,
 'i': 2,
 'in': 3,
 'you': 4,
 'the': 5,
 'of': 6,
 'and': 7,
 'to': 8,
 'my': 9,
 'thi': 10}

In [29]:
from sklearn.linear_model  import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

preprocess_pipeline = Pipeline([
    ('email_to_wordcount',EmailToWordCounter()),
    ('wordcount_to_vectors' ,WordCounterToVector()),
])

X_train_transformed = preprocess_pipeline.fit_transform(X_train)

In [30]:
model = LogisticRegression(solver = 'lbfgs' , max_iter = 1000 , random_state = 42)
score = cross_val_score(model , X_train_transformed , y_train , cv = 3 , verbose = 4)
print(max(score))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.988) total time=   0.2s
[CV] END ................................ score: (test=0.990) total time=   0.0s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] END ................................ score: (test=0.990) total time=   0.2s
0.99


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.7s finished


In [32]:
from sklearn.metrics import precision_score , recall_score

X_test_transformed = preprocess_pipeline.transform(X_test)

model.fit(X_train_transformed , y_train)

y_pred = model.predict(X_test_transformed)

print("Precession : {:.2f}%".format(100 *precision_score(y_test , y_pred)))
print("Recall :{:.2f}%".format(100 *recall_score(y_test , y_pred)))

Precession : 100.00%
Recall :94.92%
