# Spam Email Classification

In [21]:
# used for manipulating directory paths
import os
# Scientific and vector computation for python
import numpy as np
# Import regular expressions to process emails
import re
# Plotting library
from matplotlib import pyplot
# Optimization module in scipy
from scipy import optimize
from sklearn import svm
# will be used to load MATLAB mat datafile format
from scipy.io import loadmat
from collections import Counter
import utils
# tells matplotlib to embed plots within the notebook
%matplotlib inline

The dataset was taken from http://spamassassin.apache.org/old/publiccorpus/

In [22]:
#for raw emails
def process(email_contents, verbose=True):
    """
    Preprocesses the body of an email and returns a list of indices 
    of the words contained in the email.    
    
    Parameters
    ----------
    email_contents : str
        A string containing one email. 
    
    verbose : bool
        If True, print the resulting email after processing.
    
    Returns
    -------
    word_indices : list
        A list of integers containing the index of each word in the 
    """
    

    
    # Lower case
    hdrstart = email_contents.find(chr(10) + chr(10))
    email_contents = email_contents[hdrstart:]
    email_contents = email_contents.lower()
    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents =re.compile('<[^<>]+>').sub(' ', email_contents)
    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)
    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\s]*').sub(' httpaddr ', email_contents)
    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\s]+@[^\s]+').sub(' emailaddr ', email_contents)
    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)
    # get rid of any punctuation
    email_contents = re.split('[ @$/#.-:&*+=\[\]?!(){},''">_<;%\n\r]', email_contents)
    # remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]
    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return processed_email

In [23]:
vocab = {}
count = 0
for filename in os.listdir('spam'):
    count += 1
    print(filename,"filenumber = ",count)
    file = os.path.join('spam',filename)
    if os.path.isfile(file):
        fid = open(file)
    else:
        pass
    try:
        email_contents = fid.read()
        processedContents = process(email_contents)
        for word in processedContents:
            if word not in vocab.keys():
                vocab[word] = 1
            else:
                vocab[word] += 1
    except:
        pass
print(vocab)
        

0132.7ac2141ed9a163f934ac65b3f59a2a03 filenumber =  1
----------------
Processed email:
----------------
milfhunt milf hunter do you know where your mom is more sampl pic nbsp nbsp nbsp nbsp more sampl movi nbsp nbsp nbsp nbsp list of milf nbsp click here to enlarg your peni number  number inch natur nbsp nbsp click here to be remov
0313.5126f820bf11ba460e2c1611cee632c1 filenumber =  2
----------------
Processed email:
----------------
 nextpart number number e number number a number c number d e number e number contenttyp text plain charset iso number  number contenttransferencod base number sgvyzsdzihrozsbob number r number zxn number ihroaw number nigluierwrhmui number vdyb number b number ugi number fu ig number ha number ugysbwzxjzb number hbcbiywnrdxancmnvchkgb number ygysbevkqgcmlnahqg b number bybdrc number sliagt number vyicjib number qiig number ldybzb number z number d number fyzsblyxnpbhkgdgfr zxmgew number ihrocm number z number gncnrozsbzdgvwcyb number bybtywtligegi numbe

----------------
Processed email:
----------------
thi is a multipart messag in mime format  nextpart number hzmysbwvsemnjin number kg number y contenttyp multipart altern boundari  nextpart number hzmysbwvsemnjin number kg number yaa  nextpart number hzmysbwvsemnjin number kg number yaa contenttyp text html charset big number contenttransferencod base number pgh number bwwgeg number sbnm number dj number idxjuonnjagvtyxmtbwljcm number zb number z number lwnvbtp number bwwidqp number bwxuczpvpsj number cm number c number nozw number hcy number tawnyb number nvznqty number tom number mzmljztpvzmzpy number uidqp number bwxuczp number psj number cm number c number no zw number hcy number tawnyb number nvznqty number tom number mzmljztp number b number jkig number keg number sbnm number imh number dha number ly number d number cudzmub number jn l number rsl number jfqy number odg number sndaipg number kdqo number agvhzd number ncjxtzxrhigh number dhatzxf number axi number q number udgvudc 

----------------
Processed email:
----------------
dear user do you ever wish you could easili call peopl you know in other countri for up to number less than standard call price and then to make these save without have to subscrib to ani low cost call servic we have now launch a product that doe exactli that you can now call peopl in most popular destin around the world for onli number cent per minut there ar no hidden charg you do not need to signup us ani credit card or pai ani extra bill you can try thi servic at no risk and choos to us it with no commit to us thi new servic simpli dial our access number number number number and onc connect dial the actual intern number you wish to call for more inform and the current list of countri you can call pleas check our websit httpaddr exampl if you want to call a german number number number number you would number dial number number number number number wait until you connect to our system and hear a messag ask you to dial the number you 

----------------
Processed email:
----------------
thi is a multipart messag in mime format  nextpart number number d number c number e number a number bd number e number contenttyp text plain charset window number contenttransferencod number bit ani wai you slice it ge lifetim protector is a cut abov underwritten by gener electr capit assur compani and first coloni life insur compani ge lifetim protector univers life insur cook the competit with competit premium male ag number best dollar number number face amount annual premium ge lifetim protectorsm lifetim premium number dollar number number csv dollar number ag number dollar number number  csv face amt ag number dollar number number  product j dollar number number  dollar number number  dollar number number  product i dollar number number  dollar number number  dollar number number  product l dollar number number  dollar number number  dollar number number  product s dollar number number  dollar number number  dollar number number

----------------
Processed email:
----------------
thi is a multipart messag in mime format  nextpart number number bc number b number number d number ccc number contenttyp multipart altern boundari  nextpart number c number abaf number ef number c number  nextpart number c number abaf number ef number c number contenttyp text plain charset window number contenttransferencod quotedprint dear sir madam wish you a wonder dai number e with an offer to save you monei and time number e shop from the conveni from home or offic number e window shop a new wai to go window shop number e our onlin super store offer over number qualiti product with number plu categori number e for you to window shop through number e number the follow ar ecoupon number which you can us with ani order number e y ou place from johnson home product onlin super store number e dure the check out process of our onlin secur shop cart system ord er form you will be prompt to enter in the ecoupon number number e which will

----------------
Processed email:
----------------
thi is a multipart messag in mime format  nextpart number number f number number c number b number number be number ed number contenttyp text plain charset window number contenttransferencod number bit save annuiti client from sink renew rate  save annuiti client from sink renew rate  save annuiti client from sink renew rate  no surrend charg if the annuiti renew below the bailout  number number lifetim bailout number number year number number number number base rate number commiss  number penaltyfre withdraw rate a excel by a m best for financi strength  number year surrend charg not a number tier annuiti  no annuit requir call todai for more inform on the loyal integritysm vision number annuiti number  number  number or pleas fill out the form below for more inform name    email   phone   citi   state     fairlan financi corpor the contract base interest rate must fall more than number basi point below the initi base interest rate ef

----------------
Processed email:
----------------
 nextpart number number e number number e number e number e b number d number contenttyp text plain charset iso number  number contenttransferencod base number qvruru number usu number ooibuaglziglzigegtvvtvcbmb number igquxmienvbxb number dgvyifvz zxjzisehdqonciporvctu number bly number lhbcbqywnrywdlierlywwhkg number kdqpob number j number b number gu number lzdgvtv number ya number mgmjawmibtb number z number d number fyzsbtdwl number zsatuhjvzmvzc number lv bmfsievkaxrpb number tdqoncklui number x number zgvzifnpecatifllcya number isatiezlyxr number cmutugfja number vkifv number awxpdgllcw number kquxmiezvciaxifnwzwnpywwgt number xifbi awnliq number kdqpuaglzifnvznr number yxjlifdpbgw number dqotifbyb number rly number qgew number cibj b number wdxrlcibmcm number tihvud number fudgvkigfuzcboyxphcmrvdxmgdmlydxnlcw number k lsbizwxwihnli number vyzsb number b number vyihbyaxzhdgugjib number ywx number ywjszsbpbmzvcm number h dglvbg n

In [24]:
for filename in os.listdir('easy_ham'):
    count += 1
    print(filename,"filenumber = ",count)
    file = os.path.join('easy_ham',filename)
    if os.path.isfile(file):
        fid = open(file)
    else:
        pass
    try:
        email_contents = fid.read()
        processedContents = process(email_contents)
        for word in processedContents:
            if word not in vocab.keys():
                vocab[word] = 1
            else:
                vocab[word] += 1
    except:
        pass
print(vocab)
        

1226.7d2cdef974037d299c744b765e792641 filenumber =  493
----------------
Processed email:
----------------
begin pgp sign messag hash sha number contenttyp text plain charset usascii in messag hal devor write on mon number sep number ted ted cabeen wrote ted here the code for everybodi and the list archiv in ted case anyon want it in the futur veri cool i vote for thi be ad to cv ani object not from me although you probabl want to chang the name of the procedur   ted cabeen httpaddr emailaddr check websit or keyserv for pgp gpg kei ba number d number emailaddr i have taken all knowledg to be my provinc f bacon emailaddr human kind cannot bear veri much realiti t s eliot emailaddr begin pgp signatur version gnupg v number number number freebsd comment exmh version number number number number number id number dbqe number mk number oayjflodsdira number akdyeodgfygsuldyusalxjtbcspxuqcdhafg z number p number fqmrjlqcbvmkfjmngji dc number l end pgp signatur exmhwork mail list emailaddr httpa

----------------
Processed email:
----------------
on fri number sep number russel turpin wrote you seem not to know what a poor man divorc is i know veri littl in gener i hope you can excus me for that it is an old term from the time when divorc wa difficult but walk wa easi and ident wa not so lock down as it is todai not everi widow had a dead husband yeah you could alwai run awai strangl your wife your wife could alwai poison you scoobi doobi doo it wasnt the rule and i dont feel like desintegr into a nitpick orgi you win im see lack of innov that doesnt tell us anyth except what is happen in eugen leitl life the more common yeah i happen to live in a small hole under the root of an old oak tree you dont so innov is a global phenomenon observ is that the rate of chang is increas do you have ani data that might persuad us that what you see is more tell than what other see gerontocraci favor gerontocraci i would have thought that gerontocraci favor biotech research and plenti of youn

----------------
Processed email:
----------------
iv got a test set here that the last number and a bit year email to emailaddr and emailaddr  it a realli ugli set of number number messag current broken into number number spam number number ham number number current unclassifi these address ar all over the number some differ ekit ekno isiconnect websit so thei get a lot of spam as well as the usual spam it also ha custom complain about credit card charg it ha peopl interest in the servic and ask question about long distanc rate c c c lot and lot of commerci speech in other word stuff that sa get pretti badli wrong im current mangl it by feed all part text html whatev els into the filter as well as both a select number of header to from contenttyp xmailer and also a list of header count of header thi is show up some nice stuff  e g the xuidl that stoopid spammer blindli copi into their messag i did have receiv in there but it out for the moment as it caus rate to drop im also strip out

----------------
Processed email:
----------------
 begin forward text statu ro date mon number sep number number number number  number to emailaddr from phil duncan subject the war prayer sender emailaddr replyto phil duncan the follow prayer is from a stori by mark twain and wa quot by lewi laphan in the octob issu of harper magazin it occur at the veri end of an excel articl which i recommend to you in the stori an old man enter a church where the congreg ha been listen to an heroic sermon about the glori to be won in battl by young patriot arm with the love of god he usurp the pulpit and prai the follow o lord our god help us to tear their soldier to bloodi shread with our shell help us to cover their smile field with the pale form of their patriot dead help us to drown the thunder of the gun with the shriek of their wound writh in pain help us to lai wast their humbl home with a hurrican of fire help us to wring the heart of their unoffend widow with unavail grief help us to turn 

----------------
Processed email:
----------------
begin pgp sign messag hash sha number thug of south boston and the reveng of the bandit princess the geodes economi robert a hettinga sundai august number number boston when you think about it on wai the fbi winter hill vs patriarcha angiulo cosa nostra fight wa just anoth race war between thug put crude and at it most racist the fbi and the winter hill gang were the mostli irish thug and patriarcha famili were of cours the mostli italian thug think scorses upcom gang of new york onli with counterreformatori overton hoover south boston socialclub putsch start in the mid number s wa particularli audaci in hindsight the u s feder govern actual decid to underwrit a revers of the prohibitionera captur of the nation racket by the italian from the irish the fact that the plot wa hatch not for new york but for south boston the most irish place in the us onli make even more gigant the big lie that wa told by the fbi to it ostens polit master a

----------------
Processed email:
----------------
bill stoddard wrote gab the problem is that polit have gotten so muddi gab nowadai that shout down and unpeac disrupt gab polit ralli that you dont agre with ha becom gab common practic the court have constantli rule gab that there ar some restrict on the first amend gab thei teach you that your veri first year of law school ill agre with owen on thi on muddi my ass how hard is it to chose between a republocrat or a demipublican not veri shout down ha grown to becom the answer becaus the govern over a span of year and with the help of the court ha limit the right we have as citizen under the first amend wish think peopl ar just bigger dickhead now cultur is chang and it is becom accept to get in peopl face and shout them down when you disagre with them the peopl that do thi ar not disenfranchis thei get their rock off on be disagre asshol the act of protest is more import than the actual issu be protest for most of these peopl in my ex

----------------
Processed email:
----------------
of the three ly politician which liar would you take  origin messag  from john hall to fork sent mondai septemb number number number number pm subject re goodby global warm i did not have sex with that woman origin messag from emailaddr emailaddr on behalf of mr fork sent mondai septemb number number number number pm to fork subject re goodby global warm  origin messag  from john hall a green onc said that if the spot owl hadnt exist thei would have had to invent it a republican onc said i am not a crook
2539.466ecd08a54bf64e857522423a33108a filenumber =  847
----------------
Processed email:
----------------
url httpaddr number number number date number  number  number t number number number number number world latest warn to next pm as pakistan goe to the poll
1540.e6859a3f2b7d4347f84df81b2398ae58 filenumber =  848
----------------
Processed email:
----------------
uhn qo number qbuvplonb contenttyp text plain charset usascii content

----------------
Processed email:
----------------
i just set up razor and spamassassin but i keep get thi error in my mail log file razor number check skip no such file or directori cant call method log on unbless refer at usr local lib perl number site perl number number number razor number client agent pm line number i have look through the archiv list and the onli thing i have seen about thi error is a possibl permiss problem on the log file i did what it said in the archiv basic chang the permiss on the file but it still no go ani other help would be appreci mayb im miss someth someth i forgot to run or do rob  thi sf net email is sponsor by osdn  tire of that same old cell phone get a new here for free httpaddr number refcod number vs number razorus mail list emailaddr httpaddr
1168.624b64b63fb0dcf81b58856f0618c3a0 filenumber =  959
----------------
Processed email:
----------------
onc upon a time peter wrote i start wonder how doe apt react when it find a newer kernel in the bu

----------------
Processed email:
----------------
on number sep number richard bartlett wrote richard i have a custom who is develop some printer driver code to allow custom driver set nup booklet duplex etc to be save up to the server to be retriev by other user the data is be written by a printer driver us the log on user authent to a registri kei hklmsystemcurrentcontrolsetcontrolprintenvironmentswindow nt x number driversvers number  driver name  custom kei subkei let me get thi straight a registri kei is load from the server onto the client workstat who can modifi it then write it back onto the server own registri  which is not go to us it the question is what ar the secur risk of allow user to write to thi kei the data is string data in the form of delimit numer valu thi data is then retriev by capabl printer driver and interpret the risk as i see it ar twofold number the risk of a compromis to the server us thi registri kei i think thi is unlikelei as the server itself doe not 

----------------
Processed email:
----------------
on tue number aug number jon wrote i ha just been given an old toshiba cs number with earliest pentium and number mb of hd but onli a floppi drive on it it got win number number which is funni to see again but gonna be clear as soon as i stop mess with it what i wa wonder wa could anyon advis what o s would be good for thi i want a small usabl nix distro for it that i can transfer to it fom floppi connect thi to winblow i know that winblow allow piertopi connect over serial and parellel port to other winblow but is thi easi do for connect winblow to nix have done exactli thi with debian onli i us a pcmcia network card and did it off ftp esat net ucd bandwidth is rather good howev if youv anoth machin look into thi null modem cabl jobbi httpaddr number cs html whether you can connect that to direct cabl connection no idea you could howev put the file onto window boot onto a ilug bbc tm and mount the fat number partit easier quicker wai 

----------------
Processed email:
----------------
url httpaddr number number number date number  number  number t number number number number number some albino rhino
2228.6a8e7296030921e6cc2cdbea41212577 filenumber =  1100
----------------
Processed email:
----------------
url httpaddr number number number date number  number  number t number number number number number abc net au
1172.bdc831d97e06b2539209e0ad04e671e9 filenumber =  1101
----------------
Processed email:
----------------
do i need to do anyth to recreat anyth after delet thi i did notic an rpm i made the other dai didnt work and just sat there for ag seemingli do noth which probabl did thi ok now i get fetch number number kb in number m number s number b s error cannot get exclus lock on var lib rpm packag error cannot open packag index us db number  oper not permit number e could not open rpm databas cannot open packag index us db number  oper not permit number arrrrg on tuesdai februari number number number number n

----------------
Processed email:
----------------
gari lawrenc murphi cynic hmmm just as i thought in other word it ha no practic us whatsoev tourism is the world largest industri us thi to preview your travel or figur out where you ar would be veri valuabl onlin game continu to grow screw britannia reallif britain would be a fun world to wander conquer explor virtual in roleplai or real timestrategi game and of cours as jame roger point out it an ideal displai substrat for all sort of other overlaid data map ar great photrealist number d map of everywher which can have mani other static and dynam dataset overlaid ar spectacular combin those last two thought consid the static world map in fade color with patch hereandther cover by live webcam stitch over the static info in bright color itd be like the fog of war view in game like warcraft over the real world  gordon
0492.3aa3aaa0ac9343fd1aa11864e8c281b1 filenumber =  1194
----------------
Processed email:
----------------
on wed numbe

----------------
Processed email:
----------------
on thu sep number number at number number number pm  number leland woodburi wrote i found a nice littl perl script for thi purpos call rotat which make the process of rotat log file veri simpl if there an offici sourc for thi script i couldnt find it my host provid pair com ha it instal and that where i found it howev redistribut appear to be allow so iv attach it thank for the script it also appear that the standard logrot tool includ with mani system or at least redhat system will support wildcard when rotat file so someth like home razor razorag log can be specifi dave  thi sf net email is sponsor by osdn  tire of that same old cell phone get a new here for free httpaddr number refcod number vs number razorus mail list emailaddr httpaddr
2467.5c3cff250c85a50dd75b0b9e1ffaac3f filenumber =  1213
----------------
Processed email:
----------------
url httpaddr number date not suppli art and letter daili a wonder and dens blog ha fold up

----------------
Processed email:
----------------
matthia saou wrote id like to ak thi on the rpmzzzlist would a new depend of number k the alsalib packag for mani packag mplayer ogl xine be a problem for the freshrpm net packag user as i realli feel like blend alsa in now especi sinc iv just spent some time recompil alsakernel packag for all the psych kernel i dont know a lot about alsa but us custom kernel for mani of the machin  would thi mean in order to get mplayer eg to work id first have to compil a custom kernel option or kernel modul sinc i wouldnt have your alsakernel instal would that mean alsalib and so on wont instal i guess in short if it requir the ship kernel to be us id be against withalsa as a default te  troi engel system engin cool as the other side of the pillow rpmlist mail list httpaddr
1056.40e105e66c28374d4fffa209a7176959 filenumber =  1288
----------------
Processed email:
----------------
i us exmh number number with procmail for presort incom mail and move 

----------------
Processed email:
----------------
greg in case it wasnt obviou im a strong propon of filter greg junk mail as earli as possibl ie right after the smtp data greg command ha been complet filter spam at the mua just seem greg stupid to me  by the time it get to me mua the spammer ha greg alreadi stolen my bandwidth the two problem i see with filter that earli ar number everyon receiv email via that server will contribut ham to the stew make the bayesian classif less effect number given that there will be some fals posit you absolut have to put the mail somewher you cant simpli delet it i also dont like the tmdaish busi of repli with a msg that sai here what you do to realli get your messag to me that put an extra burden on my correspond as an individu i would prefer you put spammish messag somewher where i can review them not an anonym sysadmin who i might not trust with my person email noth against you greg  i person prefer to manag thi stuff at the user agent level band

----------------
Processed email:
----------------
man threaten explos in moscow thursdai august number number number number pm moscow ap  secur offic on thursdai seiz an unidentifi man who said he wa arm with explos and threaten to blow up hi truck in front of russia feder secur servic headquart in moscow ntv televis report the offic seiz an automat rifl the man wa carri then the man got out of the truck and wa taken into custodi ntv said no other detail were immedi avail the man had demand talk with high govern offici the interfax and itartass new agenc said ekho moskvi radio report that he want to talk with russian presid vladimir putin polic and secur forc rush to the secur servic build within block of the kremlin red squar and the bolshoi ballet and surround the man who claim to have on and a half ton of explos the new agenc said negoti continu for about on and a half hour outsid the build itartass and interfax report cite wit the man later drove awai from the build under polic es

----------------
Processed email:
----------------
on saturdai septemb number number at number number pm gari lawrenc murphi wrote although it like a total shock to number number number nine of all the emploi websit design out there the truth is webform can accept u s of a as a countri incred but true web form can also accept multipl or even freeform telephon number and can even be partit into manag step all thi can also be done without sell exclus right to your wallet to the world secondrichest corpor assum cisco is still number and vendor lock your busi into their small transact fee tith ye but thi is what normal happen engin we can put an input valid parser on the backend to do that design there a javascript librari that can do some of the prevalid creativ director i want it in blue with a zoom logo engin can we get to that later we need to meet function spec creativ director you dont understand i want it in blue creativ director oh and the site launch thi fridai becaus i sent out a

----------------
Processed email:
----------------
on mon number  number  number at number number matthia saou wrote onc upon a time alvi wrote thank i seem to be have problem with rebuild transcod from src it get confus from previou error and give up could you give the output of the error thi is onli the last part of it i us rpmbuild rebuild without avifil transcod number number number fr number src rpm af number decor cpp number waveformatex is us as a type but is not defin as a type af number decor cpp number pars error befor if af number decor cpp number syntax error befor   token af number decor cpp number wvfmt wa not declar in thi scope af number decor cpp number avm wave format name wa not declar in thi scope af number decor cpp number wvfmt wa not declar in thi scope af number decor cpp number wvfmt wa not declar in thi scope af number decor cpp number wvfmt wa not declar in thi scope af number decor cpp number iso c forbid declar of fprintf with no type af number decor cpp nu

----------------
Processed email:
----------------
updat of cvsroot spamassassin spamassassin mass in directori uswprcv number tmp cvsserv number mass modifi file tag b number number number masscheck log messag fix addit to masscheck index masscheck rc file cvsroot spamassassin spamassassin mass masscheck v retriev revis number number retriev revis number number number number diff b w u d r number number r number number number number  masscheck number aug number number number number  number  number number masscheck number aug number number number number  number  number number number number  number number number number print masscheck result from emailaddr on dollar whenn print m sa version dollar spamtest version n print  cv tag dollar name dollar  n dollar iter set function  want dollar iter run emailaddr exit  number number number number my dollar test dollar statu get name of test hit dollar test join   sort split dollar test dollar id  s s g printf s number d s sn  dollar yorn y   

----------------
Processed email:
----------------
us perl daili headlin mailer dyndn org offer free dn to perl site post by km on tuesdai septemb number number number new httpaddr number number number number copyright number  number pudg all right reserv you have receiv thi messag becaus you subscrib to it on us perl to stop receiv thi and other messag from us perl or to add more messag or chang your prefer pleas go to your user page  httpaddr you can log in and chang your prefer from there
0588.cb6d96aaef48d53bc690e780276d6595 filenumber =  1650
----------------
Processed email:
----------------
at number number am  number on number number number gordon mohr wrote of cours thi sai veri littl almost noth about the overal popul behavior gai or straight and the rel preval of number k individu in either group but it doe strongli suggest that gai male with number k partner exist in measur number so peopl should stop treat eugen anecdot estim as if it were sheer fantasi bitbitch own citat 

----------------
Processed email:
----------------
onc upon a time vill wrote thank a lot the rpm seem to be fine thei work for me out of the box on vanilla valhalla w latest errata except that i dont see an init script in the rpm a sampl on design for rh is suppos to be in util alsasound could you take a look if it can be includ it doesnt need to as red hat linux alreadi set correct permiss on all alsa audio devic for local log in user through the consol perm file and the modul conf file take care of load the right modul on demand also aumix and the script that come with red hat linux still work for control the volum so it still save and restor when the comput is halt even us alsa im glad you got your card work with these im now wonder if i wont mayb bui an amplifi that support dolbi digial decod my current on onli doe pro logic sinc iv read that alsa support the s pdif optic output of the sound chip of my shuttl httpaddr from what i can tell after onli number dai us it alsa rock espe

----------------
Processed email:
----------------
relat anecdot i wa eat in a restaur in chinatown in boston the place wa empti the onli other custom wa a white gui read an asianlanguag newspap the gui ask the waiter for help translat a word eventu hi stori came out he had marri an asian woman through on of these introduct servic after about a year of marriag she had charg him with assault and left him leav no contact inform he wa hang around in chinatown ask random asian for help find her i obvious dont know if he did assault her but what struck me wa that the possibl of mutual exploit is high anecdot number two in colleg i had a job as a street vendor there wa a gui i work with who wa a lifer in the job he wa a notic mess up gui among other odd characterist he fawn on women custom do stuff like offer them flower i ask him about it he said that hed never had sex with a woman who wasnt a prostitut and hi dream wa to save up enough monei to get a mail order bride i wa realli move the g

----------------
Processed email:
----------------
on fri number sep number wendi wendi p robert wrote wendi iv been try to set a button call which wendi automat forward mail us a form mycomp wendi without go through the editor but so far havent got wendi the right recip i have on that us dist to send stuff from my work mail to my home mail the bind look like thi set bind kei hd dist silent form distcomp tome the hd dist silent is a hack up version of the proc that doe dist in exmh and is past in below it get old and probabl should be resync with the current code but it still work on a rel recent cv copi of exmh it also possibl that there an easier wai but i hack thi togeth quickli a year or more ago and when it work i move on to other task hal proc hd dist silent arg global exmh msg set exmh ctype dist if string length dollar arg number set arg mh distsetup if msgok dollar msg id m if string compar info command dollar arg dollar arg number  old interfac with hook procedur  if catch do

----------------
Processed email:
----------------
dan kohn write gui the habea infring list hil exist explicitli to deal with spammer while were get judgment against them and especi in other countri where those judgment ar harder to get my concern doesnt stem from fail to understand how your busi is intend to work my concern is the lack of empir evid that it will reduc the amount of uncaught spam pleas note that nobodi ha ever had an incent befor to go after regular spammer ye some attornei gener have prosecut blatant pyramid scheme and isp have won some theft of servic suit but the vast major of spammer go forward with out ani legal hassl so i cant understand how daniel can assert that you cant track spammer down when it never realli been tri pleas dont misquot me i did not assert that you cant track spammer here is what i said  it will be difficult to find prosecut and win monei from someon in  variou nonfriendli countri where spam origin china is a good  exampl even if thei do offi

----------------
Processed email:
----------------
hi all i have a question which is a bit tricki and wa wonder of anyon ha come across thi problem befor or could point me in the right direct i am involv in port a sco unix applic to linux and we have encount a problem with the wai semaphor ar be handl the applic us mulitpl process to run applic code with the main process known as the bsh which control all i o be it screen or file i o syncronis is handl via semaphor in certain circumst the main process and the applic child process seem to lock up both wait for the syncronis semaphor to chang state i have attach ddd to the process and it seem that the semaphor code is do the correct thing for syncronis but the process stai stuck in the semop system call i have also notic that if i introduc a slight delai between chang semaphor state the problem goe awai but thi caus our entir applic to run realli sloooww lol is there anyth weird or differ with the standard implemen of semaphor on modern 

----------------
Processed email:
----------------
us lm number is yield an extra number a dai but it get fals posit where it shouldnt such as an email with a word doc and the signatur below after look at the word doc direct to the sender cabin i am convinc it mark the bodi which contain no next except the incredimail advertis signatur as spam so i have to turn off lm number razor ha been get other strang email it shouldnt with lm number on see the incredimail ad signatur i am talk about below fox incredimail  email ha final evolv  click here  thi sf net email is sponsor by thinkgeek welcom to geek heaven httpaddr razorus mail list emailaddr httpaddr
0283.469fc9946c6d920af042b022bd63a2f9 filenumber =  2036
----------------
Processed email:
----------------
even standard  number august number deft us of fortean unit of measur in number nd para  ma lost penguin found aliv by charl miranda a coloni of emperor penguin which wa thought to have starv to death in antarctica ha been found aliv

----------------
Processed email:
----------------
url httpaddr number html date not suppli were try to decid if fogbugz number number number should support custom field histor i am oppos to custom field in principl becaus thei get abus peopl add so mani field to their bug databas to captur everyth thei think might be import that enter a bug is like appli to harvard end result peopl dont enter bug which is much much wors than not captur all that inform you can alwai page fault to get the inform if the origin report forgot it rather than have a field in everi bug where you enter the version number of everi dll on your machin thi is an actual custom request inform which is like to be relev onli for a tini percentag of bug why not just have the programmerassigne look at the bug first and if thei think it might be dllversionrel onli then bounc the bug back to the origin ask for the dll info similarli it alwai tempt to add a field in which you ask for the os version in which the bug occur t

----------------
Processed email:
----------------
a world where some live in comfort and plenti while half of the human race live on less than dollar number a dai is neither just nor stabl absolut correct perhap the most fundament thing to realiz about life on earth todai the follow is a fascin document of offici govern polici that bear close read it is the aspir of a wonder nation in an imperfect world the war on terror is not a clash of civil it doe howev reveal the clash insid a civil a battl for the futur of the muslim world thi is a struggl of idea and thi is an area where america must excel i wa recent at a lectur about the surpris success of radio sawa our new musicandnew channel for number  number year old arab it number in practic everi market it enter near number listenership in amman and it even begin to be trust for new well past bbc and take share from everi other govern broadcast it is as hard to imagin america lose a war of idea in the longterm as it is to imagin americ

----------------
Processed email:
----------------
url httpaddr number html date number  number  number t number number number  number number thank to an anonym helper iv mirror all number iso imag for red hat number number ohio  httpaddr number ds number connect via sprint california  httpaddr number multipl gigabit connect now be nice 
2130.195623b2a5a8f7c796ed92d031471e22 filenumber =  2236
----------------
Processed email:
----------------
url httpaddr number number number date number  number  number t number number number number number orlando sentinel
0877.62f5636ba5885d1b92423169c83a35b9 filenumber =  2237
----------------
Processed email:
----------------
bill stoddard wrote wish think peopl ar just bigger dickhead now nah theyv alwai been thi wai cultur is chang and it is becom accept to get in peopl face and shout them down when you disagre with them cultur chang yeh those who fail to learn from histori  what doe it tell you when a market chain is name bread and circu and noo

----------------
Processed email:
----------------
joseph s barrera iii wrote chri haun wrote a lifegem is a certifi high qualiti diamond creat from the carbon of your love on as a memori to their uniqu and wonder life why wait until your dead im sure there enough carbon in the fat from your typic liposuct job to make a decent diamond  joe oh hell  what about excrement id love to be abl to sai  no the sun doesnt shine out of my ass but there the occasion diamond  owen httpaddr
1914.d2230228f70b1889cc9e23ac921bb969 filenumber =  2317
----------------
Processed email:
----------------
us perl daili headlin mailer mail list judo movi avail post by gnat on wednesdai octob number number number new httpaddr number number number number confer present judo movi avail post by gnat on wednesdai octob number number number new httpaddr number number number number copyright number  number pudg all right reserv you have receiv thi messag becaus you subscrib to it on us perl to stop receiv thi and ot

----------------
Processed email:
----------------
on tue number aug number number number number  number est jai lake wrote second on could make the assumpt that ancient or futur civil would not be hydrocarbon base there ar altern fuel sourc includ seab methan biomass and all the usual suspect  solar hydro etc some of these could be exploit on a decidedli lowtech ie emerg civil basi howev it is difficult to conceiv of an industri civil that doesnt emploi wheel axl and bear all of which requir lubric im not an engin robin anyon but it my understand that veget lubric break down under stress and that oil or graphit lubric ar the onli reason choic for high temperatur high rotat applic at least prior to extrem advanc mode of chemic synthesi thi is a good point there ar a lot of altern to hydrocarbon product deriv from petroleum but these have often been develop as a replac for petroleum after the technolog ha been establish  there is a grow industri in plantderiv plastic and lubric but thi 

----------------
Processed email:
----------------
british scientist were honour for research that found ostrich becam more amor with each other when a human wa around in fact ostrich eventu start put the move on human thi is true of manate also you dont want to know origin messag from emailaddr emailaddr behalf of carei sent fridai octob number number number number pm to emailaddr subject headlin  navel gaze win an ig nobel greet carei want you to know about a stori on www theag com au person messag ah the ig nobel alwai worth a read if onli thei had a catmood decipher   navel gaze win an ig nobel by jai lindsai boston octob number number url httpaddr number number number number html
2145.c986f253379064227829b00592359c15 filenumber =  2459
----------------
Processed email:
----------------
url httpaddr number number number date number  number  number t number number number number number it wasnt just the lack of welli that made thi protest differ  it wa the mix of the marcher write eu

----------------
Processed email:
----------------
if you need to store a databas password then clearli the first step is to store the text outsid the web tree you can encrypt it and store the encrypt kei elsewher so that at least an attack ha to get two differ thing also dont get full privileg  creat a user account that is grant veri limit access howev you can often do better than thi if secur is critic creat a separ program which ha these databas kei as note abov and make the web program contact it creat a veri limit protocol that onli let you do the oper you need you can add specif oper later there a perform hit which your trade for improv data isol giorgio zoppi wrote on fri aug number number david wheeler wrote the standard wai to store password is not to store password instead store a salt hash of the password in a databas when you get a purport password you resalt it comput the hash and determin if thei ar the same thi is how unix ha done it for year you want bigger hash and sal

----------------
Processed email:
----------------
on mon number aug number the voic made certaintytech write recent i have been receiv spam where both the from and to address ar set to my address therefor get thru due to awl ani suggest on thi on i could turn off awl but it doe have advantag btw i us a sitewid awl us cron to remov your address from the awl daili  toni  per scientiam ad libertatem through knowledg toward freedom genom kunskap mot frihet c number  number emailaddr perl eprint dollar dollar for sort lynx dump svanstrom com t  thi sf net email is sponsor by osdn  tire of that same old cell phone get a new here for free httpaddr number refcod number vs number spamassassintalk mail list emailaddr httpaddr
1408.c202263092b223a607078977ed7aa6c3 filenumber =  2609
----------------
Processed email:
----------------
i think with the confus and all we should give these folk until the end of the dai to get fix log upload justin on a relat note  i didnt includ the spamtrap stuff fr

----------------
Processed email:
----------------
thi articl from nytim com ha been sent to you by emailaddr anoth remind that the moral writ of intellectu properti is  and ought to be more limit than real properti privat monei paid for these bit but expropri mai be fairer for ip than realp rohit emailaddr vast detail on tower collaps mai be seal septemb number number by jame glanz and eric lipton what is almost certainli the most sophist and complet understand of exactli how and why the twin tower of the world trade center fell ha been compil as part of a larg secret proceed in feder court in lower manhattan amass dure the initi stage of a complic insur lawsuit involv the trade center the confidenti materi contain data and expert analysi develop by some of the nation most respect engin mind it includ comput calcul that have produc a seri of threedimension imag of the crumpl insid of the tower after the plane hit help to identifi the sequenc of failur that led to the collaps an immens

----------------
Processed email:
----------------
hi kragen thi is an interest analysi i think that there ar a coupl of nit i might pick for exampl i dont expect that the market will be well develop with highest bidder for while i think that the most import issu which is that end user wont be abl to fix their system is almost pass over i know that you know thi and you allud to it but your essai is get pass around so you might want to add to it bit about the sysadmin and other there on other point which you dont make which i think is veri import which is that research into defin and address class of vulner cant happen without librari of avail vulner code i can think of three research into autom method for address vulner who gripe uninvit about the qualiti of the exist vulner site do research into a set requir that you have enough exampl in the open that you can defin a set and that the set is ad to from time to time so you can make and test predict i feel fairli confid in sai that with

----------------
Processed email:
----------------
url httpaddr number number number date number  number  number t number number number number number img httpaddr aintitcoolnew
1701.39d6d3507aa62320295032f0c0e4435c filenumber =  2974
----------------
Processed email:
----------------
date mon number sep number number number number  number edt from dayv gastonguai i just instal razor number number on a freebsd number number releas box and have problem with razorcheck ani time razorcheck is run with or without argument i get thi error cant us an undefin valu as a symbol refer at usr local lib perl number site perl number number i number freebsd razor number client agent pm line number try instal the latest perl at least number number number port on freebsd and make sure you set the system to us perl from port i e in the port lang perl number file directori run us perl port reinstal the relev perl modul need by razor and try again sven  thi sf net email is sponsor by thinkgeek welcom to g

----------------
Processed email:
----------------
fnm number lrgftv number gum contenttyp text plain charset usascii contentdisposit inlin contenttransferencod quotedprint on fri sep number number at number number number pm  number vernon wrote as i understand it dn a record ar us in a rotat fashion for load balanc but dn mx record ar us in order or proriti mean the number befor the number and onli number if the number isnt avail that the theori ye but onli some of the mail is actual be scan which lead me to believ that not all of the mail is actual hit that box and the number never goe down why have i got someth confus here no but either due to some technic glitch or downright just want to do so peopl send to the secondari it a semiusu spammer trick actual to bypass the main server and send directli to a secondari sinc it will either have less filter or be trust or mx record in the mail world ar all explain in rfc number section number multipl mx record contain a prefer indic that mu

In [25]:
for filename in os.listdir('hard_ham'):
    count += 1
    print(filename,"filenumber = ",count)
    file = os.path.join('hard_ham',filename)
    if os.path.isfile(file):
        fid = open(file)
    else:
        pass
    try:
        email_contents = fid.read()
        processedContents = process(email_contents)
        for word in processedContents:
            if word not in vocab.keys():
                vocab[word] = 1
            else:
                vocab[word] += 1
    except:
        pass
print(vocab)
        

0034.3b223f26d69f641451a5d95faa9cb564 filenumber =  3044
----------------
Processed email:
----------------
 click here to read thi newslett on the web for free httpaddr number number number html dont worri about anyth els below thi paragraph it for those with html email client just go to the abov list web address to view thi issu perfectli in your web browser thank again  number number number lockergnom penguin shell nbsp number number number penguinreport career servic from lockergnom and dice com weve team up with dice com to bring you a full servic i t career enhanc solut whether you ar look for your dream job or try to hire talent peopl the fullfeatur career resourc center is the place to start find your it talent solut todai iv just got a few quick thought todai first the search is underwai for the host solut mention in mondai penguin shell the respons ha been great with more than two dozen offer of web space for the foundat for children and youth with diabet it gratifi to see ou

----------------
Processed email:
----------------
thi is a multipart messag in mime format  nextpart number number number c number ca number b number contenttyp text plain charset iso number jp contenttransferencod number bit opentext dollar b r b  dollar b number kelmm b emailaddr number  b  dollar b number b number emailaddr number  b  dollar b dollar dollar k number j number h dollar k e ev dollar j  dollar k number h dollar l dollar n number dollar k dollar d dollar f dollar o b number dollar number dollar ke ev dollar j  dollar k number h s dollar k number d dollar jev dollar f dollar dollar b dollar n dollar h b  dollar b dollar number dollar  dollar number  b  dollar b dollar dollar k  dollar k number h s dollar r number dollar k dollar h e iu dollar n number dollar k dollar dollar k dollar h dollar dollar j number hll dollar i dollar l dollar n number ul j p dollar n dollar number dollar lj dollar k dollar d dollar f number emailaddr number dollar k b  dollar b dollar  dollar 

----------------
Processed email:
----------------
begin pgp sign messag new from the libertarian parti number virginia avenu nw suit number washington dc number world wide web httpaddr for releas juli number number for addit inform georg getz press secretari phone number number  number ext number email emailaddr thousand spent on stripper golf membership show pentagon spend is out of control libertarian sai washington dc  quiz question which of the follow item have been charg to the taxpay recent by militari personnel wield governmentissu credit card a dollar number number for lap danc at strip club near militari base b dollar number number for a sumo wrestl suit and dollar number number for halloween costum c dollar number number for close cost on a home and dollar number number for a corpor golf membership d dollar number number for white beach sand and dollar number number worth of decor river rock at a militari base in the arabian desert e all of the abov incred the answer is all 

----------------
Processed email:
----------------
 click here to read thi newslett on the web for free httpaddr number html to unsubscrib simpli repli to thi email messag to resubscrib visit our site httpaddr dont worri about anyth els below thi paragraph it for those with html email client just go to the abov list web address to view thi issu perfectli in your web browser thank again  number lockergnom window daili nbsp number number number gnomereport get window xp tip month after the releas of window xp profession and home edit peopl ar still learn the nuanc of microsoft newest oper system whether youv been us it for a while or ar plan to upgrad to the new os soon thi ebook is right up your allei get your copi of lockergnom top number window xp tip now i dont go to mani concert but that doesnt mean i dont appreci listen to good music at everi wake moment tonight a hand of bai gnomi and i watch thei might be giant rock the fillmor melani steve brent missi robert patrick gretchen mat

----------------
Processed email:
----------------
thi is a multipart messag in mime format  nextpart number number number c number a number number a number contenttyp text plain charset iso number  number contenttransferencod quotedprint   number   number   number number  number number number videoclipstream player pleas click on the imag to plai the trailer number  number number number number   number number  go out and see a movi tonight   number number number number  number number number number number   number number  enter for your chanc to win   number number number number   number number  jame bond  number number number number  number number   number number  wampol dissolv evolv  number number number number  number number   number number  the adventur of pluto nash  number number number number  number number   number number  spiderman  number number number number  number number   number number  men in black  number number number number  number number   number number  citi by the

----------------
Processed email:
----------------
 cnet shopper newslett electron edit                                                  shopper  all cnet  the web  nbsp          number  soni cyber shot dscf number    number  canon powershot s number    number  palm m number    number  palm i number    number  nikon coolpix number    nbsp  all most popular    live tech help now april tech award number million open job new com top cio zdnet peoplesoft                                                                       canon powershot s number elph  number number number number pixel  number number number x digit zoom  number number x optic zoom  number builtin flash  number just dollar number               accessor your camera      canon cb number ls batteri charger    canon ack number power adapt kit     canon number mb compactflash card    canon number mb compactflash card                                             fujifilm finepix number i  number number number number pixel  number

----------------
Processed email:
----------------
tech updat todai vital sign for juli number number  david berlind what to look for in your next smart phone with pda and cell phone on the fast path to converg choos the right devic is harder than ever befor you bui on more handheld see my prescript for mobil happi   reader kyocera is step stone to the perfect smart phone latest from zdnet new microsoft ey visa user with passport no jitter here ibm drive tap new tech telecom firm leak student data to web israel hong kong hotb for cyberattack microsoft put content softwar to the test micron pc demand to boost chip sale flash macromedia tool speak up will dell delv into printer more enterpris new farber pick  dan farber tackl tough issu in ecustom servic ident servic let ecommerc site better serv custom and secur their transact both passport and the liberti allianc offer merchant some abil to recogn custom check out thi overview of todai ident servicesand find out why pki is not dead yet

----------------
Processed email:
----------------
digit dispatch weekli newslett                 all cnet the web nbsp     appl to expand imac lcd displai it aint heavi it my laptop gatewai tout chic yet cheap pc appl ipod come to linux dell pc come to a mall near you  more cnet new           quintessenti player number number    ai pictur util number number number     icq number a build number     deck number number for the mac            dell latitud c number c number seri nbsp  in hardwar    toshiba pocket pc e number  in electron   autocad lt number  in softwar   soni ericsson t number  in wireless                                  juli number number         janic chen editor in chief cnet review      dear reader  it wa a crush blow to discov that vindigo my alltimefavorit palm app wa no longer free twentyf buck wasnt steep but the principl of pai for a former freebi wa hard to swallow not that hard though surpris myselfand my fellow cheapskates discov that id rather dish out the do

----------------
Processed email:
----------------
the tech side of homeland defens          nbsp search  nbsp  nbsp  new com  all cnet nbsp nbsp nbsp nbsp  the web nbsp live tech help now april tech award number million open job new com top cio zdnet peoplesoft                         juli number number      the tech side of homeland defens  studio sue defunct dollar number movi site  dollar number million later startup pluri shut down  appl to expand imac lcd displai  nextcard cut off credit card custom  tap china brainpow          nbsp vision seri          read new com exclus interview of number top cio  vision seri home   the tech side of homeland defens comput secur is becom an increasingli critic part of presid bush propos for a depart of homeland secur as politician fret about techsavvi terroristsand insist that ani new agenc must shield the unit state from electron attack meanwhil the administr sai that it open to the idea of a chief privaci offic for the agenc juli number numb

----------------
Processed email:
----------------
linuxannounc digest number volum number sat number sep number number number number edt content oo program newslett number from bruce eckel emailaddr the linux counter frederick noronha linux gazett number septemb number avail linux gazett bigtwo coffe marilith number releas bernard yap new gnu develop intl orgn take a close look at gnu linux frederick noronha attn programm support offer flosssarai initi frederick noronha  from emailaddr subject oo program newslett number from bruce eckel date number sep number number number number gmt replyto emailaddr to unsubscrib send a blank email to leaveeckelooprogram number emailaddr you ar subscrib as emailaddr number ctd ornl gov to modifi your email address go to httpaddr oo program newslett number from bruce eckel august number content come work on think in java in crest butt think in java number rd edit revis number immin think in c volum number revis number avail think in java handson semi

----------------
Processed email:
----------------
 click here to read thi newslett on the web for free httpaddr number html dont worri about anyth els below thi paragraph it for those with html email client just go to the abov list web address to view thi issu perfectli in your web browser thank again  number lockergnom penguin shell nbsp number number number penguinreport career servic from lockergnom and dice com weve team up with dice com to bring you a full servic i t career enhanc solut whether you ar look for your dream job or try to hire talent peopl the fullfeatur career resourc center is the place to start find your it talent solut todai your a good bunch you penguin todai gnomevoic realli tell the stori better than i can but ill give you a quick summari through your effort and compass reader karl steenblik ha found a new electron home for the foundat for children and youth with diabet ill let karl tell you which offer solut suit hi need but with the volum of respons hi reque

In [35]:
vocabulary = {k: v for k, v in sorted(vocab.items(), key=lambda item: item[1])}
vocabulary = dict(list(vocabulary.items())[-2000:])  
print(vocabulary)

{'tune': 42, 'corp': 42, 'basi': 42, 'paragraph': 42, 'tmp': 42, 'tree': 42, 'dark': 42, 'nxieaa': 42, 'boot': 42, 'prix': 42, 'neverfail': 42, 'strength': 43, 'budget': 43, 'seven': 43, 'cancer': 43, 'ng': 43, 'llc': 43, 'li': 43, 'igzhi': 43, 'worldwid': 43, 'arrest': 43, 'portabl': 43, 'affect': 43, 'sm': 43, 'ih': 43, 'ex': 43, 'nd': 43, 'die': 43, 'consider': 43, 'lookup': 43, 'prompt': 43, 'skill': 43, 'prove': 43, 'javamail': 43, 'ybb': 43, 'gab': 43, 'disc': 43, 'straight': 43, 'tweak': 43, 'viru': 43, 'hole': 43, 'mathemat': 43, 'nonspam': 43, 'lawsuit': 43, 'duncan': 43, 'afford': 44, 'width': 44, 'al': 44, 'citizen': 44, 'restrict': 44, 'aim': 44, 'ls': 44, 'flow': 44, 'zip': 44, 'forum': 44, 'dc': 44, 'hire': 44, 'scheme': 44, 'park': 44, 'infrastructur': 44, 'writer': 44, 'feet': 44, 'submiss': 44, 'decad': 44, 'planet': 44, 'shout': 44, 'harlei': 44, 'argu': 44, 'kevin': 44, 'concentr': 44, 'alsadriv': 44, 'gnome': 44, 'panasa': 44, 'temperatur': 44, 'theyv': 44, 'sarun':

In [36]:
vocabulary = list(vocabulary.keys())


In [37]:
vocabulary


['tune',
 'corp',
 'basi',
 'paragraph',
 'tmp',
 'tree',
 'dark',
 'nxieaa',
 'boot',
 'prix',
 'neverfail',
 'strength',
 'budget',
 'seven',
 'cancer',
 'ng',
 'llc',
 'li',
 'igzhi',
 'worldwid',
 'arrest',
 'portabl',
 'affect',
 'sm',
 'ih',
 'ex',
 'nd',
 'die',
 'consider',
 'lookup',
 'prompt',
 'skill',
 'prove',
 'javamail',
 'ybb',
 'gab',
 'disc',
 'straight',
 'tweak',
 'viru',
 'hole',
 'mathemat',
 'nonspam',
 'lawsuit',
 'duncan',
 'afford',
 'width',
 'al',
 'citizen',
 'restrict',
 'aim',
 'ls',
 'flow',
 'zip',
 'forum',
 'dc',
 'hire',
 'scheme',
 'park',
 'infrastructur',
 'writer',
 'feet',
 'submiss',
 'decad',
 'planet',
 'shout',
 'harlei',
 'argu',
 'kevin',
 'concentr',
 'alsadriv',
 'gnome',
 'panasa',
 'temperatur',
 'theyv',
 'sarun',
 'hotel',
 'expir',
 'amaz',
 'histor',
 'reli',
 'england',
 'journal',
 'xvcj',
 'sundai',
 'queri',
 'sp',
 'zp',
 'ba',
 'committe',
 'startup',
 'super',
 'elsewher',
 'flist',
 'crack',
 'transform',
 'pickl',
 'candid

In [38]:
def processEmail(email_contents,vocabulary,verbose=True):
    """
    Preprocesses the body of an email and returns a list of indices 
    of the words contained in the email.    
    
    Parameters
    ----------
    email_contents : str
        A string containing one email. 
    
    verbose : bool
        If True, print the resulting email after processing.
    
    Returns
    -------
    word_indices : list
        A list of integers containing the index of each word in the 
    """
    # Load Vocabulary
    vocabList = vocabulary
    word_indices = []
    # Lower case
    hdrstart = email_contents.find(chr(10) + chr(10))
    email_contents = email_contents[hdrstart:]
    email_contents = email_contents.lower()
    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents =re.compile('<[^<>]+>').sub(' ', email_contents)
    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)
    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\s]*').sub(' httpaddr ', email_contents)
    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\s]+@[^\s]+').sub(' emailaddr ', email_contents)
    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)
    # get rid of any punctuation
    email_contents = re.split('[ @$/#.-:&*+=\[\]?!(){},''">_<;%\n\r]', email_contents)
    # remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]
    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        if len(word) < 1:
            continue

        if word in vocabList:
            word_indices.append(vocabList.index(word))


    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return word_indices

In [39]:
#for emails without headers
def processEmail1(email_contents,vocabulary,verbose=True):
    """
    Preprocesses the body of an email and returns a list of indices 
    of the words contained in the email.    
    
    Parameters
    ----------
    email_contents : str
        A string containing one email. 
    
    verbose : bool
        If True, print the resulting email after processing.
    
    Returns
    -------
    word_indices : list
        A list of integers containing the index of each word in the 
    """
    # Load Vocabulary
    vocabList = vocabulary
    word_indices = []
    # Lower case
    email_contents = email_contents.lower()
    # Strip all HTML
    # Looks for any expression that starts with < and ends with > and replace
    # and does not have any < or > in the tag it with a space
    email_contents =re.compile('<[^<>]+>').sub(' ', email_contents)
    # Handle Numbers
    # Look for one or more characters between 0-9
    email_contents = re.compile('[0-9]+').sub(' number ', email_contents)
    # Handle URLS
    # Look for strings starting with http:// or https://
    email_contents = re.compile('(http|https)://[^\s]*').sub(' httpaddr ', email_contents)
    # Handle Email Addresses
    # Look for strings with @ in the middle
    email_contents = re.compile('[^\s]+@[^\s]+').sub(' emailaddr ', email_contents)
    # Handle $ sign
    email_contents = re.compile('[$]+').sub(' dollar ', email_contents)
    # get rid of any punctuation
    email_contents = re.split('[ @$/#.-:&*+=\[\]?!(){},''">_<;%\n\r]', email_contents)
    # remove any empty word string
    email_contents = [word for word in email_contents if len(word) > 0]
    # Stem the email contents word by word
    stemmer = utils.PorterStemmer()
    processed_email = []
    for word in email_contents:
        # Remove any remaining non alphanumeric characters in word
        word = re.compile('[^a-zA-Z0-9]').sub('', word).strip()
        word = stemmer.stem(word)
        processed_email.append(word)

        if len(word) < 1:
            continue

        if word in vocabList:
            word_indices.append(vocabList.index(word))


    if verbose:
        print('----------------')
        print('Processed email:')
        print('----------------')
        print(' '.join(processed_email))
    return word_indices

In [40]:
with open(os.path.join('Data', 'emailSample1.txt')) as fid:
    file_contents = fid.read()
print(file_contents)
word_indices  = processEmail1(file_contents,vocabulary)

#Print Stats
print('-------------')
print('Word Indices:')
print('-------------')
print(word_indices)

> Anyone knows how much it costs to host a web portal ?
>
Well, it depends on how many visitors you're expecting.
This can be anywhere from less than 10 bucks a month to a couple of $100. 
You should checkout http://www.rackspace.com/ or perhaps Amazon EC2 
if youre running something big..

To unsubscribe yourself from this mailing list, send an email to:
groupname-unsubscribe@egroups.com


----------------
Processed email:
----------------
anyon know how much it cost to host a web portal well it depend on how mani visitor your expect thi can be anywher from less than number buck a month to a coupl of dollar number you should checkout httpaddr or perhap amazon ec number if your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr
-------------
Word Indices:
-------------
[1658, 1849, 1905, 1805, 1988, 1483, 1996, 1244, 1995, 1898, 1812, 1988, 1375, 1984, 1905, 1799, 1979, 1395, 1983, 1959, 1978, 491, 1976, 1603, 1897, 1999, 1995, 1683, 1996, 1995, 1082, 1

In [41]:
def emailFeatures(word_indices,vocabulary):
    """
    Takes in a word_indices vector and produces a feature vector from the word indices. 
    
    Parameters
    ----------
    word_indices : list
        A list of word indices from the vocabulary list.
    
    Returns
    -------
    x : list 
        The computed feature vector.
    """
    # Total number of words in the dictionary
    n = len(vocabulary)
    x = np.zeros(n)
    for idx in word_indices:
        x[idx] = 1
    return x

In [42]:
with open(os.path.join('Data', 'emailSample1.txt')) as fid:
    file_contents = fid.read()

word_indices  = processEmail1(file_contents,vocabulary)
features      = emailFeatures(word_indices,vocabulary)

# Print Stats
print('\nLength of feature vector: %d' % len(features))
print('Number of non-zero entries: %d' % sum(features > 0))

----------------
Processed email:
----------------
anyon know how much it cost to host a web portal well it depend on how mani visitor your expect thi can be anywher from less than number buck a month to a coupl of dollar number you should checkout httpaddr or perhap amazon ec number if your run someth big to unsubscrib yourself from thi mail list send an email to emailaddr

Length of feature vector: 2000
Number of non-zero entries: 45


In [43]:
X = np.array([])
count = 0
for filename in os.listdir('spam'):
    file = os.path.join('spam',filename)
    if os.path.isfile(file):
        fid = open(file)
    else:
        continue
    
    try:
        email_contents = fid.read()
        processedContents = processEmail(email_contents,vocabulary,False)
        xi = np.array(emailFeatures(processedContents,vocabulary))
        xi = np.concatenate((xi,[1]),axis = 0)
        X = np.concatenate((X,xi),axis = 0)
        count += 1
    except:
        pass
X = np.reshape(X,(count,-1))
print(X.shape)
print(count)

(416, 2001)
416


In [44]:
count = 0
X1 = np.array([])
for filename in os.listdir('easy_ham'):
    file = os.path.join('easy_ham',filename)
    if os.path.isfile(file):
        fid = open(file)
    else:
        continue
    
    try:
        email_contents = fid.read()
        processedContents = processEmail(email_contents,vocabulary,False)
        xi = np.array(emailFeatures(processedContents,vocabulary))
        xi = np.concatenate((xi,[0]),axis = 0)
        X1 = np.concatenate((X1,xi),axis = 0)
        count += 1
    except:
        pass
X1 = np.reshape(X1,(count,-1))
print(X1.shape)
X = np.concatenate((X,X1),axis = 0)
print(count)

(2410, 2001)
2410


In [45]:
print(X.shape)

(2826, 2001)


In [46]:
count = 0
X2 = np.array([])
for filename in os.listdir('hard_ham'):
    file = os.path.join('hard_ham',filename)
    if os.path.isfile(file):
        fid = open(file)
    else:
        continue
    
    try:
        email_contents = fid.read()
        processedContents = processEmail(email_contents,vocabulary,False)
        xi = np.array(emailFeatures(processedContents,vocabulary))
        xi = np.concatenate((xi,[0]),axis = 0)
        X2 = np.concatenate((X2,xi),axis = 0)
        count += 1
    except:
        pass
X2 = np.reshape(X2,(count,-1))
print(X2.shape)
X = np.concatenate((X,X2),axis = 0)
print(count)

(221, 2001)
221


In [47]:
print(X.shape)

(3047, 2001)


In [48]:
np.random.shuffle(X)

In [49]:
print(X.shape)

(3047, 2001)


In [50]:
size = X.shape[0]
trainSize = (size*7)//10
testSize = size - trainSize

In [51]:
X_train = X[:trainSize,:]
X_test = X[trainSize : ,:]
print(X_train.shape,X_test.shape)

(2132, 2001) (915, 2001)


In [52]:
y_train = X_train[:,-1]
X_train = X_train[:,:2000]
y_test = X_test[:,-1]
X_test = X_test[:,:2000]

In [53]:
print(X_train.shape,X_test.shape,y_train.shape,y_test.shape)

(2132, 2000) (915, 2000) (2132,) (915,)


In [54]:
clf = svm.SVC(kernel='linear',C = 0.1) 
# fitting x samples and y classes 
clf.fit(X_train, y_train) 

SVC(C=0.1, kernel='linear')

In [55]:
# Compute the training accuracy
p = clf.predict(X_train)
print('Training Accuracy: %.2f' % (np.mean(p == y_train) * 100))

Training Accuracy: 99.91


In [56]:
print('Evaluating the trained Linear SVM on a test set ...')
p = clf.predict(X_test)

print('Test Accuracy: %.2f' % (np.mean(p == y_test) * 100))

Evaluating the trained Linear SVM on a test set ...
Test Accuracy: 98.25


In [58]:
arr = clf.coef_[0]
top15 = arr.argsort()[-15:][::-1]
print("15 words are the most likely indicators of spam :")
for ele in top15:
    print(vocabulary[ele])

15 words are the most likely indicators of spam :
click
remov
our
aw
visit
your
no
dollar
b
free
will
here
todai
price
offer
