In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # makes the notebook fill the whole window

import os
import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
directories = ['./easy_ham/','./hard_ham/','./spam/']
listofdf = []

# Getting the subjects
for directory in directories:
    turnToDf = [] #list of rows to turn into a dataframe
    exten = re.search('/(.*)/',directory).group(1) #get the folder it belongs to
    #open every file in folders
    for filename in os.listdir(directory):
        subject = ''
        f = open(directory + filename,'r',errors='ignore')
        lines = f.readlines()
        #Find the subject line, and get the part after it
        #if there's a Subject: Re:, only get the part after the re:
        for line in lines:
            if 'subject' in line.lower():
                line = line.rstrip()
                if line.lower().find('subject:') == 0:
                    if line.lower().find('re:') != -1:
                        subject = re.search('re:(.*)', line.lower()).group(1)
                    else:
                        subject = re.search(':(.*)', line.lower()).group(1)
        #Split the subject into a list based on spaces
        #Remove numbers from the subject
        subject = subject.split(' ')
        while '' in subject:
            subject.remove('')
        #Remove numbers and words that don't contain letters
        subject = [s for s in subject if not re.search(r'\d',s)]
        subject = [s for s in subject if s.isalpha()]
        #If it's spam, mark it as spam in the dataframe
        if 'spam' in directory:
            turnToDf.append([exten,filename[:4],subject,1])
        else:
            turnToDf.append([exten,filename[:4],subject,0])
    #Create a dataframe from the rows
    listofdf.append(pd.DataFrame(turnToDf,columns=['folder','fileNo','words','isSpam']))

In [3]:
#Create a dataframe of all messages
totalDf = listofdf[0].append(listofdf[1]).append(listofdf[2])
totalDf.reset_index(drop=True,inplace=True)
totalDf['ind'] = totalDf.index

#Samples 3/4 of all messages, taking every 4th message
trainingDf = totalDf[totalDf['folder'] == 'easy_ham'].sample(int(3*len(totalDf[totalDf['folder'] == 'easy_ham'])/4),weights=[0 if x%4==0 else 1 for x in range(len(totalDf[totalDf['folder'] == 'easy_ham']))]).append(totalDf[totalDf['folder'] == 'hard_ham'].sample(int(3*len(totalDf[totalDf['folder'] == 'hard_ham'])/4),weights=[0 if x%4==0 else 1 for x in range(len(totalDf[totalDf['folder'] == 'hard_ham']))])).append(totalDf[totalDf['folder'] == 'spam'].sample(int(3*len(totalDf[totalDf['folder'] == 'spam'])/4),weights=[0 if x%4==0 else 1 for x in range(len(totalDf[totalDf['folder'] == 'spam']))]))
trainingDf.sort_index(inplace=True)

#Create the testing dataset
testingDf = totalDf[~totalDf.ind.isin(totalDf.merge(trainingDf,on=['ind']).ind)]
trainingDf

Unnamed: 0,folder,fileNo,words,isSpam,ind
1,easy_ham,0002,[alexander],0,1
2,easy_ham,0003,"[moscow, bomber]",0,2
3,easy_ham,0004,"[the, virus, that, die]",0,3
5,easy_ham,0006,"[nothing, like, mama, used, to, make]",0,5
6,easy_ham,0007,"[nothing, like, mama, used, to, make]",0,6
...,...,...,...,...,...
3295,spam,0494,"[make, money, giving, away, free]",1,3295
3296,spam,0495,"[marketing, is, working]",1,3296
3298,spam,0497,"[hit, the, road, with, cna]",1,3298
3299,spam,0498,"[a, hour, for, watching, no]",1,3299


In [4]:
spamDict = {}
for row in trainingDf[trainingDf['isSpam'] == 1].itertuples():
    for word in row.words:
        if word not in spamDict:
            spamDict[word] = 1
        else:
            spamDict[word] += 1
sSpam = {k: v for k, v in sorted(spamDict.items(), key=lambda item: item[1],reverse=True)}

hamDict = {}
for row in trainingDf[trainingDf['isSpam'] == 0].itertuples():
    for word in row.words:
        if word not in hamDict:
            hamDict[word] = 1
        else:
            hamDict[word] += 1
sHam = {k: v for k, v in sorted(hamDict.items(), key=lambda item: item[1],reverse=True)}

allWords = {}
for word in sSpam:
    if word not in allWords:
        allWords[word] = sSpam[word]
    else:
        allWords[word] += sSpam[word]
for word in sHam:
    if word not in allWords:
        allWords[word] = sHam[word]
    else:
        allWords[word] += sHam[word]
wordsSorted = {k: v for k, v in sorted(allWords.items(), key=lambda item: item[1],reverse=True)}
wordsInOrder = list(wordsSorted)

turnToDf = []
for word in allWords:    
    spamAmount = 0
    hamAmount = 0
    totalAmount = allWords[word]
    
    if word in sSpam:
        spamAmount = sSpam[word]
    if word in sHam:
        hamAmount = sHam[word]
    turnToDf.append([word, hamAmount, spamAmount, totalAmount])

wordDF = pd.DataFrame(turnToDf, columns=['word', 'times_ham','times_spam','total_count'])
wordDF

Unnamed: 0,word,times_ham,times_spam,total_count
0,your,29,49,78
1,the,306,40,346
2,for,229,38,267
3,a,124,35,159
4,you,18,33,51
...,...,...,...,...
2906,motions,1,0,1
2907,filed,1,0,1
2908,morpheus,1,0,1
2909,securing,1,0,1


In [5]:
totalSpam = wordDF['times_spam'].sum()
totalHam = wordDF['times_ham'].sum()
totalCount = wordDF['total_count'].sum()
probSpam = totalSpam/totalCount
probHam = totalHam/totalCount

a = 1 #alpha value for smoothing
b = 2 #beta value for smoothing

wordDF['p_spam'] = (wordDF['times_spam'] + a) / (b + totalSpam)
wordDF['p_ham'] = (wordDF['times_ham'] + a) / (b + totalHam)

wordDF['p_sWord'] = (wordDF['times_spam'] + a) / (b + wordDF['total_count'])
wordDF['p_hWord'] = (wordDF['times_ham'] + a) / (b + wordDF['total_count'])

print("Top 5 spammiest words\n",wordDF[['word','times_ham','times_spam','p_sWord','p_hWord']].sort_values(by='p_sWord',ascending=False).head(5))
print("Top 5 hammiest words\n",wordDF[['word','times_ham','times_spam','p_sWord','p_hWord']].sort_values(by='p_hWord',ascending=False).head(5))

Top 5 spammiest words
            word  times_ham  times_spam   p_sWord   p_hWord
16        rates          0          12  0.928571  0.071429
20         year          0          10  0.916667  0.083333
24     mortgage          0           8  0.900000  0.100000
25          per          0           8  0.900000  0.100000
26  systemworks          0           8  0.900000  0.100000
Top 5 hammiest words
         word  times_ham  times_spam   p_sWord   p_hWord
693  problem         32           0  0.029412  0.970588
694     java         31           0  0.030303  0.969697
695    razor         30           0  0.031250  0.968750
698    bliss         28           0  0.033333  0.966667
697  selling         28           0  0.033333  0.966667


# Stopped because it takes like 35 minutes each time to run

In [6]:
%%time
np.seterr('raise')
trainDict = {}
for row in trainingDf.itertuples():
    indexes = []
    for word in row.words:
        indexes.append(wordDF[wordDF['word'] == word].index[0])
    # Assume every word isn't in there
    t_vec = np.array(1 - wordDF['p_spam'].values,dtype='float64')
    b_vec = np.array(1 - wordDF['p_ham'].values,dtype='float64')
    # Correct the words that are there
    for ind in indexes:
        t_vec[ind] = wordDF.iloc[ind].p_spam
        b_vec[ind] = wordDF.iloc[ind].p_ham
    top = np.prod(t_vec)
    bot = np.prod(b_vec)
    trainDict[row.Index] = (top * probSpam)/(top * probSpam + bot * probHam)

Wall time: 10.2 s


In [7]:
spammiestEmails = {k: v for k, v in sorted(trainDict.items(), key=lambda item: item[1],reverse=True)}

In [8]:
threshold = .5
spams = {k:v for (k,v) in spammiestEmails.items() if v >= threshold}
makeDf = []
for index in spams:
    makeDf.append([trainingDf[trainingDf['ind'] == index].folder.values[0],trainingDf[trainingDf['ind'] == index].isSpam.values[0],trainingDf[trainingDf['ind'] == index].ind.values[0],spams[index]])
pd.DataFrame(makeDf,columns=['folder','isSpam','ind','calcP'])

Unnamed: 0,folder,isSpam,ind,calcP
0,spam,1,2916,1.000000
1,spam,1,2874,1.000000
2,spam,1,2926,1.000000
3,spam,1,3027,1.000000
4,spam,1,3175,1.000000
...,...,...,...,...
567,hard_ham,0,2657,0.505188
568,easy_ham,0,2337,0.504781
569,hard_ham,0,2713,0.504437
570,easy_ham,0,7,0.501096


# Run everything on the testing data now

In [9]:
spamDict = {}
for row in testingDf[testingDf['isSpam'] == 1].itertuples():
    for word in row.words:
        if word not in spamDict:
            spamDict[word] = 1
        else:
            spamDict[word] += 1
sSpam = {k: v for k, v in sorted(spamDict.items(), key=lambda item: item[1],reverse=True)}

hamDict = {}
for row in testingDf[testingDf['isSpam'] == 0].itertuples():
    for word in row.words:
        if word not in hamDict:
            hamDict[word] = 1
        else:
            hamDict[word] += 1
sHam = {k: v for k, v in sorted(hamDict.items(), key=lambda item: item[1],reverse=True)}

allWords = {}
for word in sSpam:
    if word not in allWords:
        allWords[word] = sSpam[word]
    else:
        allWords[word] += sSpam[word]
for word in sHam:
    if word not in allWords:
        allWords[word] = sHam[word]
    else:
        allWords[word] += sHam[word]
wordsSorted = {k: v for k, v in sorted(allWords.items(), key=lambda item: item[1],reverse=True)}
wordsInOrder = list(wordsSorted)

turnToDf = []
for word in allWords:    
    spamAmount = 0
    hamAmount = 0
    totalAmount = allWords[word]
    
    if word in sSpam:
        spamAmount = sSpam[word]
    if word in sHam:
        hamAmount = sHam[word]
    turnToDf.append([word, hamAmount, spamAmount, totalAmount])

wordDF = pd.DataFrame(turnToDf, columns=['word', 'times_ham','times_spam','total_count'])

totalSpam = wordDF['times_spam'].sum()
totalHam = wordDF['times_ham'].sum()
totalCount = wordDF['total_count'].sum()
probSpam = totalSpam/totalCount
probHam = totalHam/totalCount

a = 1 #alpha value for smoothing
b = 2 #beta value for smoothing

wordDF['p_spam'] = (wordDF['times_spam'] + a) / (b + totalSpam)
wordDF['p_ham'] = (wordDF['times_ham'] + a) / (b + totalHam)

wordDF['p_sWord'] = (wordDF['times_spam'] + a) / (b + wordDF['total_count'])
wordDF['p_hWord'] = (wordDF['times_ham'] + a) / (b + wordDF['total_count'])

print("Top 5 spammiest words\n",wordDF[['word','times_ham','times_spam','p_sWord','p_hWord']].sort_values(by='p_sWord',ascending=False).head(5))
print("Top 5 hammiest words\n",wordDF[['word','times_ham','times_spam','p_sWord','p_hWord']].sort_values(by='p_hWord',ascending=False).head(5))

Top 5 spammiest words
            word  times_ham  times_spam   p_sWord   p_hWord
17         need          0           5  0.857143  0.142857
12        rates          0           5  0.857143  0.142857
13         have          0           5  0.857143  0.142857
25        money          0           4  0.833333  0.166667
23  systemworks          0           4  0.833333  0.166667
Top 5 hammiest words
           word  times_ham  times_spam   p_sWord   p_hWord
344       from         20           0  0.045455  0.954545
345       spam         16           0  0.055556  0.944444
240        new         32           1  0.057143  0.942857
346    problem         12           0  0.071429  0.928571
348  headlines         11           0  0.076923  0.923077


In [10]:
%%time
# This mimics the Naive Bayes formula, with top being the top portion of the fraction, and bot being the other part of the bottom
np.seterr('raise')
trainDict = {}
for row in testingDf.itertuples():
    indexes = []
    for word in row.words:
        indexes.append(wordDF[wordDF['word'] == word].index[0])
    # Assume every word isn't in there
    t_vec = np.array(1 - wordDF['p_spam'].values,dtype='float64')
    b_vec = np.array(1 - wordDF['p_ham'].values,dtype='float64')
    # Correct the words that are there
    for ind in indexes:
        t_vec[ind] = wordDF.iloc[ind].p_spam
        b_vec[ind] = wordDF.iloc[ind].p_ham
    top = np.prod(t_vec)
    bot = np.prod(b_vec)
    trainDict[row.Index] = (top * probSpam)/(top * probSpam + bot * probHam)

Wall time: 3.16 s


In [11]:
print("Threshold\t\tAccuracy\t\tPrecision\t\tRecall")
for threshold in [.1,.2,.3,.4,.5,.6,.7,.8,.9]:
    spammiestEmails = {k: v for k, v in sorted(trainDict.items(), key=lambda item: item[1],reverse=True)}
    spams = {k:v for (k,v) in spammiestEmails.items() if v >= threshold}
    hams = {k:v for (k,v) in spammiestEmails.items() if v < threshold}
    makeDf = []
    for index in spams:
        makeDf.append([testingDf[testingDf['ind'] == index].folder.values[0],testingDf[testingDf['ind'] == index].isSpam.values[0],testingDf[testingDf['ind'] == index].ind.values[0],spams[index]])
    spam = pd.DataFrame(makeDf,columns=['folder','isSpam','ind','calcP'])
    spam['cS'] = 1
    makeDf = []
    for index in hams:
        makeDf.append([testingDf[testingDf['ind'] == index].folder.values[0],testingDf[testingDf['ind'] == index].isSpam.values[0],testingDf[testingDf['ind'] == index].ind.values[0],hams[index]])
    ham = pd.DataFrame(makeDf,columns=['folder','isSpam','ind','calcP'])
    ham['cS'] = 0
    guesses = spam.append(ham)

    predSpamSpam = 0
    predSpamHam = 0
    predHamHam = 0
    predHamSpam = 0

    for row in guesses.itertuples():
        if row.cS == 1:
            if row.isSpam == 1:
                predSpamSpam +=1
            else:
                predSpamHam += 1
        else:
            if row.isSpam == 1:
                predHamSpam += 1
            else:
                predHamHam += 1

    accuracy = (predSpamSpam + predHamHam)/len(guesses)
    precision = predSpamSpam/(predSpamSpam + predSpamHam)
    recall = predSpamSpam/(predSpamSpam + predHamSpam)
    print(threshold,"",accuracy,precision,recall,sep='\t')

Threshold		Accuracy		Precision		Recall
0.1		0.6432889963724304	0.2903225806451613	0.9285714285714286
0.2		0.7557436517533253	0.3758169934640523	0.9126984126984127
0.3		0.8065296251511487	0.43410852713178294	0.8888888888888888
0.4		0.848851269649335	0.5023041474654378	0.8650793650793651
0.5		0.8851269649334945	0.5837837837837838	0.8571428571428571
0.6		0.9032648125755743	0.6402439024390244	0.8333333333333334
0.7		0.9129383313180169	0.684931506849315	0.7936507936507936
0.8		0.9165659008464329	0.7142857142857143	0.753968253968254
0.9		0.9334945586457074	0.8317757009345794	0.7063492063492064
