In [1]:
from IPython.core.display import HTML
from IPython.display import display
display(HTML("<style>.container { width:100% !important; }</style>")) # makes the notebook fill the whole window

import os
import re
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
directories = ['./easy_ham/','./hard_ham/','./spam/']
listofdf = []

# Getting the subjects
for directory in directories:
    turnToDf = [] # List of rows to turn into a dataframe
    exten = re.search('/(.*)/',directory).group(1) # Get the folder it belongs to
    # Open every file in folders
    for filename in os.listdir(directory):
        subject = ''
        with open(directory + filename, "r", errors = "ignore") as f:
            lines = f.readlines()
            # Find the subject line, and get the part after it
            # If there's a Subject: Re:, only get the part after the re:
            for line in lines:
                if 'subject' in line.lower():
                    line = line.rstrip()
                    if line.lower().find('subject:') == 0:
                        if line.lower().find('re:') != -1:
                            subject = re.search('re:(.*)', line.lower()).group(1)
                        else:
                            subject = re.search(':(.*)', line.lower()).group(1)

            # Use another regular expression for cleaning
            # subject = re.sub("[^a-zA-Z]+", " ", subject)

            # Split the subject into a list based on spaces
            # Remove numbers from the subject
            # Remove numbers and words that don't contain letters
            subject = [s for s in subject.split(' ') if len(s)]
            subject = [s for s in subject if not re.search(r'\d',s)]
            subject = [s for s in subject if s.isalpha()]
            
            # If it's spam, mark it as spam in the dataframe
            turnToDf.append([exten,filename[:4],subject, int("spam" in directory)])
    # Create a dataframe from the rows
    listofdf.append(pd.DataFrame(turnToDf,columns=['folder','fileNo','words','isSpam']))

In [3]:
# Create a dataframe of all messages
totalDf = pd.concat(listofdf)
totalDf.reset_index(drop = True, inplace = True)
totalDf['ind'] = totalDf.index

def sample_every_four(df, columnm, column_value):
    temp = df[df[columnm] == column_value].sample(
        int(3 * len(totalDf[totalDf[columnm] == column_value]) / 4),
        weights=[0 if x % 4 == 0 else 1 for x in range(len(df[df[columnm] == column_value]))])
    return temp

# Samples 3/4 of all messages, taking every 4th message
first = sample_every_four(totalDf, "folder", "easy_ham")
second = sample_every_four(totalDf, "folder", "hard_ham")
third = sample_every_four(totalDf, "folder", "spam")

trainingDf = pd.concat([first, second, third])
trainingDf.sort_index(inplace = True)

# Create the testing dataset
testingDf = totalDf[~totalDf.ind.isin(totalDf.merge(trainingDf, on = ['ind']).ind)]
trainingDf

Unnamed: 0,folder,fileNo,words,isSpam,ind
1,easy_ham,0461,"[making, a, mesh, on, the, move]",0,1
2,easy_ham,1077,[sorting],0,2
3,easy_ham,1000,"[java, is, for, kiddies]",0,3
5,easy_ham,0401,[canadians],0,5
6,easy_ham,0502,[],0,6
...,...,...,...,...,...
3295,spam,0361,"[you, satisfy, me, fgtpril]",1,3295
3296,spam,0039,"[your, bank, account]",1,3296
3298,spam,0312,"[have, tax]",1,3298
3299,spam,0116,"[custom, websites, for]",1,3299


In [4]:
def generate_dictionary(df, column, column_value):
    temp_dict = {}
    for row in df[df[column] == column_value].itertuples():
        for word in row.words:
            if word not in temp_dict:
                temp_dict[word] = 1
            else:
                temp_dict[word] += 1
    return {k:v for k, v in sorted(temp_dict.items(), key = lambda item: item[1], reverse = True)}

sSpam = generate_dictionary(trainingDf, "isSpam", 1)
sHam = generate_dictionary(trainingDf, "isSpam", 0)

allWords = {}
for word in sSpam:
    if word not in allWords:
        allWords[word] = sSpam[word]
    else:
        allWords[word] += sSpam[word]
for word in sHam:
    if word not in allWords:
        allWords[word] = sHam[word]
    else:
        allWords[word] += sHam[word]
wordsSorted = {k: v for k, v in sorted(allWords.items(), key = lambda item: item[1], reverse = True)}
wordsInOrder = list(wordsSorted)

turnToDf = []
for word in allWords:    
    spamAmount = 0
    hamAmount = 0
    totalAmount = allWords[word]
    
    if word in sSpam:
        spamAmount = sSpam[word]
    if word in sHam:
        hamAmount = sHam[word]
    turnToDf.append([word, hamAmount, spamAmount, totalAmount])

wordDF = pd.DataFrame(turnToDf, columns=['word', 'times_ham','times_spam','total_count'])
wordDF

Unnamed: 0,word,times_ham,times_spam,total_count
0,your,21,50,71
1,for,233,41,274
2,a,136,37,173
3,the,298,36,334
4,you,16,35,51
...,...,...,...,...
2911,item,1,0,1
2912,thin,1,0,1
2913,thinkpads,1,0,1
2914,fit,1,0,1


In [5]:
totalSpam = wordDF['times_spam'].sum()
totalHam = wordDF['times_ham'].sum()
totalCount = wordDF['total_count'].sum()
probSpam = totalSpam / totalCount
probHam = totalHam / totalCount

a = 1 # alpha value for smoothing
b = 2 # beta value for smoothing

wordDF['p_spam'] = (wordDF['times_spam'] + a) / (b + totalSpam)
wordDF['p_ham'] = (wordDF['times_ham'] + a) / (b + totalHam)

wordDF['p_sWord'] = (wordDF['times_spam'] + a) / (b + wordDF['total_count'])
wordDF['p_hWord'] = (wordDF['times_ham'] + a) / (b + wordDF['total_count'])

print("Top 5 spammiest words\n", wordDF[['word', 'p_spam', 'p_ham', 'p_sWord', 'p_hWord']].sort_values(by = 'p_sWord', ascending = False).head(5))
print("Top 5 hammiest words\n", wordDF[['word', 'p_spam', 'p_ham', 'p_sWord', 'p_hWord']].sort_values(by = 'p_hWord', ascending = False).head(5))

Top 5 spammiest words
            word    p_spam     p_ham   p_sWord   p_hWord
12        rates  0.008761  0.000126  0.933333  0.066667
14        money  0.008135  0.000126  0.928571  0.071429
22  systemworks  0.006884  0.000126  0.916667  0.083333
25    clearance  0.006258  0.000126  0.909091  0.090909
33      account  0.005006  0.000126  0.888889  0.111111
Top 5 hammiest words
           word    p_spam     p_ham   p_sWord   p_hWord
685    problem  0.000626  0.004924  0.025000  0.975000
686       java  0.000626  0.004040  0.030303  0.969697
689  headlines  0.000626  0.003914  0.031250  0.968750
688      bliss  0.000626  0.003914  0.031250  0.968750
687    selling  0.000626  0.003914  0.031250  0.968750


In [6]:
wordDF

Unnamed: 0,word,times_ham,times_spam,total_count,p_spam,p_ham,p_sWord,p_hWord
0,your,21,50,71,0.031915,0.002778,0.698630,0.301370
1,for,233,41,274,0.026283,0.029545,0.152174,0.847826
2,a,136,37,173,0.023780,0.017298,0.217143,0.782857
3,the,298,36,334,0.023154,0.037753,0.110119,0.889881
4,you,16,35,51,0.022528,0.002146,0.679245,0.320755
...,...,...,...,...,...,...,...,...
2911,item,1,0,1,0.000626,0.000253,0.333333,0.666667
2912,thin,1,0,1,0.000626,0.000253,0.333333,0.666667
2913,thinkpads,1,0,1,0.000626,0.000253,0.333333,0.666667
2914,fit,1,0,1,0.000626,0.000253,0.333333,0.666667


In [7]:
%%time
np.seterr('raise')

trainDict = {}
for row in trainingDf.itertuples():
    indexes = []
    for word in row.words:
        indexes.append(wordDF[wordDF['word'] == word].index[0])
    # Assume every word isn't in there
    t_vec = np.array(1 - wordDF["p_spam"].values, dtype = np.longdouble)
    b_vec = np.array(1 - wordDF["p_ham"].values, dtype = np.longdouble)
    # Correct the words that are there
    for ind in indexes:
        t_vec[ind] = wordDF.iloc[ind]["p_spam"]
        b_vec[ind] = wordDF.iloc[ind]["p_ham"]
    top = np.prod(t_vec)
    bot = np.prod(b_vec)
    trainDict[row.Index] = (top * probSpam) / (top * probSpam + bot * probHam)

CPU times: user 5.94 s, sys: 29.1 ms, total: 5.97 s
Wall time: 5.99 s


In [8]:
spammiestEmails = {k: v for k, v in sorted(trainDict.items(), key = lambda item: item[1], reverse = True)}

In [9]:
threshold = .5
spams = {k:v for k, v in spammiestEmails.items() if v >= threshold}
makeDf = []
for index in spams:
    makeDf.append([trainingDf[trainingDf['ind'] == index]["folder"].values[0],
                   trainingDf[trainingDf['ind'] == index]["isSpam"].values[0],
                   trainingDf[trainingDf['ind'] == index]["ind"].values[0],
                   spams[index]])
pd.DataFrame(makeDf,columns=['folder', 'isSpam', 'ind', 'calcP'])

Unnamed: 0,folder,isSpam,ind,calcP
0,spam,1,2896,1.000000
1,spam,1,3095,1.000000
2,spam,1,3264,1.000000
3,spam,1,3067,1.000000
4,spam,1,3078,1.000000
...,...,...,...,...
549,easy_ham,0,439,0.513412
550,easy_ham,0,146,0.511676
551,easy_ham,0,2221,0.503388
552,easy_ham,0,2113,0.503104


# Run everything on the testing data now

In [10]:
%%time
# This mimics the Naive Bayes formula, with top being the top portion of the fraction, and bot being the other part of the bottom
np.seterr('raise')
trainDict = {}

for row in testingDf.itertuples():
    indexes = []
    for word in row.words:
        tiny = wordDF[wordDF["word"] == word]
        if len(tiny):
            indexes.append(tiny.index[0])

    # Assume every word isn't in there
    t_vec = np.array(1 - wordDF["p_spam"].values, dtype = np.longdouble)
    b_vec = np.array(1 - wordDF["p_ham"].values, dtype = np.longdouble)
    
    # Correct the words that are there
    for ind in indexes:
        t_vec[ind] = wordDF.iloc[ind]["p_spam"]
        b_vec[ind] = wordDF.iloc[ind]["p_ham"]

    top = np.prod(t_vec)
    bot = np.prod(b_vec)
    trainDict[row.Index] = (top * probSpam) / (top * probSpam + bot * probHam)

CPU times: user 1.99 s, sys: 11.4 ms, total: 2 s
Wall time: 2.01 s


In [11]:
spammiestEmails = {k: v for k, v in sorted(trainDict.items(), key = lambda item: item[1], reverse = True)}

print(f"{'Threshold':>12} {'Accuracy':>12} {'Precision':>12} {'Recall':>12}")

for threshold in [.1,.2,.3,.4,.5,.6,.7,.8,.9]:
    spams = {k:v for (k,v) in spammiestEmails.items() if v >= threshold}
    hams = {k:v for (k,v) in spammiestEmails.items() if v < threshold}
    makeDf = []
    for index in spams:
        makeDf.append([testingDf[testingDf['ind'] == index]["folder"].values[0],
                       testingDf[testingDf['ind'] == index]["isSpam"].values[0],
                       testingDf[testingDf['ind'] == index]["ind"].values[0],
                       spams[index]])
    
    spam = pd.DataFrame(makeDf, columns = ['folder', 'isSpam', 'ind', 'calcP'])
    spam['cS'] = 1
    makeDf = []
    for index in hams:
        makeDf.append([testingDf[testingDf['ind'] == index]["folder"].values[0],
                       testingDf[testingDf['ind'] == index]["isSpam"].values[0],
                       testingDf[testingDf['ind'] == index]["ind"].values[0],
                       hams[index]])

    ham = pd.DataFrame(makeDf, columns = ['folder', 'isSpam', 'ind', 'calcP'])
    ham['cS'] = 0
    
    guesses = pd.concat([spam, ham])
    counts = guesses.groupby(["isSpam", "cS"]).size()
    
    predSpamSpam = counts[1][1]
    predSpamHam = counts[1][0]
    predHamSpam = counts[0][1]
    predHamHam = counts[0][0]

    accuracy = (predSpamSpam + predHamHam) / len(guesses)
    precision = predSpamSpam / (predSpamSpam + predSpamHam)
    recall = predSpamSpam / (predSpamSpam + predHamSpam)
    print(f"{threshold:>10}: {accuracy * 100:>10.4}% {precision * 100:>10.4}% {recall * 100:>10.4}%")

   Threshold     Accuracy    Precision       Recall
       0.1:      67.23%      81.75%      29.34%
       0.2:       79.2%      76.19%      40.34%
       0.3:      83.68%      73.81%      47.69%
       0.4:      86.09%      69.05%      53.37%
       0.5:      87.91%      65.87%      59.29%
       0.6:      88.51%      61.11%       62.6%
       0.7:      88.27%      53.97%      63.55%
       0.8:      89.24%       50.0%      70.79%
       0.9:      89.72%      45.24%      78.08%
