# Bayes Theorem for Predicting the Probability of an Email Being Spam

S = Spam
w = Word

$P(Spam|w_{1}, w_{2},..., w_{n}) \propto P(Spam) \cdot \prod_{i=1}^{n}P(w_{i}|Spam)$

The probability that an email consisting of the words $w_{1}, w_{2},... w_{n}$ is proportional to the probability that any given email is spam multiplied by the product of each word's probability to appear in a spam email.



In [1]:
import pandas as pd
import math

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

pd.options.mode.chained_assignment = None  # default='warn'

PREDICTION = 'Prediction'
CLASSIFICATION = 'Classiciation'

## Functions

In [2]:
def count_vocab(emails):
    total_words = 0
    
    for index, row in emails.iterrows():
        total_words += sum(row.values[1:-2])
            
    return total_words

In [3]:
def calculate_word_spamicity(w_spam_count, vocab, spam_vocab):
    alpha = 1
    
    spamicity = (w_spam_count + alpha) / (spam_vocab + alpha * vocab)
    return spamicity

In [4]:
def build_word_spamicity_dict(spam_emails, vocab, spam_vocab):
    spam_word_appearances = {}
    
    for (column_name, column_data) in spam_emails.iteritems():
        if column_name != 'Email No.' and column_name != PREDICTION and column_name != CLASSIFICATION:
            spam_word_appearances[column_name] = sum(column_data.values)
            
    for word in spam_word_appearances:
        spam_word_appearances[word] = calculate_word_spamicity(spam_word_appearances[word], vocab, spam_vocab)
            
    return spam_word_appearances

In [5]:
def calculate_emails(testing_emails, word_spamicities, spam_proportion):
    test_data_map = {}
    for index, email in testing_emails.iterrows():
        email_spamicity = math.log(spam_proportion)
        for column in testing_data.columns[1:-2]:
            if email[column] > 0:
                # log P(spam) + sum log P(w|spam)
                email_spamicity += math.log(word_spamicities[column])*email[column]

        test_data_map[index] = email_spamicity
        
    return test_data_map

In [6]:
def calculate_accuracy(testing_emails):
    number_correct = 0
    for index, email in testing_emails.iterrows():
        if email[PREDICTION] == email[CLASSIFICATION]:
            number_correct += 1
        
    return number_correct / testing_data.shape[0] * 100

# Model

## Step 1: Partition the data into training and test segments

20% of the data for testing, and the remaining 80% is training (i.e. the 80% training data will confirm whether the 20% testing data labels are correct).

In [58]:
df = pd.read_csv('emails.csv')
# Take out all stopwords
for col in df.columns:
    if col in stop_words:
        df.drop(col, axis=1, inplace=True)
        
total_num_emails = df.shape[0]
print(f'Total # Emails:{total_num_emails}')

# Subtract 2 for "Email No." and "Prediction" columns
total_vocab = len(df.columns) - 2
print(f'Total Vocab: {total_vocab}')

partition_size = total_num_emails//5

end = 0
begin = 0
        

for i in range(1,6):
    end += partition_size
    
    if i == 5:
        testing_data = df.iloc[begin:].copy()
    else:
        testing_data = df.iloc[begin:end].copy()

    # This is where the model's prediction will be stored
    testing_data[CLASSIFICATION] = ""
    
    if i == 1:
        training_data = df.iloc[end:]
    elif i == 5:
        training_data = df.iloc[:begin]
    else:
        training_data_sections = []
        training_data_sections.append(df.iloc[:begin])
        training_data_sections.append(df.iloc[end:])
        training_data = pd.concat(training_data_sections)
    
    begin += partition_size
    print(f'\nBegin: {testing_data.at[testing_data.index[0],"Email No."]}')
    print(f'End: {testing_data.at[testing_data.index[-1],"Email No."]}')
    
    spam_proportion = training_data['Prediction'].value_counts()[1] / training_data.shape[0]
    print(f'% of spam emails: {spam_proportion}')
    
    ham_proportion = training_data['Prediction'].value_counts()[0] / training_data.shape[0]
    print(f'% of ham emails: {ham_proportion}')
    
    spam_training_emails = training_data.loc[training_data[PREDICTION] == 1]

    total_spam_words = count_vocab(spam_training_emails)
    print(f'total spam words: {total_spam_words}')
    
    ham_training_emails = training_data.loc[training_data[PREDICTION] == 0]

    total_ham_words = count_vocab(ham_training_emails)
    print(f'total ham words: {total_ham_words}')
    
    word_spamicities = build_word_spamicity_dict(spam_training_emails, total_vocab, total_spam_words)
    word_hamicities = build_word_spamicity_dict(ham_training_emails, total_vocab, total_ham_words)
    
    test_data_spam_map = calculate_emails(testing_data, word_spamicities, spam_proportion)
    test_data_ham_map = calculate_emails(testing_data, word_hamicities, ham_proportion)
    
    print(test_data_spam_map)
    
    if i == 1:
        for key in test_data_spam_map:
            if test_data_spam_map[key] >= test_data_ham_map[key]:
                testing_data[CLASSIFICATION].loc[testing_data.index[key]] = 1
            else:
                testing_data[CLASSIFICATION].loc[testing_data.index[key]] = 0

        score = calculate_accuracy(testing_data)
        print(f'Accuracy: {score}%')

Total # Emails:5172
Total Vocab: 2866

Begin: Email 1
End: Email 1034
% of spam emails: 0.291445142580957
% of ham emails: 0.708554857419043
total spam words: 987979
total ham words: 1754023
{0: -179.6143602253236, 1: -6740.300340547724, 2: -399.3454722655292, 3: -3161.237436180195, 4: -3182.851271107896, 5: -3640.9160742095196, 6: -1904.1304516323157, 7: -1485.9147064294555, 8: -1359.6334470866968, 9: -3810.8486756760335, 10: -5672.470273416164, 11: -10719.80763702939, 12: -6476.894790347868, 13: -3280.428059470295, 14: -1991.5817033972796, 15: -2202.1067532000857, 16: -535.4640290831345, 17: -11293.800167329997, 18: -1164.8085640905667, 19: -2600.0148546154564, 20: -818.4709949144514, 21: -2247.5072703265987, 22: -2878.490777597175, 23: -878.4813799404327, 24: -543.0076569122281, 25: -12673.698138322485, 26: -4100.643348273483, 27: -1225.9418864702943, 28: -5629.354413675017, 29: -6315.001555639633, 30: -1893.0840339754745, 31: -470.4724632947332, 32: -12576.848750051897, 33: -3804.2

Accuracy: 93.23017408123792%

Begin: Email 1035
End: Email 2068
% of spam emails: 0.29337844369260513
% of ham emails: 0.7066215563073949
total spam words: 1018299
total ham words: 1808522
{1034: -8378.710398325542, 1035: -1648.9847787509627, 1036: -258.3913426100897, 1037: -690.4829431351343, 1038: -5850.7001623054775, 1039: -6405.051442110015, 1040: -1538.2878668191079, 1041: -1536.5185376778777, 1042: -8007.0833928432885, 1043: -1099.1567061445514, 1044: -801.6920537140943, 1045: -689.0702646830325, 1046: -15751.627530849124, 1047: -7106.718973805072, 1048: -970.7470322447617, 1049: -760.4667831628658, 1050: -1575.4412655489689, 1051: -2988.7719822968415, 1052: -18712.23235620776, 1053: -4081.4773778564568, 1054: -3161.871865350443, 1055: -772.1558822995324, 1056: -1231.8343626301994, 1057: -6726.783678325533, 1058: -558.2892854974074, 1059: -1387.485935825299, 1060: -8945.589362654595, 1061: -13804.626434962438, 1062: -2296.6819164842505, 1063: -2875.8338990469497, 1064: -13691.505

total spam words: 1099899
total ham words: 1871004
{2068: -457.7989339180055, 2069: -4488.529282189956, 2070: -1255.5040990701996, 2071: -898.3247580895362, 2072: -955.4483286566636, 2073: -2201.4560754747376, 2074: -30.423965566536918, 2075: -5106.662912744623, 2076: -5292.385535805723, 2077: -2125.4399567537116, 2078: -6718.305746417057, 2079: -962.9579716894215, 2080: -1142.2310895915157, 2081: -361.4834927243088, 2082: -231.5184888025135, 2083: -2802.9350990929315, 2084: -1625.8355966506224, 2085: -2705.771092470624, 2086: -1557.5381818740027, 2087: -3441.673810939405, 2088: -486.30173278392175, 2089: -1225.3802700128624, 2090: -490.9108616916731, 2091: -1506.1504582290884, 2092: -4440.841005348953, 2093: -1488.4964397528843, 2094: -363.07253507803034, 2095: -2098.176942062521, 2096: -195.12477409381333, 2097: -1625.8355966506224, 2098: -2705.771092470624, 2099: -3441.673810939405, 2100: -486.30173278392175, 2101: -4067.457538545082, 2102: -933.0012146653165, 2103: -1129.0766414692

total spam words: 990528
total ham words: 1894500
{3102: -964.4138337631797, 3103: -350.3979744973077, 3104: -894.3510826861757, 3105: -444.8391203461436, 3106: -4270.304004473792, 3107: -2740.8593741291943, 3108: -11338.0461422578, 3109: -1455.8439722341523, 3110: -566.0529645929046, 3111: -168.88987864199004, 3112: -2446.3721672675088, 3113: -807.5922726324939, 3114: -350.3979744973077, 3115: -1848.8825841239084, 3116: -341.1347644965581, 3117: -629.6889961949752, 3118: -2178.6944661335087, 3119: -1660.2538662647394, 3120: -418.8172685531574, 3121: -901.5321057594482, 3122: -4358.554884043541, 3123: -1217.4527090734152, 3124: -665.8612917095751, 3125: -12825.458598047944, 3126: -744.4738823676522, 3127: -2127.783891122115, 3128: -2979.4943389919567, 3129: -1864.7037751199375, 3130: -2928.5999915097896, 3131: -2217.8822646537074, 3132: -4335.5669155062305, 3133: -2788.7220022619904, 3134: -4418.804659807539, 3135: -10748.146864310429, 3136: -1072.4478870652686, 3137: -418.817268553157

total spam words: 991291
total ham words: 1717739
{4136: -20033.892207987486, 4137: -1414.1801385844697, 4138: -376.19152013045505, 4139: -297.87340991156117, 4140: -376.19152013045505, 4141: -177.9021579188008, 4142: -1231.2829129764696, 4143: -3425.5935183938686, 4144: -884.1617599247998, 4145: -731.0606938348163, 4146: -814.5497078062709, 4147: -357.5517807927659, 4148: -814.5497078062709, 4149: -357.5517807927659, 4150: -3744.0628667347614, 4151: -205.368260021715, 4152: -3744.0628667347614, 4153: -2526.294493784164, 4154: -2526.294493784164, 4155: -429.30336615563107, 4156: -429.30336615563107, 4157: -1342.3478517352646, 4158: -2775.0614489526724, 4159: -1342.3478517352646, 4160: -147.0621846935121, 4161: -2775.0614489526724, 4162: -4278.647833560566, 4163: -4188.717936884311, 4164: -357.5517807927659, 4165: -120.5818345087279, 4166: -357.5517807927659, 4167: -6376.166279060639, 4168: -869.397825232456, 4169: -6376.166279060639, 4170: -869.397825232456, 4171: -1825.05856998268, 41

## Step 2: Get probabilities that any one email in the training data is either spam or ham

In the labelled dataset, count the number of spam and ham emails.

$P(Spam) = \frac{Spam\,Emails}{Total\,Emails}$

In [23]:
    spam_proportion = training_data['Prediction'].value_counts()[1] / training_data.shape[0]
    spam_proportion

0.28650870406189555

$P(Ham) = \frac{Ham\,Emails}{Total\,Emails}$

In [24]:
    ham_proportion = training_data['Prediction'].value_counts()[0] / training_data.shape[0]
    ham_proportion

0.7134912959381045

## Step 3: Get the "spamicity" and "hamicity" probability of each word in the testing data email

**w** = word
<br>**vocab** = total words in dataset
<br>**spam_vocab**
<br>**wi_spam_count**

Count all unique words in the labelled dataset to get **vocab**.

Count the total number of words in labelled spam emails (ignoring uniqueness) to get **spam_vocab**.

For each word **w**, count all instances of the word in the spam emails to get **wi_spam_count**.

Calculate spamicity of each word and store the word and its spamicity in a dictionary

$P(w_{i}|Spam) = \frac{wi\_spam\_count\,+\,\alpha}{spam\_vocab\,+\,\alpha \cdot vocab}$

$\alpha$ is a coefficient that prevents a probability from being 0.


In [25]:
    # Subtract 2 for "Email No." and "Prediction" columns
    total_vocab = len(training_data.columns) - 2
    total_vocab

2866

In [26]:
    spam_training_emails = training_data.loc[training_data[PREDICTION] == 1]

    total_spam_words = count_vocab(spam_training_emails)
    total_spam_words

991291

In [27]:
    ham_training_emails = training_data.loc[training_data[PREDICTION] == 0]

    total_ham_words = count_vocab(ham_training_emails)
    total_ham_words

1717739

In [28]:
    word_spamicities = build_word_spamicity_dict(spam_training_emails, total_vocab, total_spam_words)
    word_spamicities

{'ect': 0.0028285270837503534,
 'hou': 0.0006950612428419254,
 'enron': 1.0058773413052466e-06,
 'com': 0.00291603841244391,
 'gas': 0.00017602853472841815,
 'deal': 0.0001790461667523339,
 'meter': 4.0235093652209865e-05,
 'hpl': 5.029386706526233e-06,
 'please': 0.0003912862857677409,
 'e': 0.11701874050074586,
 'corp': 0.00014283458246534502,
 'know': 0.00019514020421321782,
 'need': 0.00029271030631982677,
 'forwarded': 2.011754682610493e-06,
 'new': 0.0006900318561353991,
 'may': 0.0003259042585828999,
 'j': 0.004370537047971296,
 'mmbtu': 1.0058773413052466e-06,
 'get': 0.0006206263195853371,
 'see': 0.00030176320239157396,
 'price': 0.0005602736791070224,
 'daren': 1.0058773413052466e-06,
 'company': 0.000514003321406981,
 'l': 0.0471625709017791,
 'let': 0.0006065440368070637,
 'would': 0.00015188747853709224,
 'xls': 3.0176320239157395e-06,
 'farmer': 1.810579214349444e-05,
 'attached': 1.1064650754357711e-05,
 'us': 0.003968186111449197,
 'information': 0.00040134505918079335

In [29]:
    word_hamicities = build_word_spamicity_dict(ham_training_emails, total_vocab, total_ham_words)
    word_hamicities

{'ect': 0.011792363732524316,
 'hou': 0.004966857587883332,
 'enron': 0.0032691989154977463,
 'com': 0.0017784442100307741,
 'gas': 0.0013373203030329448,
 'deal': 0.0015907195434164145,
 'meter': 0.0014146186951682694,
 'hpl': 0.0016209414711685715,
 'please': 0.0012815259748751166,
 'e': 0.12784398510988867,
 'corp': 0.0010095286251057042,
 'know': 0.0006956855292179205,
 'need': 0.0007939067944124305,
 'forwarded': 0.0006904548109531241,
 'new': 0.0005306273084176786,
 'may': 0.0003266292960906193,
 'j': 0.006427971556516458,
 'mmbtu': 0.0006584893104460349,
 'get': 0.00048064488944295756,
 'see': 0.000513772771786668,
 'price': 0.0003684750422089904,
 'daren': 0.0008607437500181622,
 'company': 0.00016854536631010603,
 'l': 0.044587223680042774,
 'let': 0.0007753086850264877,
 'would': 0.0003835860060850689,
 'xls': 0.0005062172898486288,
 'farmer': 0.0005602680452515249,
 'attached': 0.0005027301443387645,
 'us': 0.0027408963707533105,
 'information': 0.0002656042496679947,
 'mess

## Step 4: Calculate the "spamicity" and "hamicity" of each email

Multiply spamicities of each word together to get $\prod_{i=1}^{n}P(w_{i}|Spam)$.

Multiply that product by the probability that any email is spam.

In [30]:
    test_data_spam_map = calculate_emails(testing_data, word_spamicities, spam_proportion)
    test_data_spam_map


{4136: -20033.892207987486,
 4137: -1414.1801385844697,
 4138: -376.19152013045505,
 4139: -297.87340991156117,
 4140: -376.19152013045505,
 4141: -177.9021579188008,
 4142: -1231.2829129764696,
 4143: -3425.5935183938686,
 4144: -884.1617599247998,
 4145: -731.0606938348163,
 4146: -814.5497078062709,
 4147: -357.5517807927659,
 4148: -814.5497078062709,
 4149: -357.5517807927659,
 4150: -3744.0628667347614,
 4151: -205.368260021715,
 4152: -3744.0628667347614,
 4153: -2526.294493784164,
 4154: -2526.294493784164,
 4155: -429.30336615563107,
 4156: -429.30336615563107,
 4157: -1342.3478517352646,
 4158: -2775.0614489526724,
 4159: -1342.3478517352646,
 4160: -147.0621846935121,
 4161: -2775.0614489526724,
 4162: -4278.647833560566,
 4163: -4188.717936884311,
 4164: -357.5517807927659,
 4165: -120.5818345087279,
 4166: -357.5517807927659,
 4167: -6376.166279060639,
 4168: -869.397825232456,
 4169: -6376.166279060639,
 4170: -869.397825232456,
 4171: -1825.05856998268,
 4172: -2510.7094

In [31]:
    test_data_ham_map = calculate_emails(testing_data, word_hamicities, ham_proportion)
    test_data_ham_map

{4136: -20924.94366528413,
 4137: -1326.1433306251727,
 4138: -366.3894705381525,
 4139: -293.61298511264096,
 4140: -366.3894705381525,
 4141: -199.51963995974572,
 4142: -1230.6115639604932,
 4143: -3460.03491248256,
 4144: -832.1546611195292,
 4145: -679.3835698524464,
 4146: -743.974448297081,
 4147: -294.50682149977354,
 4148: -743.974448297081,
 4149: -294.50682149977354,
 4150: -3414.0440910779334,
 4151: -203.2369889572494,
 4152: -3414.0440910779334,
 4153: -2468.989105607814,
 4154: -2468.989105607814,
 4155: -407.89402585841276,
 4156: -407.89402585841276,
 4157: -1259.857887536808,
 4158: -2559.7344674685946,
 4159: -1259.857887536808,
 4160: -144.57518386709884,
 4161: -2559.7344674685946,
 4162: -4295.249012573737,
 4163: -4206.594788944447,
 4164: -294.50682149977354,
 4165: -119.45956290328891,
 4166: -294.50682149977354,
 4167: -5754.446968557761,
 4168: -807.8521320469671,
 4169: -5754.446968557761,
 4170: -807.8521320469671,
 4171: -1853.7616970289444,
 4172: -2333.6

## Step 5: Compare hamicity and spamicity scores to classify emails

In [32]:
    for key in test_data_spam_map:
        if test_data_spam_map[key] >= test_data_ham_map[key]:
            testing_data[CLASSIFICATION].loc[testing_data.index[key]] = 1
        else:
            testing_data[CLASSIFICATION].loc[testing_data.index[key]] = 0

IndexError: index 4136 is out of bounds for axis 0 with size 1034

## Step 6: Check accuracy of the model

In [None]:
    score = calculate_accuracy(testing_data)
    f'Accuracy: {score}%'