# Bayes Theorem for Predicting the Probability of an Email Being Spam

S = Spam
w = Word

$P(Spam|w_{1}, w_{2},..., w_{n}) \propto P(Spam) \cdot \prod_{i=1}^{n}P(w_{i}|Spam)$

The probability that an email consisting of the words $w_{1}, w_{2},... w_{n}$ is proportional to the probability that any given email is spam multiplied by the product of each word's probability to appear in a spam email.



In [1]:
import pandas as pd
import math

# NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

pd.options.mode.chained_assignment = None  # default='warn'

PREDICTION = 'Prediction'
CLASSIFICATION = 'Classiciation'

## Functions

In [2]:
def count_vocab(emails):
    total_words = 0
    
    for index, row in emails.iterrows():
        total_words += sum(row.values[1:-2])
            
    return total_words

In [3]:
def calculate_word_spamicity(w_spam_count, vocab, spam_vocab):
    alpha = 1
    
    spamicity = (w_spam_count + alpha) / (spam_vocab + alpha * vocab)
    return spamicity

In [4]:
def build_word_spamicity_dict(spam_emails, vocab, spam_vocab):
    spam_word_appearances = {}
    
    for (column_name, column_data) in spam_emails.iteritems():
        if column_name != 'Email No.' and column_name != PREDICTION and column_name != CLASSIFICATION:
            spam_word_appearances[column_name] = sum(column_data.values)
            
    for word in spam_word_appearances:
        spam_word_appearances[word] = calculate_word_spamicity(spam_word_appearances[word], vocab, spam_vocab)
            
    return spam_word_appearances

In [5]:
def calculate_emails(testing_emails, word_spamicities, spam_proportion):
    test_data_map = {}
    for index, email in testing_emails.iterrows():
        email_spamicity = math.log(spam_proportion)
        for column in testing_data.columns[1:-2]:
            if email[column] > 0:
                # log P(spam) + sum log P(w|spam)
                email_spamicity += math.log(word_spamicities[column])*email[column]

        test_data_map[index] = email_spamicity
        
    return test_data_map

In [6]:
def calculate_accuracy(testing_emails):
    number_correct = 0
    for index, email in testing_emails.iterrows():
        if email[PREDICTION] == email[CLASSIFICATION]:
            number_correct += 1
        
    return number_correct / testing_data.shape[0] * 100

# Model

## Step 1: Partition the data into training and test segments

20% of the data for testing, and the remaining 80% is training (i.e. the 80% training data will confirm whether the 20% testing data labels are correct).

In [7]:
df = pd.read_csv('emails.csv')
total_num_emails = df.shape[0]

end = total_num_emails//5

# Take out all stopwords
for col in df.columns:
    if col in stop_words:
        df.drop(col, axis=1, inplace=True)

testing_data = df.iloc[:end].copy()

# This is where the model's prediction will be stored
testing_data[CLASSIFICATION] = ""

training_data = df.iloc[end:]

testing_data


Unnamed: 0,Email No.,ect,hou,enron,com,gas,deal,meter,hpl,please,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction,Classiciation
0,Email 1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
1,Email 2,24,27,1,3,1,0,0,0,2,...,0,0,0,0,0,0,1,0,0,
2,Email 3,1,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,
3,Email 4,22,10,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,
4,Email 5,17,9,0,0,2,0,3,0,1,...,0,0,0,0,0,0,1,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,Email 1030,4,0,0,2,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,
1030,Email 1031,18,11,59,7,0,0,0,0,4,...,0,0,2,0,0,0,2,0,0,
1031,Email 1032,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,
1032,Email 1033,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,2,0,1,


In [8]:
training_data

Unnamed: 0,Email No.,ect,hou,enron,com,gas,deal,meter,hpl,please,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
1034,Email 1035,18,0,2,7,0,0,0,0,0,...,0,0,0,0,0,0,0,3,0,0
1035,Email 1036,3,0,0,0,0,0,0,3,0,...,0,0,0,0,0,0,0,0,0,0
1036,Email 1037,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1037,Email 1038,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1038,Email 1039,11,4,0,0,4,6,11,3,3,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5167,Email 5168,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5168,Email 5169,11,3,1,3,5,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5169,Email 5170,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5170,Email 5171,1,0,0,4,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,1


## Step 2: Get probabilities that any one email in the training data is either spam or ham

In the labelled dataset, count the number of spam and ham emails.

$P(Spam) = \frac{Spam\,Emails}{Total\,Emails}$

In [9]:
spam_proportion = training_data['Prediction'].value_counts()[1] / training_data.shape[0]
spam_proportion

0.291445142580957

$P(Spam) = \frac{Ham\,Emails}{Total\,Emails}$

In [10]:
ham_proportion = training_data['Prediction'].value_counts()[0] / training_data.shape[0]
ham_proportion

0.708554857419043

## Step 3: Get the "spamicity" probability of each word in the testing data email

**w** = word
<br>**vocab** = total words in dataset
<br>**spam_vocab**
<br>**wi_spam_count**

Count all unique words in the labelled dataset to get **vocab**.

Count the total number of words in labelled spam emails (ignoring uniqueness) to get **spam_vocab**.

For each word **w**, count all instances of the word in the spam emails to get **wi_spam_count**.

Calculate spamicity of each word and store the word and its spamicity in a dictionary

$P(w_{i}|Spam) = \frac{wi\_spam\_count\,+\,\alpha}{spam\_vocab\,+\,\alpha \cdot vocab}$

$\alpha$ is a coefficient that prevents a probability from being 0.


In [11]:
# Subtract 2 for "Email No." and "Prediction" columns
total_vocab = len(training_data.columns) - 3
total_vocab

2865

In [12]:
spam_training_emails = training_data.loc[training_data[PREDICTION] == 1]

total_spam_words = count_vocab(spam_training_emails)
total_spam_words

987979

In [13]:
ham_training_emails = training_data.loc[training_data[PREDICTION] == 0]

total_ham_words = count_vocab(ham_training_emails)
total_ham_words

1754023

In [14]:
word_spamicities = build_word_spamicity_dict(spam_training_emails, total_vocab, total_spam_words)
word_spamicities

{'ect': 0.003092313219840863,
 'hou': 0.0007559212146412553,
 'enron': 1.0092406069976707e-06,
 'com': 0.003500046425067922,
 'gas': 0.00023111609900246659,
 'deal': 0.00020184812139953414,
 'meter': 3.532342124491847e-05,
 'hpl': 3.027721820993012e-06,
 'please': 0.00042489029554601933,
 'e': 0.11977869371969756,
 'corp': 0.00018368179047357607,
 'know': 0.0002068943244345225,
 'need': 0.00031589230999027094,
 'forwarded': 2.0184812139953414e-06,
 'new': 0.0007983093201351575,
 'may': 0.0004117701676550496,
 'j': 0.004014759134636734,
 'mmbtu': 1.0092406069976707e-06,
 'get': 0.0006438955072645139,
 'see': 0.00036736358094715215,
 'price': 0.0006126090484475861,
 'daren': 1.0092406069976707e-06,
 'company': 0.0006842651315444207,
 'l': 0.04481331067251757,
 'let': 0.0005873780332726444,
 'would': 0.0001756078656175947,
 'xls': 3.027721820993012e-06,
 'farmer': 1.2110887283972048e-05,
 'attached': 1.2110887283972048e-05,
 'us': 0.0038785116526920483,
 'information': 0.00044507510768597

In [15]:
word_hamicities = build_word_spamicity_dict(ham_training_emails, total_vocab, total_ham_words)
word_hamicities

{'ect': 0.00957374630596828,
 'hou': 0.003967811266284476,
 'enron': 0.00320566820423385,
 'com': 0.002312042657243945,
 'gas': 0.0013734512387812996,
 'deal': 0.0015834817017362519,
 'meter': 0.001171389411277213,
 'hpl': 0.0016273091967160115,
 'please': 0.001186188305685963,
 'e': 0.12650038021774865,
 'corp': 0.0008219078279321163,
 'know': 0.0006431827185341354,
 'need': 0.0007245766377822605,
 'forwarded': 0.0005321910104685103,
 'new': 0.0004792565035449044,
 'may': 0.0004838100095168275,
 'j': 0.006497283833687748,
 'mmbtu': 0.0006067546707587507,
 'get': 0.00048153325653086597,
 'see': 0.0005504050343562026,
 'price': 0.00045364303245283707,
 'daren': 0.0008811034055671164,
 'company': 0.00022995205158211565,
 'l': 0.04501994435615702,
 'let': 0.0007109161198664912,
 'would': 0.0003790793721625966,
 'xls': 0.0005117002335948564,
 'farmer': 0.0006033395412798083,
 'attached': 0.0005264991280036063,
 'us': 0.0028624476916001475,
 'information': 0.0002806098055197599,
 'message':

## Step 4: Calculate the "spamicity" of the email

Multiply spamicities of each word together to get $\prod_{i=1}^{n}P(w_{i}|Spam)$.

Multiply that product by the probability that any email is spam.

In [16]:
test_data_spam_map = calculate_emails(testing_data, word_spamicities, spam_proportion)
test_data_spam_map
        

{0: -179.61432490192018,
 1: -6740.299004313841,
 2: -399.3454046464428,
 3: -3161.236800358932,
 4: -3182.850632258912,
 5: -3640.9152859930027,
 6: -1904.1300681210791,
 7: -1485.914384481864,
 8: -1359.6331776195916,
 9: -3810.8478602100377,
 10: -5672.469100679174,
 11: -10719.805406608773,
 12: -6476.893440993861,
 13: -3280.4274236490346,
 14: -1991.5813269507225,
 15: -2202.1062990420423,
 16: -535.4639019188824,
 17: -11293.797726987428,
 18: -1164.8083289376239,
 19: -2600.014329810607,
 20: -818.4708445376772,
 21: -2247.506810113112,
 22: -2878.4902093950022,
 23: -878.4811861663345,
 24: -543.0075469050573,
 25: -12673.695435577505,
 26: -4100.642519687362,
 27: -1225.941643243431,
 28: -5629.353251030424,
 29: -6315.000207294861,
 30: -1893.083646427277,
 31: -470.4723593430032,
 32: -12576.846137129274,
 33: -3804.203354169468,
 34: -2820.8433038777966,
 35: -1640.8623436966002,
 36: -3812.630472463249,
 37: -1361.9100180101573,
 38: -1575.9630019555034,
 39: -2902.122030

In [17]:
test_data_ham_map = calculate_emails(testing_data, word_hamicities, ham_proportion)
test_data_ham_map

{0: -176.1558197276228,
 1: -6432.040926243978,
 2: -357.4125072309575,
 3: -2932.9003762387442,
 4: -2913.709629296239,
 5: -3752.1895701727603,
 6: -1801.0950681730978,
 7: -1564.2417449495445,
 8: -1304.0298736554957,
 9: -3591.05565480693,
 10: -5412.766873803432,
 11: -10351.138292196694,
 12: -6327.840074164931,
 13: -3088.7977330495573,
 14: -1826.4557781774565,
 15: -2134.5749365293527,
 16: -533.7845039865206,
 17: -11515.18238563699,
 18: -1095.4809534490412,
 19: -2394.443028803516,
 20: -769.3621065024284,
 21: -2143.448049792175,
 22: -2573.9186680346565,
 23: -840.2932365736452,
 24: -494.24651965265286,
 25: -13184.325380481701,
 26: -3920.921664650669,
 27: -1154.723351204787,
 28: -5454.799319145284,
 29: -6033.216248103177,
 30: -1862.820790556962,
 31: -488.7391411044878,
 32: -11877.2600497202,
 33: -3576.0898426953004,
 34: -2688.6323941868072,
 35: -1554.9051630886188,
 36: -3649.6700298780916,
 37: -1256.3367958928202,
 38: -1529.3516605789544,
 39: -2955.0811985

In [18]:
for key in test_data_spam_map:
    if test_data_spam_map[key] >= test_data_ham_map[key]:
        testing_data[CLASSIFICATION].loc[testing_data.index[key]] = 1
    else:
        testing_data[CLASSIFICATION].loc[testing_data.index[key]] = 0
        
testing_data

Unnamed: 0,Email No.,ect,hou,enron,com,gas,deal,meter,hpl,please,...,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction,Classiciation
0,Email 1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,24,27,1,3,1,0,0,0,2,...,0,0,0,0,0,0,1,0,0,0
2,Email 3,1,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,22,10,0,0,0,2,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,17,9,0,0,2,0,3,0,1,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1029,Email 1030,4,0,0,2,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0
1030,Email 1031,18,11,59,7,0,0,0,0,4,...,0,0,2,0,0,0,2,0,0,0
1031,Email 1032,1,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1032,Email 1033,1,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,2,0,1,1


In [19]:
score = calculate_accuracy(testing_data)
f'Accuracy: {score}%'

'Accuracy: 93.23017408123792%'

## Step 5: Repeat steps 2-4 calculating the "hamicity" of each email

## Step 6: Label emails in test dataset and compare

A probability greater then 0.5 will indicate whether an email is ham or spam.