# Coding a spam classifier with naive Bayes

### Importing packages

In [1]:
import numpy as np

### Loading the dataset
Run only one of the next two cells:
- Run the first cell if you cloned the GitHub repo
- Run the second cell if you opened this notebook in Google Colab

In [2]:
# IMPORTANT: ONLY RUN THIS CELL IF YOU HAVE CLONED THE REPO
import pandas as pd

# 'emails.csv' is a relative path, so it is resolved against the current
# working directory of the kernel.
emails = pd.read_csv('emails.csv')

# Preview the first ten rows
emails.head(10)

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [None]:
# IMPORTANT: ONLY RUN THIS CELL IF YOU ARE WORKING ON A COLAB

# pandas is imported here too: the only other import of it lives in the
# repo-only loading cell, which is skipped when running on Colab.
import pandas as pd

url = "https://raw.githubusercontent.com/luisguiserrano/manning/master/Chapter_08_Naive_Bayes/emails.csv"

# Bind the frame to `emails`, the name every later cell reads.
emails = pd.read_csv(url)
emails.head()

In [3]:
def process_email(text):
    """Return the distinct lowercase words of an email as a list.

    The text is lowercased and split on whitespace, so punctuation that is
    separated by spaces counts as a word of its own. Duplicates are removed
    by passing through a set, which means the order of the returned list is
    arbitrary.

    Args:
        text (str): raw text of the email.

    Returns:
        list[str]: each distinct whitespace-separated token, lowercased.
    """
    unique_words = set(text.lower().split())
    return list(unique_words)

# Add a 'words' column holding, for each email, the list of its distinct
# lowercase words (see process_email). This mutates `emails` in place.
emails['words'] = emails['text'].apply(process_email)

In [4]:
# First ten rows, now including the 'words' column
emails.head(10)

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[., clear, original, our, :, distinctive, busi..."
1,Subject: the stock trading gunslinger fanny i...,1,"[perspicuous, huzzah, inflexible, continuant, ..."
2,Subject: unbelievable new homes made easy im ...,1,"[454, ., to, 3, wanting, we, you, loan, our, p..."
3,Subject: 4 color printing special request add...,1,"[., an, printing, goldengraphix, our, e, irwin..."
4,"Subject: do not have money , get software cds ...",1,"[., it, cds, ?, to, compatibility, along, ', d..."
5,"Subject: great nnews hello , welcome to medzo...",1,"[introduce, ., 75, worldwide, total, op, v, to..."
6,Subject: here ' s a hot play in motion homela...,1,"[transportation, innovative, *, adverse, excha..."
7,Subject: save your money buy getting this thin...,1,"[imagine, it, ., has, viagra, iasts, ?, buy, t..."
8,Subject: undeliverable : home based business f...,1,"[75, ., original, :, business, undeliverable, ..."
9,Subject: save your money buy getting this thin...,1,"[imagine, it, ., has, viagra, ?, buy, to, 10, ..."


In [5]:
# Size of the corpus, and how many of its rows carry the label spam == 1
num_emails = len(emails)
num_spam = int(emails['spam'].sum())

print("Number of emails:", num_emails)
print("Number of spam emails:", num_spam)
print()

# Prior probability of spam: the share of spam emails in the whole corpus,
# before looking at any word
print("Probability of spam:", num_spam/num_emails)

Number of emails: 5728
Number of spam emails: 1368

Probability of spam: 0.2388268156424581


### Training a naive Bayes model

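Our plan is to build a dictionary that maps every word to a pair of counts: the number of spam emails and the number of ham (non-spam) emails in which the word appears. Both counts start at 1, so that no word ever ends up with a count of zero.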

In [6]:
# Maps each word to {'spam': ..., 'ham': ...}: how many emails of each class
# contain the word, with both counters starting at 1.
model = {}

# Training process
for row_id, email in emails.iterrows():
    # The label is the same for every word of one email, so choose the
    # counter to increment once per email.
    label = 'spam' if email['spam'] else 'ham'
    for word in email['words']:
        # A word seen for the first time gets the 1/1 starting counts
        counts = model.setdefault(word, {'spam': 1, 'ham': 1})
        counts[label] += 1

In [7]:
# Every count starts at 1 in the training loop, so a 'ham' value of 1 means
# the word never appeared in a ham email.
model['lottery']

{'spam': 9, 'ham': 1}

In [8]:
# A word that appears in both spam and ham emails
model['sale']

{'spam': 39, 'ham': 42}

### Using the model to make predictions

In [9]:
def predict_bayes(word):
    """Estimate the probability that an email containing `word` is spam.

    The estimate is the word's 'spam' count divided by the sum of its 'spam'
    and 'ham' counts, both read from the global `model`.

    Args:
        word (str): the word to look up; it is lowercased first.

    Returns:
        float: spam count / (spam count + ham count).

    Raises:
        KeyError: if the lowercased word is not a key of `model`.
    """
    counts = model[word.lower()]
    spam_hits = counts['spam']
    ham_hits = counts['ham']
    return spam_hits / (spam_hits + ham_hits)

In [10]:
# Share of the 'lottery' counts in `model` that belong to spam
predict_bayes('lottery')

0.9

In [11]:
# Share of the 'sale' counts in `model` that belong to spam
predict_bayes('sale')

0.48148148148148145

In [16]:
def predict_naive_bayes(email):
    """Return the naive Bayes estimate of the probability that `email` is spam.

    The text is lowercased and split on whitespace. Every distinct word that
    is a key of the global `model` contributes its 'spam' and 'ham' counts;
    words missing from `model` are ignored. When no word is known, the result
    is the prior, num_spam / total.

    The class scores are accumulated as sums of logarithms rather than as a
    product of ratios. A raw product overflows float64 on long emails (and the
    integer conversion of the resulting `inf` raises OverflowError), and the
    `np.compat.long` helper used for that conversion is not available in the
    NumPy 2.x namespace.

    Args:
        email (str): raw text of the message to classify.

    Returns:
        float: a value in [0, 1]; higher means more likely spam. If the
        training data contains no spam emails the result is 0.0, and if it
        contains no ham emails the result is 1.0.
    """
    total = len(emails)
    num_spam = int(emails['spam'].sum())
    num_ham = total - num_spam
    if num_spam == 0 or num_ham == 0:
        # With a single class in the training data the prior is already 0 or
        # 1; the per-word ratios would divide by zero and cannot change it.
        return 1.0 if num_spam > 0 else 0.0

    log_num_spam = np.log(num_spam)
    log_num_ham = np.log(num_ham)

    # Each score starts at log(class size). Dividing by `total` would turn it
    # into the prior, but that factor is common to both classes and cancels
    # in the final ratio, so it is left out.
    log_spam = log_num_spam
    log_ham = log_num_ham

    words = set(email.lower().split())
    # Sorted so the floating-point sums are accumulated in the same order on
    # every run (set iteration order of strings is not stable across runs).
    for word in sorted(words):
        if word in model:
            # log P(word | class) = log(count in class) - log(class size).
            # NOTE(review): assumes counts are >= 1, as produced by the
            # training cell, which starts every count at 1.
            log_spam += np.log(model[word]['spam']) - log_num_spam
            log_ham += np.log(model[word]['ham']) - log_num_ham

    # spam / (spam + ham), evaluated in log space to avoid overflow
    return float(np.exp(log_spam - np.logaddexp(log_spam, log_ham)))

In [17]:
# Two known words combined: on its own 'lottery' leans strongly towards spam,
# while 'sale' is close to neutral.
predict_naive_bayes('lottery sale')

0.9638144992048691

In [18]:
# An everyday greeting; the recorded score is well below 0.5
predict_naive_bayes('Hi mom how are you')

0.12554358867164464

In [19]:
# Same greeting with mixed case plus one extra token. The recorded score is
# unchanged: the text is lowercased before lookup, and words that are not in
# `model` are skipped (the extra token is apparently not a key of `model`).
predict_naive_bayes('Hi MOM how aRe yoU afdjsaklfsdhgjasdhfjklsd')

0.12554358867164464

In [20]:
# A plain logistics message; the recorded score is close to 0
predict_naive_bayes('meet me at the lobby of the hotel at nine am')

6.964603508395961e-05

In [21]:
# Lottery-style wording; the recorded score is close to 1
predict_naive_bayes('enter the lottery to win three million dollars')

0.9995234218677428

In [22]:
# Several sales-pitch words at once; the recorded score is close to 1
predict_naive_bayes('buy cheap lottery easy money now')

0.999973472265966

In [23]:
# A book title and an author name; the recorded score is around 0.42, i.e.
# no strong signal either way
predict_naive_bayes('Grokking Machine Learning by Luis Serrano')

0.4197107645488719

In [24]:
# A text whose only token is apparently not in `model`: no word contributes,
# so the recorded result equals the prior "Probability of spam" printed
# earlier in the notebook.
predict_naive_bayes('asdfgh')

0.2388268156424581