# Coding a spam classifier with naive Bayes

### 1. Imports and pre-processing data

We load the data into a Pandas DataFrame, and then preprocess it by adding a string with the (non-repeated) lowercase words in the email.

In [217]:
import numpy as np

In [218]:
import pandas as pd
emails = pd.read_csv('emails.csv')

In [219]:
emails[:10]

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
5,"Subject: great nnews hello , welcome to medzo...",1
6,Subject: here ' s a hot play in motion homela...,1
7,Subject: save your money buy getting this thin...,1
8,Subject: undeliverable : home based business f...,1
9,Subject: save your money buy getting this thin...,1


In [220]:
def process_email(text):
    text = text.lower()
    return list(set(text.split()))

emails['words'] = emails['text'].apply(process_email)

In [221]:
emails[:10]

Unnamed: 0,text,spam,words
0,Subject: naturally irresistible your corporate...,1,"[specially, surethat, without, in, no, easier,..."
1,Subject: the stock trading gunslinger fanny i...,1,"[trading, like, segovia, group, incredible, tr..."
2,Subject: unbelievable new homes made easy im ...,1,"[loan, in, factor, visit, way, no, of, at, cre..."
3,Subject: 4 color printing special request add...,1,"[irwindale, !, special, printable, rd, mail, :..."
4,"Subject: do not have money , get software cds ...",1,"[tradgedies, !, yet, cds, d, great, ain, old, ..."
5,"Subject: great nnews hello , welcome to medzo...",1,"[total, !, welcome, andmanyother, 5, in, ieadi..."
6,Subject: here ' s a hot play in motion homela...,1,"[looking, aiready, results, mind, advancing, w..."
7,Subject: save your money buy getting this thin...,1,"[like, 36, !, iasts, hours, can, yet, in, adva..."
8,Subject: undeliverable : home based business f...,1,"[32, following, (, recognized, 0, 7059, c, 23,..."
9,Subject: save your money buy getting this thin...,1,"[like, 36, !, hours, can, yet, in, advantages,..."


In [222]:
num_emails = len(emails)
num_spam = sum(emails['spam'])

print("Number of emails:", num_emails)
print("Number of spam emails:", num_spam)
print()

# Calculating the prior probability that an email is spam
print("Probability of spam:", num_spam/num_emails)

Number of emails: 5728
Number of spam emails: 1368

Probability of spam: 0.2388268156424581


### 2. Training a naive Bayes model

Our plan is to write a dictionary, and in this dictionary record every word, and its pair of occurrences in spam and ham

In [223]:
model = {}

# Training process
for index, email in emails.iterrows():
    for word in email['words']:
        if word not in model:
            model[word] = {'spam': 1, 'ham': 1}
        if word in model:
            if email['spam']:
                model[word]['spam'] += 1
            else:
                model[word]['ham'] += 1

In [224]:
model['lottery']

{'spam': 9, 'ham': 1}

In [225]:
model['sale']

{'spam': 39, 'ham': 42}

### 3. Using the model to make predictions

In [226]:
def predict_bayes(word):
    word = word.lower()
    num_spam_with_word = model[word]['spam']
    num_ham_with_word = model[word]['ham']
    return 1.0*num_spam_with_word/(num_spam_with_word + num_ham_with_word)

In [227]:
predict_bayes('lottery')

0.9

In [228]:
predict_bayes('sale')

0.48148148148148145

In [229]:
def predict_naive_bayes(email):
    total = len(emails)
    num_spam = sum(emails['spam'])
    num_ham = total - num_spam
    email = email.lower()
    words = set(email.split())
    spams = [1.0]
    hams = [1.0]
    for word in words:
        if word in model:
            spams.append(model[word]['spam']/num_spam*total)
            hams.append(model[word]['ham']/num_ham*total)
    prod_spams = np.long(np.prod(spams)*num_spam)
    prod_hams = np.long(np.prod(hams)*num_ham)
    print((prod_spams + prod_hams))
    return prod_spams/(prod_spams + prod_hams)

In [230]:
predict_naive_bayes('lottery sale')

8734410


np.float64(0.9638144992048691)

In [231]:
predict_naive_bayes('Hi mom how are you')

0


  prod_spams = np.long(np.prod(spams)*num_spam)
  prod_hams = np.long(np.prod(hams)*num_ham)
  print((prod_spams + prod_hams))
  return prod_spams/(prod_spams + prod_hams)
  return prod_spams/(prod_spams + prod_hams)


np.float64(-inf)

In [232]:
predict_naive_bayes('Hi MOM how aRe yoU afdjsaklfsdhgjasdhfjklsd')

0


  prod_spams = np.long(np.prod(spams)*num_spam)
  prod_hams = np.long(np.prod(hams)*num_ham)
  print((prod_spams + prod_hams))
  return prod_spams/(prod_spams + prod_hams)
  return prod_spams/(prod_spams + prod_hams)


np.float64(-inf)

In [233]:
predict_naive_bayes('meet me at the lobby of the hotel at nine am')

0


  prod_spams = np.long(np.prod(spams)*num_spam)
  prod_hams = np.long(np.prod(hams)*num_ham)
  print((prod_spams + prod_hams))
  return prod_spams/(prod_spams + prod_hams)
  return prod_spams/(prod_spams + prod_hams)


np.float64(-inf)

In [234]:
predict_naive_bayes('enter the lottery to win three million dollars')

0


  prod_spams = np.long(np.prod(spams)*num_spam)
  prod_hams = np.long(np.prod(hams)*num_ham)
  print((prod_spams + prod_hams))
  return prod_spams/(prod_spams + prod_hams)
  return prod_spams/(prod_spams + prod_hams)


np.float64(-inf)

In [235]:
predict_naive_bayes('buy cheap lottery easy money now')

0


  prod_spams = np.long(np.prod(spams)*num_spam)
  prod_hams = np.long(np.prod(hams)*num_ham)
  print((prod_spams + prod_hams))
  return prod_spams/(prod_spams + prod_hams)
  return prod_spams/(prod_spams + prod_hams)


np.float64(-inf)

In [236]:
predict_naive_bayes('Grokking Machine Learning by Luis Serrano')

0


  prod_spams = np.long(np.prod(spams)*num_spam)
  prod_hams = np.long(np.prod(hams)*num_ham)
  print((prod_spams + prod_hams))
  return prod_spams/(prod_spams + prod_hams)
  return prod_spams/(prod_spams + prod_hams)


np.float64(-inf)

In [237]:
predict_naive_bayes('asdfgh')

5728


np.float64(0.2388268156424581)

# Naive Bayes using Scikit-learn

In [238]:
import sklearn as sk

In [239]:
emails = pd.read_csv('emails.csv')
emails

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [240]:
train_x, test_x, train_y, test_y = sk.model_selection.train_test_split(emails["text"], emails["spam"], test_size=0.2)

In [241]:
text_vectorizer = sk.feature_extraction.text.CountVectorizer()
train_x = text_vectorizer.fit_transform(train_x)

In [242]:
test_x = text_vectorizer.transform(test_x)

In [243]:
classifier = sk.naive_bayes.MultinomialNB()
classifier.fit(train_x, train_y)

In [244]:
classifier.predict(test_x)

array([0, 0, 1, ..., 0, 0, 0], shape=(1146,))

In [245]:
print(classifier.score(test_x, test_y))

0.993891797556719


Получили неплохой скор(точность), посмотрим полноту

In [246]:
prediction = classifier.predict(test_x)
recall = sk.metrics.recall_score(test_y, prediction)
recall

0.9884169884169884