# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [1]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('emails/spam', 'spam'))
data = data.append(dataFrameFromDirectory('emails/ham', 'ham'))


Let's have a look at that DataFrame:

In [4]:
data.head()

Unnamed: 0,message,class
emails/spam/00400.cc74b7994a7282f32ee2a3b7e3634d31,Our delightful garden ornaments combine the fi...,spam
emails/spam/00303.22239f1393297a691eb5df3dfe7a5001,------=_NextPart_000_00E4_17A73C2D.E7104E07\n\...,spam
emails/spam/00157.52b0a260de7c64f539b0e5d16198b5bf,I am Mr.IKE EJOH. Bank Manager of Diamond Bank...,spam
emails/spam/00233.a268478ca6f03604012ffff8dd3de396,<HTML><HEAD><TITLE></TITLE><META http-equiv=3D...,spam
emails/spam/00401.309e29417819ce39d8599047d50933cc,A great sponsor will not make you money.\n\nA ...,spam


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [5]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [11]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'], dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split

data.tail()

Unnamed: 0,message,class
emails/ham/00076.5aa682e393bfbef53e244acf3b2d23d6,"On Fri, 23 Aug 2002, Tom wrote:\n\n\n\n--]\n\n...",ham
emails/ham/00629.370fec99ddca8da57ef5cb0bf30375e5,"Church, AA, same diff?\n\n\n\n;-).\n\n\n\nChee...",ham
emails/ham/00828.709e1ec58a2bf04455cdf5c0c83f444c,"Hello Bill,\n\n\n\nMonday, September 30, 2002,...",ham
emails/ham/00724.06d186a9890c1bc07b1e0bd89b7efb8f,">>>>> ""J"" == Jim Whitehead <ejw@cse.ucsc.edu> ...",ham
emails/ham/00424.9975dd35a0bc8834d9ccd7dfb27ae7e6,\n\nOur preschoolers (2 and 4) use Winamp with...,ham


In [63]:
# shuffle your dataframe in-place and reset the index
# Here, specifying drop=True prevents .reset_index from creating a column containing the old index entries.
data2 = data.sample(frac=1).reset_index(drop=True)
data2.head()

Unnamed: 0,message,class
0,"On Tue, 1 Oct 2002, Angles Puglisi wrote:\n\n\...",ham
1,1) Lose 22.5lbs in 3 weeks!\n\nFlush Fat Away ...,spam
2,"On Wed, 18 Sep 2002, Tom wrote:\n\n\n\n> The o...",ham
3,"URL: http://www.newsisfree.com/click/-1,862212...",ham
4,This is a multi-part message in MIME format.\n...,spam


In [64]:
# create training and testing vars
Msg_train, Msg_test = train_test_split(data2, test_size=0.2)
print(Msg_train.shape)
print(Msg_test.shape)

(2400, 2)
(600, 2)


In [65]:
Msg_test.head()

Unnamed: 0,message,class
1826,> Gary Lawrence Murphy wrote:\n\n> >and say he...,ham
298,Scott MacKenzie wrote:\n\n\n\n>There is a soft...,ham
1128,--==_Exmh_1920300774P\n\nContent-Type: text/pl...,ham
384,"URL: http://www.newsisfree.com/click/-6,857278...",ham
292,"On 14:22 29 Aug 2002, Matthias Saou <matthias@...",ham


In [66]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(Msg_train['message'].values)

classifier = MultinomialNB()
targets = Msg_train['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [67]:
examples = Msg_test['message'].head()
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['ham', 'ham', 'ham', 'ham', 'ham'], dtype='<U4')

#### Hence verified by Msg_test.head() given and predicted data is same