# Naive Bayes (the easy way)

We'll cheat by using sklearn.naive_bayes to train a spam classifier! Most of the code is just loading our training data into a pandas DataFrame that we can play with:

In [12]:
import os
import io
import numpy
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            inBody = False
            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                if inBody:
                    lines.append(line)
                elif line == '\n':
                    inBody = True
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)

data = DataFrame({'message': [], 'class': []})

data = data.append(dataFrameFromDirectory('emails/spam','spam'))
data = data.append(dataFrameFromDirectory('emails/ham','ham'))


Let's have a look at that DataFrame:

In [13]:
data.head(2)

Unnamed: 0,class,message
emails/spam\00001.7848dde101aa985090474a91ec93fcf0,spam,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr..."
emails/spam\00002.d94f1b97e48ed3b553b3508d116e6a09,spam,1) Fight The Risk of Cancer!\n\nhttp://www.adc...


Now we will use a CountVectorizer to split up each message into its list of words, and throw that into a MultinomialNB classifier. Call fit() and we've got a trained spam filter ready to go! It's just that easy.

In [7]:
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

Let's try it out:

In [8]:
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions

array(['spam', 'ham'],
      dtype='<U4')

## Activity

Our data set is small, so our spam classifier isn't actually very good. Try running some different test emails through it and see if you get the results you expect.

If you really want to challenge yourself, try applying train/test to this spam classifier - see how well it can predict some subset of the ham and spam emails.

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(train['message'].values)
classifier = MultinomialNB()
targets = train['class'].values
classifier.fit(counts, targets)
examples = test['message']
example_counts = vectorizer.transform(examples)
final= DataFrame({'message': test['message'], 'class': test['class'], 'prdct':classifier.predict(example_counts)})
final['status']=final['class']==final['prdct']
final.groupby('status').count()

Unnamed: 0_level_0,class,message,prdct
status,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,30,30,30
True,570,570,570


In [18]:
vectorizer = CountVectorizer()
 
counts = vectorizer.fit_transform(data['message'].values)
 
classifier = MultinomialNB()
targets = data['class'].values
 
#Create .75 train/ .25 test split
X_train, X_test, y_train, y_test = train_test_split(counts, targets, test_size=0.25, random_state=35)
 
classifier.fit(X_train, y_train)
 
#Measure accuracy of classifier on test data
classifier.score(X_test, y_test, sample_weight=None)

0.95999999999999996

In [32]:
# Now split data into training (80 %) and test data sets (20 %) - TRAINING
train, test   = train_test_split(data, test_size=0.2)
train_counts  = vectorizer.fit_transform(train['message'].values)
targets       = train['class'].values
classifier.fit(train_counts, targets)

# Now test on TEST data
examples = test['message']
examples = np.array(examples)
#test_counts = vectorizer.transform(test)
test_counts = vectorizer.transform(examples)
predictions = classifier.predict(test_counts)
print (predictions)

test         = test[['class']]
test['pred'] = predictions

print ("Length of test data set       :{}".format(len(test)))
print ("% of correct classifications  :{}".format(100*len(test[test['class'] == test['pred']])/len(test)))
print ("% of incorrect classifications:{}".format(100. * len(test[(test['pred'] == 'spam') & (test['class']=='ham')]) /len(test[test['pred']=='spam'])))

['ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'spam' 'ham' 'ham'
 'ham' 'spam' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'spam' 'ham' 'spam' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'spam'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham' 'ham' 'ham' 'ham' 'spam' 'ham'
 'spam' 'ham' 'ham' 'ham' 'spam' 'ham' 'spam' 'spam' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham'
 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ham' 'ha

In [31]:
#Number classified as spam
Length_ClassifiedAsSpam = len(test[test['pred']== 'spam'])

#Probability of being ham given that it is classified as spam
P_ham_given_classified_as_spam = 100. * len(test[(test['pred'] == 'spam') & (test['class'] == 'ham')]) / Length_ClassifiedAsSpam

#print result to screen
print ("P(ham|classified_as_spam) = " , P_ham_given_classified_as_spam, "%")

P(ham|classified_as_spam) =  1.2345679012345678 %
