# Scikit-Learn Naive Bayes Text Classifier

Using Scikit-Learn's Naive Bayes module to distinguish between emails, e.g. how effective is the classifier at distinguishing between emails about hockey vs those about tech.

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

emails = fetch_20newsgroups()
print(emails.target_names) # display different categories

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [2]:
# examine the dataset
# emails = fetch_20newsgroups()
# print(emails.data[5])

# email labels
# print(emails.target[5])
# print(emails.target_names)

# categories of interest
categories = ['comp.os.ms-windows.misc', 'comp.windows.x']

In [3]:
# split data into training and test sets, adding a 'random_state' ensures that your 
# data is split in the same way each time you run your code
train_emails = fetch_20newsgroups(
  categories=categories, 
  subset='train',
  shuffle=True,
  random_state=108
)

test_emails = fetch_20newsgroups(
  categories=categories, 
  subset='test',
  shuffle=True,
  random_state=108
)

In [4]:
#  transform these emails into lists of word counts using the 'CountVectorizer' object
counter = CountVectorizer()

# create dataset with all the possible words that can exist in our emails
counter.fit(test_emails.data + train_emails.data)

# make a list of the counts of our words in our training & test set.
train_counts = counter.transform(train_emails.data)
test_counts = counter.transform(test_emails.data)

In [5]:
# scikit's NB classifier
classifier = MultinomialNB()

# train our model - takes 2 args, training_set & the labels associated 
# with the training emails
classifier.fit(train_counts, train_emails.target)

# measure the accuracy of the NB classifier using .score(), takes the the 
# test set and the test labels
# returns the percentage of classifications a classifier correctly made.
print(classifier.score(test_counts, test_emails.target))

0.5044359949302915


In [6]:
# TEST ===================================================================
# categories = ['rec.sport.baseball', 'rec.sport.hockey'] --> 0.9723618090452262 
# --> does a good job of distingushing between baseball and hock emails

# categories = ['comp.sys.ibm.pc.hardware', 'rec.sport.hockey'] --> 0.9974715549936789

# categories = ['sci.space', 'sci.electronics'] --> 0.9796696315120712

# categories = ['comp.os.ms-windows.misc', 'comp.windows.x'] --> 0.5044359949302915