In [1]:
# Load the Newsgroup Data
from sklearn.datasets import fetch_20newsgroups
texts = fetch_20newsgroups(subset='train')
dir(texts)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [2]:
# 11,314 posts
print(len(texts.target))
print(texts.target)
print(texts.target_names)

11314
[7 4 4 ... 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [3]:
# Mapping of classes to numbers
for i in range(20):
    print(i, texts.target_names[i])

0 alt.atheism
1 comp.graphics
2 comp.os.ms-windows.misc
3 comp.sys.ibm.pc.hardware
4 comp.sys.mac.hardware
5 comp.windows.x
6 misc.forsale
7 rec.autos
8 rec.motorcycles
9 rec.sport.baseball
10 rec.sport.hockey
11 sci.crypt
12 sci.electronics
13 sci.med
14 sci.space
15 soc.religion.christian
16 talk.politics.guns
17 talk.politics.mideast
18 talk.politics.misc
19 talk.religion.misc


In [4]:
texts.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [5]:
from textblob.tokenizers import word_tokenize
print(list(word_tokenize(texts.data[0].lower())))

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

In [6]:
num_training = 10000
num_testing = 300

# first get vocabulary. We are creating a vocabulary to limit the features,
# since each word will eventually be a feature.
# https://docs.python.org/2/library/collections.html#collections.Counter
all_text = ''
for i in range(num_training):
    all_text += texts.data[i].lower()

# make a list of words, we need to tokenize ourselves to get this list
from nltk.tokenize import word_tokenize
tokens = word_tokenize(all_text)
tokens = [token.lower() for token in tokens]

# get the most common words
import collections
cnt = collections.Counter(tokens).most_common()
vocab = [k for k,v in cnt if v >= 10]
print("The size of vocabulary is: ", len(vocab))
print(vocab[:20])
# checking in a set is much faster than checking in a list
vocab = set(vocab)

# Now create the training and testing data, filtering out words not in our vocabulary
# This is important because each word is a feature, and you don't want too many features
training_data = []
for i in range(num_training):
    tokens = word_tokenize(texts.data[i])
    item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])
    training_data.append((item_text, texts.target_names[texts.target[i]]))
testing_data = []
for i in range(num_training, num_training + num_testing):
    tokens = word_tokenize(texts.data[i])
    item_text = ' '.join([t.lower() for t in tokens if t.lower() in vocab])
    testing_data.append((item_text, texts.target_names[texts.target[i]]))
print(training_data[0])

The size of vocabulary is:  18279
['>', ',', 'the', '.', '--', ':', 'to', '(', ')', "'ax", 'of', '*', 'a', 'and', '@', 'i', 'in', 'is', 'that', '?']
("from : @ wam.umd.edu ( where 's my thing ) subject : what car is this ! ? nntp-posting-host : organization : university of maryland , college park lines : 15 i was wondering if anyone out there could enlighten me on this car i saw the other day . it was a sports car , looked to be from the late early 70s . it was called a . the doors were really small . in addition , the front bumper was separate from the rest of the body . this is all i know . if anyone can a model name , engine specs , years of production , where this car is made , history , or whatever info you have on this looking car , please e-mail . thanks , - il -- -- brought to you by your neighborhood -- --", 'rec.autos')


In [7]:
# The standard TextBlob Naive Bayes Classifier re-parses the whole text of the corpus for each record.
# This makes it slow.
from textblob.classifiers import NaiveBayesClassifier

# Train. Takes a while
cl = NaiveBayesClassifier(training_data)

In [8]:
# Shows what the features look like and what the important ones are
# Very helpful for debugging and understanding data
cl.show_informative_features(20)

Most Informative Features
       contains(windows) = True           comp.o : rec.sp =    235.0 : 1.0
          contains(sale) = True           misc.f : comp.w =    208.4 : 1.0
           contains(car) = True           rec.au : comp.w =    194.3 : 1.0
           contains(dod) = True           rec.mo : comp.w =    186.6 : 1.0
contains(nntp-posting-host) = True           talk.p : soc.re =    180.7 : 1.0
       contains(clipper) = True           sci.cr : misc.f =    180.2 : 1.0
          contains(chip) = True           sci.cr : sci.sp =    164.7 : 1.0
          contains(bike) = True           rec.mo : rec.sp =    162.2 : 1.0
    contains(encryption) = True           sci.cr : sci.el =    155.4 : 1.0
          contains(team) = True           rec.sp : rec.au =    151.0 : 1.0
           contains(gun) = True           talk.p : rec.sp =    149.1 : 1.0
        contains(israel) = True           talk.p : comp.w =    139.8 : 1.0
          contains(game) = True           rec.sp : sci.me =    134.7 : 

In [9]:
# Pretty good, baseline is 5% because we have 20 classes
print("Accuracy: ", float(cl.accuracy(testing_data)))

Accuracy:  0.7766666666666666


In [10]:
# Weird thing, it doesn't work well for short sentences.
# Maybe can't overcome prior because it was training on longer texts.
cl.classify('god christians jesus lord christian savior church')

'misc.forsale'

In [11]:
# We see it does better with the full text
for t in testing_data[:10]:
    print(t[0][:80])
    print("Predicted: {}, Actual: {}".format(cl.classify(t[0]), t[1]))

from : @ ( robert ) subject : re : sho and sc nntp-posting-host : organization :
Predicted: rec.autos, Actual: rec.autos
from : @ magnus.acs.ohio-state.edu ( kim richard man ) subject : syquest forsale
Predicted: misc.forsale, Actual: misc.forsale
from : @ casbah.acns.nwu.edu ( wilson ) subject : office package article-i.d . :
Predicted: comp.sys.mac.hardware, Actual: comp.os.ms-windows.misc
subject : re : do n't more innocents die without the death penalty ? from : bobb
Predicted: alt.atheism, Actual: alt.atheism
from : livesey @ solntze.wpd.sgi.com ( jon livesey ) subject : re : genocide is 
Predicted: alt.atheism, Actual: alt.atheism
from : @ ( david silver ) subject : re : fractal generation of clouds organizati
Predicted: comp.graphics, Actual: comp.graphics
subject : re : mike 's 1993 predictions from : gajarsky @ pilot.njin.net ( bob g
Predicted: rec.sport.baseball, Actual: rec.sport.baseball
from : jet @ ( j. eric ) subject : re : insurance and lotsa points ... in-reply-
Predic