In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer

import numpy as np

In [2]:
# Load the 20 Newsgroups posts. The subset is spelled out explicitly: 'train'
# is what the loader returns when no subset is requested, so this is only the
# training portion of the corpus, not every available post.
newsgroups_data = fetch_20newsgroups(subset='train')

# Bag-of-words vectorizer with default settings; it is fitted in a later cell.
vectorizer = CountVectorizer()

# Peek at the first raw post (headers included).
newsgroups_data.data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [4]:
# Fit the vectorizer on every loaded post; the resulting token -> index mapping
# is read back from `vectorizer.vocabulary_` in the cells that follow.
vectorizer.fit(newsgroups_data.data)

In [5]:
# Preview the first n (word, index) entries of the fitted vocabulary.
n = 10
for counter, (word, index) in enumerate(vectorizer.vocabulary_.items()):
    # Check the limit *before* printing so that exactly n entries are shown.
    if counter >= n:
        break
    print(word, index)

from 56979
lerxst 75358
wam 123162
umd 118280
edu 50527
where 124031
my 85354
thing 114688
subject 111322
what 123984
car 37780


In [6]:
# Encode only the first post: transform a one-document list, then densify the
# result and keep its single row as a 1-D array of per-token counts.
first_post_matrix = vectorizer.transform(newsgroups_data.data[:1])
a = first_post_matrix.toarray()[0]
a

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [7]:
# Size of the fitted vocabulary, i.e. how many distinct word -> index entries it holds.
len(vectorizer.vocabulary_)

130107

In [8]:
# Reload the same (training) posts, asking the loader to strip headers, footers
# and quoted replies so that only the message body is left.
parts_to_strip = ('headers', 'footers', 'quotes')
newsgroups_data_cleaned = fetch_20newsgroups(subset='train', remove=parts_to_strip)

# Show the first post again to see what the stripping left behind.
first_post_body = newsgroups_data_cleaned.data[0]
print(first_post_body)

I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [9]:
# Fit a *fresh* vectorizer on the cleaned posts. `fit` returns the estimator it
# was called on, so refitting the existing `vectorizer` would only alias it and
# silently overwrite its vocabulary, making the earlier cells give different
# results when re-run.
vectorizer_cleaned = CountVectorizer().fit(newsgroups_data_cleaned.data)

# Vocabulary size after stripping headers, footers and quotes.
len(vectorizer_cleaned.vocabulary_)

101631

So, we got rid of more than 28,000 words (130,107 → 101,631), but with more than 100,000 words left the vocabulary is still very large.

In [10]:
# Load the predefined train and test subsets with identical cleaning, so both
# have headers, footers and quoted replies stripped in the same way.
stripped_parts = ('headers', 'footers', 'quotes')
newgroups_train = fetch_20newsgroups(subset='train', remove=stripped_parts)
newgroups_test = fetch_20newsgroups(subset='test', remove=stripped_parts)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [12]:
# Bag-of-words features: fit the vectorizer on the training posts only, then
# encode the test posts with that same fitted vectorizer.
vectorizer = CountVectorizer()
train_data = vectorizer.fit_transform(newgroups_train.data)
test_data = vectorizer.transform(newgroups_test.data)

# Multinomial Naive Bayes (alpha=0.01) trained on the training counts and
# applied to the encoded test posts.
classifier = MultinomialNB(alpha=0.01).fit(train_data, newgroups_train.target)
predictions = classifier.predict(test_data)

# Score the predictions against the true test labels: plain accuracy plus
# macro-averaged F1.
true_labels = newgroups_test.target
accuracy_score = metrics.accuracy_score(true_labels, predictions)
f1_score = metrics.f1_score(true_labels, predictions, average='macro')

print("Accuracy Score: ", accuracy_score)
print("F1 score: ", f1_score)

Accuracy Score:  0.6460435475305364
F1 score:  0.6203806145034193
