In [47]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import numpy as np
from sklearn.linear_model import SGDClassifier

### Here we'll be exploring the 20 Newsgroups dataset, a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

###### It is a popular machine learning dataset, and is thus readily available. Load the data for four of the newsgroups here.

In [29]:
# The four newsgroups this notebook works with.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Fetch the train and test subsets with identical settings; the `remove`
# argument asks the loader to drop headers, footers and quoted text.
data_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)
data_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

###### Create a training and testing dataset

In [30]:
# Training split: documents, their targets, and the target (category) names.
X_train, y_train = data_train.data, data_train.target
cat = data_train.target_names

# Test split: documents and their targets.
X_test, y_test = data_test.data, data_test.target

###### Vectorize and fit a Bernoulli Naive Bayes Classifier

In [42]:
# Binary bag-of-words vectorizer (binary=True) with English stop words
# filtered out; these features feed the Bernoulli model.
cv_b = CountVectorizer(binary=True, stop_words='english')
# Learn the vocabulary from the training documents and vectorize them in a
# single pass (instead of a separate fit() followed by transform()), then
# vectorize the test documents with the same fitted vocabulary.
X_train_b = cv_b.fit_transform(X_train)
X_test_b = cv_b.transform(X_test)

In [43]:
# Bernoulli naive Bayes classifier, constructed with no arguments
# (i.e. the library's default settings).
clf_b = BernoulliNB()

In [44]:
# Fit the Bernoulli model on the vectorized training documents; as the last
# expression in the cell, the call's return value is what the cell displays.
clf_b.fit(X_train_b,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

###### How does the classifier perform?

In [45]:
# Weighted-average F1 on the training split, i.e. on the same documents the
# Bernoulli model was fit on.
f1_score(
    y_true=y_train,
    y_pred=clf_b.predict(X_train_b),
    average='weighted',
)

0.72720620165021255

In [46]:
# Weighted-average F1 on the test split, which the Bernoulli model was not
# fit on.
f1_score(
    y_true=y_test,
    y_pred=clf_b.predict(X_test_b),
    average='weighted',
)

0.5688258040546732

###### Let's try a multinomial model

In [36]:
# Bag-of-words vectorizer for the multinomial model: same English stop-word
# filtering as cv_b, but binary=True is not passed here.
cv_m = CountVectorizer(stop_words='english')
# Learn the vocabulary from the training documents and vectorize them in one
# call, then vectorize the test documents with that same fitted vectorizer.
X_train_m = cv_m.fit_transform(X_train)
X_test_m = cv_m.transform(X_test)

In [37]:
# MultinomialNB built with no arguments and fit on the cv_m training
# features in the same expression; clf_m is bound to the result of fit().
clf_m = MultinomialNB().fit(X_train_m, y_train)

In [38]:
# Weighted-average F1 of the multinomial model on the training split, then on
# the test split. The print(...) call form is valid on both Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(f1_score(y_train, clf_m.predict(X_train_m), average='weighted'))
print(f1_score(y_test, clf_m.predict(X_test_m), average='weighted'))


0.941948940172
0.777734750893


###### OK, so the multinomial model seems to do better. Can we figure out how to improve it?

In [39]:
def showtop10(vectorizer, clf, categories):
    """Print the ten highest-weighted vocabulary terms for each category.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` (newer scikit-learn) or
        ``get_feature_names()`` (older releases).
    clf : fitted classifier
        Must expose a per-class feature-weight matrix, either as
        ``feature_log_prob_`` (naive Bayes models) or as ``coef_``
        (e.g. linear models), with one row per category.
    categories : sequence of str
        Category names, in the same order as the rows of the weight matrix.

    Prints one line per category in the form ``"<category>: <terms>:"``.
    Terms are listed in ascending weight, so the strongest term comes last.
    """
    # get_feature_names() no longer exists on newer scikit-learn vectorizers;
    # prefer its replacement and fall back for older installations.
    if hasattr(vectorizer, 'get_feature_names_out'):
        names = vectorizer.get_feature_names_out()
    else:
        names = vectorizer.get_feature_names()
    fn = np.asarray(names)

    # Naive Bayes estimators no longer expose coef_ on newer scikit-learn;
    # feature_log_prob_ always has one row per class. Estimators without it
    # (e.g. linear models) still provide coef_.
    weights = getattr(clf, 'feature_log_prob_', None)
    if weights is None:
        weights = clf.coef_

    for i, cat in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print('%s: %s:' % (cat, ' '.join(fn[top10])))


In [40]:
# Highest-weighted terms per category for the multinomial model.
showtop10(cv_m, clf_m, cat)

alt.atheism: like believe say atheism does just think don people god:
comp.graphics: software images files data use file jpeg edu graphics image:
sci.space: just shuttle time orbit data like earth launch nasa space:
talk.religion.misc: know say think christian just bible don jesus people god:


In [41]:
# Highest-weighted terms per category for the Bernoulli model.
# NOTE(review): the recorded output below lists English stop words ("the",
# "to", "of", ...) even though cv_b is built with stop_words='english', and
# this cell's execution count precedes the cv_b cell's. The output looks
# stale; re-run the notebook top to bottom to refresh it.
showtop10(cv_b, clf_b, cat)

alt.atheism: not you in it is and that of to the:
comp.graphics: on that it is for in of and to the:
sci.space: on for that it is in and of to the:
talk.religion.misc: are not it in that is and to of the:
