In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
import numpy as np
from sklearn.linear_model import SGDClassifier

### Here we'll be exploring the Newsgroup dataset. The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups.

###### It is a popular machine learning dataset, and is thus readily available. Load up the data for 4 types of newsgroups here

In [2]:
categories = ['alt.atheism', 
              'talk.religion.misc',
              'comp.graphics', 
              'sci.space']
data_train = fetch_20newsgroups(subset='train', categories=categories,
                              #  remove=('headers', 'footers', 'quotes')
                               )
data_test = fetch_20newsgroups(subset='test', categories=categories,
                              #remove=('headers', 'footers', 'quotes'),
                              )

###### Create a training and testing dataset

In [357]:
X_train = data_train.data
y_train = data_train.target
cat = data_train.target_names

X_test = data_test.data
y_test = data_test.target

###### Vectorize and fit a Bernoulli Naive Bayes Classifier

In [362]:
cv_b = CountVectorizer(binary=True)
cv_b.fit(X_train)
X_train_b = cv_b.transform(X_train)
X_test_b = cv_b.transform(X_test)

In [363]:
clf_b = BernoulliNB()

In [379]:
clf_b.fit(X_train_b,y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

###### How does the classifier perform?

In [365]:
f1_score(clf_b.predict(X_train_b), y_train, average='weighted')

0.89432415346957739

In [366]:
f1_score(clf_b.predict(X_test_b), y_test, average='weighted')

0.77972936148164518

###### Let's try a multinomial model

In [373]:
cv_m = CountVectorizer()
X_train_m = cv_m.fit_transform(X_train)
X_test_m = cv_m.transform(X_test)

In [374]:
clf_m = MultinomialNB().fit(X_train_m, y_train)

In [375]:
print f1_score(clf_m.predict(X_train_m), y_train, average='weighted')
print f1_score(clf_m.predict(X_test_m), y_test, average='weighted')


0.987240083811
0.899565202862


###### Ok, so we know multinomial seems to do better. Can we figure out how to improve it?

In [376]:
def showtop10(vectorizer, clf, categories):
    fn = np.asarray(vectorizer.get_feature_names())
    for i, cat in enumerate(categories):
        top10 = np.argsort(clf.coef_[i])[-10:]
        print('%s: %s:' % (cat, ' '.join(fn[top10])))


In [377]:
showtop10(cv_m, clf_m, cat)

alt.atheism: not you it in and that is to of the:
comp.graphics: you from it for in is and of to the:
sci.space: space it for that is in and to of the:
talk.religion.misc: not it you is in that and to of the:


In [378]:
showtop10(cv_b, clf_b, cat)

alt.atheism: and to organization in of re the subject lines from:
comp.graphics: for in and of to the organization subject from lines:
sci.space: is and in of to the organization lines subject from:
talk.religion.misc: is and in to of organization the subject lines from:
