In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
# Newsgroup names to load; passed as the `categories` filter when the
# dataset is fetched.
categories = [
    "alt.atheism",
    "soc.religion.christian",
    "comp.graphics",
    "sci.med",
]


In [3]:
# Fetch the training split restricted to the selected categories. The
# documents are shuffled with a fixed seed so the order is repeatable.
twenty_train = fetch_20newsgroups(
    subset="train",
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [4]:
# Category names of the loaded dataset; the integer targets index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [5]:
# Number of documents and number of source file names (one per document).
len(twenty_train.data), len(twenty_train.filenames)

(2257, 2257)

In [None]:
# Show only the first three lines of the first training document.
first_doc_lines = twenty_train.data[0].split("\n")
print("\n".join(first_doc_lines[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [None]:
# Print document 50 in full. Splitting on "\n" and re-joining with "\n"
# returns the same string, so the text is printed directly.
print(twenty_train.data[50])

From: ab@nova.cc.purdue.edu (Allen B)
Subject: Re: TIFF: philosophical significance of 42
Organization: Purdue University
Lines: 39

In article <prestonm.735400848@cs.man.ac.uk> prestonm@cs.man.ac.uk (Martin  
Preston) writes:
> Why not use the PD C library for reading/writing TIFF files? It took me a
> good 20 minutes to start using them in your own app.

I certainly do use it whenever I have to do TIFF, and it usually works
very well.  That's not my point.  I'm >philosophically< opposed to it
because of its complexity.

This complexity has led to some programs' poor TIFF writers making
some very bizarre files, other programs' inability to load TIFF
images (though they'll save them, of course), and a general
inability to interchange images between different environments
despite the fact they all think they understand TIFF.

As the saying goes, "It's not me I'm worried about- it's all the
>other<  assholes out there!"  I've had big trouble with misuse and
abuse of TIFF over the years, 

In [None]:
# Map the integer target of document 50 back to its category name.
doc_50_target = twenty_train.target[50]
print(twenty_train.target_names[doc_50_target])

comp.graphics


In [None]:
# Integer category ids of the first ten training documents.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2])

In [None]:
# Translate the first ten integer targets into category names, one per line.
first_ten_names = [
    twenty_train.target_names[target_id]
    for target_id in twenty_train.target[:10]
]
print("\n".join(first_ten_names))

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


Tokenizing text with scikit-learn

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
# Build the vocabulary from the training documents and encode them as a
# document-by-token count matrix in a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
# (number of documents, vocabulary size)
X_train_counts.shape

(2257, 35788)

In [None]:
# Print the stored (non-zero) count entries for the first document.
print(X_train_counts[0])

In [None]:
# Column index assigned to the token "algorithm" in the fitted vocabulary
# (None if the token was never seen during fitting).
count_vect.vocabulary_.get("algorithm")

4690

From occurrences to frequencies

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
# With use_idf=False no inverse-document-frequency reweighting is applied;
# the counts are only turned into normalized term frequencies.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [15]:
# Default TfidfTransformer settings; fit to the training counts and
# transform them in one call. The fitted transformer is reused later on
# new documents.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Same shape as the count matrix: the transform reweights, it does not add or drop columns.
X_train_tfidf.shape

(2257, 35788)

In [17]:
# Print the stored term-frequency entries for the first document.
print(X_train_tf[0])

  (0, 177)	0.15075567228888181
  (0, 230)	0.07537783614444091
  (0, 587)	0.07537783614444091
  (0, 2326)	0.15075567228888181
  (0, 3062)	0.07537783614444091
  (0, 3166)	0.07537783614444091
  (0, 4017)	0.15075567228888181
  (0, 4378)	0.07537783614444091
  (0, 4808)	0.07537783614444091
  (0, 5195)	0.07537783614444091
  (0, 5201)	0.07537783614444091
  (0, 5285)	0.07537783614444091
  (0, 8696)	0.30151134457776363
  (0, 9031)	0.22613350843332272
  (0, 9338)	0.07537783614444091
  (0, 9801)	0.07537783614444091
  (0, 9805)	0.15075567228888181
  (0, 9932)	0.07537783614444091
  (0, 12014)	0.07537783614444091
  (0, 12051)	0.07537783614444091
  (0, 12541)	0.07537783614444091
  (0, 12833)	0.15075567228888181
  (0, 14085)	0.07537783614444091
  (0, 14281)	0.15075567228888181
  (0, 14676)	0.07537783614444091
  :	:
  (0, 24677)	0.07537783614444091
  (0, 25337)	0.07537783614444091
  (0, 25361)	0.07537783614444091
  (0, 25663)	0.07537783614444091
  (0, 26175)	0.07537783614444091
  (0, 27836)	0.0753778361

Training a classifier

In [18]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the tf-idf features.
# fit() returns the estimator itself, so `clf` is the fitted model.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

To predict the outcome on a new document, we need to extract its features using almost the same feature extraction chain as before. The difference is that we call `transform` instead of `fit_transform` on the transformers, since they have already been fit to the training set:

In [19]:
docs_new = [
    'God is love',
    'OpenGL on the GPU is fast',
    'Please email any response',
    'philosophical significance of 42',
]

# Reuse the already-fitted vectorizer and tf-idf transformer: transform only,
# never re-fit on the new documents.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

# Report each document next to the name of its predicted category.
for doc, category in zip(docs_new, predicted):
    category_name = twenty_train.target_names[category]
    print('%r => %s' % (doc, category_name))

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics
'Please email any response' => comp.graphics
'philosophical significance of 42' => comp.graphics


Building a pipeline

In [20]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> tf-idf -> classifier so the whole model is fitted and
# applied through a single object. The step names are the prefixes used to
# address step parameters (e.g. 'clf__alpha').
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [21]:
# Fit every pipeline step on the raw training documents and their targets.
text_clf.fit(twenty_train.data, twenty_train.target)

Evaluation of the performance on the test set

In [None]:
import numpy as np
# Load the held-out test split with the same category filter and seed.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# The pipeline takes raw text; it vectorizes and transforms internally.
predicted = text_clf.predict(docs_test)

# Accuracy: the mean of the element-wise "prediction == truth" booleans.
np.mean(predicted == twenty_test.target)

0.8348868175765646

In [None]:
from sklearn.linear_model import SGDClassifier
# Rebuild the pipeline with a linear model trained by SGD (hinge loss, L2
# penalty) in place of naive Bayes. This rebinds `text_clf`.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf',
            SGDClassifier(
                loss='hinge',
                penalty='l2',
                alpha=1e-3,
                random_state=42,
                # tol=None turns off the stopping tolerance, so training
                # runs for exactly max_iter passes.
                max_iter=5,
                tol=None,
            ),
        ),
    ]
)

text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

# Accuracy on the test documents.
np.mean(predicted == twenty_test.target)

0.9101198402130493

In [None]:
from sklearn import metrics
# Per-category precision / recall / F1 for the current predictions, with rows
# labelled by category name instead of integer id.
report = metrics.classification_report(
    twenty_test.target,
    predicted,
    target_names=twenty_test.target_names,
)
print(report)




                        precision    recall  f1-score   support

           alt.atheism       0.95      0.80      0.87       319
         comp.graphics       0.87      0.98      0.92       389
               sci.med       0.94      0.89      0.91       396
soc.religion.christian       0.90      0.95      0.93       398

              accuracy                           0.91      1502
             macro avg       0.91      0.91      0.91      1502
          weighted avg       0.91      0.91      0.91      1502



In [None]:
# Confusion matrix: rows are true categories, columns are predicted categories.
metrics.confusion_matrix(twenty_test.target, predicted)

array([[256,  11,  16,  36],
       [  4, 380,   3,   2],
       [  5,  35, 353,   3],
       [  5,  11,   4, 378]])

Parameter tuning using grid search

In [None]:
from sklearn.model_selection import GridSearchCV
# Search grid. Each key is "<pipeline step name>__<parameter name>" and each
# value lists the candidate settings to try for that parameter.
parameters = {
    # unigrams only vs. unigrams plus bigrams
    "vect__ngram_range": [(1, 1), (1, 2)],
    # with or without inverse-document-frequency reweighting
    "tfidf__use_idf": (True, False),
    # regularization strength of the classifier step
    "clf__alpha": (1e-2, 1e-3),
}

In [None]:
# Exhaustive search over the grid with 5-fold cross-validation;
# n_jobs=-1 lets the search run in parallel on all available cores.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
)

In [None]:
# Run the search on only the first 400 training documents and their targets.
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [None]:
# The fitted search object can predict directly; this yields the integer category id.
gs_clf.predict(['God is love'])[0]

3

In [None]:
# Same prediction, mapped from integer id to category name.
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

The object’s `best_score_` and `best_params_` attributes store the best mean score and the parameter setting corresponding to that score:

In [None]:
# A bare expression is only echoed when it is the last statement of a cell,
# so the best score has to be printed explicitly to show up in the output.
print("best score: %.3f" % gs_clf.best_score_)

# Parameter setting that achieved that score, listed in sorted key order.
for param_name in sorted(parameters):
    print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))

clf__alpha: 0.001
tfidf__use_idf: True
vect__ngram_range: (1, 1)
