Goal is to use scikit-learn tools to analyze a collection of text documents on twenty different topics

    1.load the file contents and categories
    2.extract feature vectors suitable for machine learning
    3.train a linear model to perform categorization
    4.use a grid search strategy to find a good configuration of both the feature extraction components and the classifier

In [1]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [3]:
from sklearn.datasets import fetch_20newsgroups

In [4]:
twenty_train = fetch_20newsgroups(subset='train', 
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [5]:
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [6]:
len(twenty_train.data)

2257

In [7]:
len(twenty_train.target)

2257

In [8]:
len(twenty_train.filenames)

2257

In [9]:
print("\n".join(twenty_train.data[0].split("\n")[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [10]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [11]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [12]:
for t in twenty_train.target[:10]:
    print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


Extracting Features from Text Files

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
count_vect =CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

In [15]:
X_train_counts.shape

(2257, 35788)

In [16]:
count_vect.vocabulary_.get(u'algorithm')

4690

In [17]:
from sklearn.feature_extraction.text import TfidfTransformer

In [18]:
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(2257, 35788)

In [19]:
tfidf_transformer = TfidfTransformer()
X_train_tfitf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfitf.shape

(2257, 35788)

Training a Classifier

In [20]:
from sklearn.naive_bayes import MultinomialNB

In [21]:
clf = MultinomialNB().fit(X_train_tfitf, twenty_train.target)

In [22]:
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, twenty_train.target_names[category]))
    

'God is love' => soc.religion.christian
'OpenGL on the GPU is fast' => comp.graphics


Building a Pipeline

In [23]:
from sklearn.pipeline import Pipeline

In [24]:
text_clf  = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])

In [25]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

Evaluation of the performance on the test set

In [26]:
import numpy as np

In [27]:
twenty_train = fetch_20newsgroups(subset='test',
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)

In [28]:
docs_test = twenty_train.data

In [29]:
predicted = text_clf.predict(docs_test)

In [30]:
predicted

array([2, 2, 3, ..., 2, 2, 1], dtype=int64)

In [31]:
np.mean(predicted == twenty_train.target)

0.83488681757656458

In [34]:
from sklearn.linear_model import SGDClassifier

In [37]:
text_clf = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge',
                                          alpha=1e-3, random_state=42,
                                          max_iter=5, tol=None))
                    ])

In [38]:
text_clf.fit(twenty_train.data, twenty_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [39]:
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_train.target)

0.99733688415446076

In [40]:
from sklearn import metrics

In [41]:
print(metrics.classification_report(twenty_train.target, predicted, target_names = twenty_train.target_names))

                        precision    recall  f1-score   support

           alt.atheism       1.00      0.99      1.00       319
         comp.graphics       1.00      1.00      1.00       389
               sci.med       1.00      1.00      1.00       396
soc.religion.christian       0.99      1.00      0.99       398

           avg / total       1.00      1.00      1.00      1502



In [42]:
metrics.confusion_matrix(twenty_train.target, predicted)

array([[316,   0,   0,   3],
       [  0, 389,   0,   0],
       [  0,   0, 396,   0],
       [  0,   1,   0, 397]], dtype=int64)

Parameter tuning using grid search

In [49]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'clf__alpha': (1e-2, 1e-3),
}

In [50]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)


In [51]:
gs_clf = gs_clf.fit(twenty_train.data[:400], twenty_train.target[:400])

In [52]:
twenty_train.target_names[gs_clf.predict(['God is love'])[0]]

'soc.religion.christian'

In [53]:
gs_clf.best_score_

0.90749999999999997

In [54]:
for param_name in sorted(parameters.keys()):
    print('%s : %r' % (param_name, gs_clf.best_params_[param_name]))

clf__alpha : 0.001
tfidf__use_idf : True
vect__ngram_range : (1, 1)


In [55]:
gs_clf.cv_results_



{'mean_fit_time': array([ 0.19760036,  0.77480133,  0.38480067,  0.7800014 ,  0.4732008 ,
         0.86840161,  0.45240092,  0.88400157]),
 'mean_score_time': array([ 0.05720011,  0.1560003 ,  0.05720011,  0.15080023,  0.13000035,
         0.18200024,  0.08840005,  0.14040017]),
 'mean_test_score': array([ 0.8575,  0.845 ,  0.7325,  0.755 ,  0.9075,  0.89  ,  0.8   ,
         0.805 ]),
 'mean_train_score': array([ 0.99624992,  1.        ,  0.94757391,  0.97748543,  1.        ,
         1.        ,  0.9925046 ,  1.        ]),
 'param_clf__alpha': masked_array(data = [0.01 0.01 0.01 0.01 0.001 0.001 0.001 0.001],
              mask = [False False False False False False False False],
        fill_value = ?),
 'param_tfidf__use_idf': masked_array(data = [True True False False True True False False],
              mask = [False False False False False False False False],
        fill_value = ?),
 'param_vect__ngram_range': masked_array(data = [(1, 1) (1, 2) (1, 1) (1, 2) (1, 1) (1, 2) (1, 