In [76]:
from __future__ import print_function

import logging
import numpy as np
from optparse import OptionParser
import sys
from time import time
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.extmath import density
from sklearn import metrics

In [22]:
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [23]:
op = OptionParser()
op.add_option("--report",
              action="store_true", dest="print_report",
              help="Print a detailed classification report.")
op.add_option("--chi2_select",
              action="store", type="int", dest="select_chi2",
              help="Select some number of features using a chi-squared test")
op.add_option("--confusion_matrix",
              action="store_true", dest="print_cm",
              help="Print the confusion matrix.")
op.add_option("--top10",
              action="store_true", dest="print_top10",
              help="Print ten most discriminative terms per class"
                   " for every classifier.")
op.add_option("--all_categories",
              action="store_true", dest="all_categories",
              help="Whether to use all categories or not.")
op.add_option("--use_hashing",
              action="store_true",
              help="Use a hashing vectorizer.")
op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")
op.add_option("--filtered",
              action="store_true",
              help="Remove newsgroup information that is easily overfit: "
                   "headers, signatures, and quoting.")

<Option at 0x7fb53ffbfb70: --filtered>

In [4]:
def is_interactive():
    return not hasattr(sys.modules['__main__'], '__file__')

# work-around for Jupyter notebook and IPython console
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print(__doc__)
op.print_help()
print()

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --report              Print a detailed classification report.
  --chi2_select=SELECT_CHI2
                        Select some number of features using a chi-squared
                        test
  --confusion_matrix    Print the confusion matrix.
  --top10               Print ten most discriminative terms per class for
                        every classifier.
  --all_categories      Whether to use all categories or not.
  --use_hashing         Use a hashing vectorizer.
  --n_features=N_FEATURES
                        n_features when using the hashing vectorizer.
  --filtered            Remove newsgroup information that is easily overfit:
                        headers, signatures, and quoting.



In [5]:
if opts.all_categories:
    categories = None
else:
    categories = [
        'alt.atheism',
        'talk.religion.misc',
        'comp.graphics',
        'sci.space',
    ]

if opts.filtered:
    remove = ('headers', 'footers', 'quotes')
else:
    remove = ()


In [6]:
print("Loading 20 newsgroups dataset for categories:")
print(categories if categories else "all")

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']


In [24]:
categories

['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

In [26]:
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=remove)

In [44]:
data_train.target_names[0]

'alt.atheism'

In [8]:
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=remove)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data_train.data)
X_train_counts.shape

(2034, 34118)

In [64]:
count_vect.get_feature_names()

['00',
 '000',
 '0000',
 '00000',
 '000000',
 '000005102000',
 '000021',
 '000062david42',
 '0000vec',
 '0001',
 '000100255pixel',
 '000406',
 '00041032',
 '0004136',
 '0004246',
 '0004422',
 '00044513',
 '0004847546',
 '0005',
 '0007',
 '00090711',
 '000usd',
 '0010580b',
 '001125',
 '0012',
 '001200201pixel',
 '001428',
 '001555',
 '001718',
 '001757',
 '0018',
 '00196',
 '002',
 '0020',
 '0022',
 '0028',
 '0029',
 '0033',
 '0034',
 '003719',
 '0038',
 '0039',
 '0049',
 '006',
 '0065',
 '0094',
 '0098',
 '00index',
 '00pm',
 '01',
 '0100',
 '010116',
 '010326',
 '010329',
 '010734',
 '010821',
 '011033',
 '011255',
 '011605',
 '011634edt',
 '011720',
 '012536',
 '012537',
 '013034',
 '013423tan102',
 '013657',
 '013846',
 '014305',
 '014506',
 '01463',
 '0150',
 '015931',
 '01609',
 '01742',
 '01752',
 '01776',
 '0179',
 '01821',
 '01826',
 '0184',
 '01852',
 '01854',
 '01890',
 '018b',
 '0192',
 '0195',
 '0199',
 '01a',
 '01wb',
 '02',
 '020',
 '0200',
 '020021',
 '020259',
 '020359

In [72]:
np.sort(np.array(count_vect.get_feature_names())[np.nonzero(X_train_counts[0, :])[1]])

array(['0150', '21', '31', '3426', '3ds', '44', '650', '667', 'able',
       'about', 'ac', 'after', 'all', 'and', 'anyone', 'are', 'available',
       'be', 'but', 'carefully', 'cel', 'default', 'dept', 'did', 'does',
       'ed', 'edinburgh', 'email', 'environment', 'explicitly', 'fax',
       'festival', 'file', 'for', 'format', 'from', 'given', 'go', 'have',
       'hawkes', 'hi', 'if', 'in', 'information', 'is', 'it', 'know',
       'laboratory', 'like', 'lines', 'manual', 'mapping', 'model', 'not',
       'nothing', 'noticed', 'of', 'only', 'orientation', 'planes',
       'position', 'positioned', 'positions', 'preserved', 'prj',
       'psychology', 'read', 'reload', 'restarting', 'rule', 'rules',
       'rych', 'rycharde', 'said', 'save', 'saving', 'somewhere', 'stored',
       'subject', 'tel', 'texture', 'that', 'the', 'their', 'they', 'this',
       'to', 'uk', 'univ', 've', 'virtual', 'when', 'where', 'why', 'with',
       'you', 'your'],
      dtype='<U80')

- all words are rendered to lower case
- no stemmer used


In [70]:
data_train.data[0]



In [77]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(2034, 34118)

In [95]:
np.array(count_vect.get_feature_names())[np.argsort(X_train_tfidf[0, :])]

array(['00'],
      dtype='<U80')

In [93]:
X_train_tfidf[0, :].argsort()

AttributeError: argsort not found

In [90]:
X_train_tfidf[0, :]

<1x34118 sparse matrix of type '<class 'numpy.float64'>'
	with 97 stored elements in Compressed Sparse Row format>

In [87]:
np.argmax(X_train_tfidf[0, :])

2362

In [9]:
target_names = data_train.target_names

In [10]:
target_names

['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']

In [11]:
def size_mb(docs):
    return sum(len(s.encode('utf-8')) for s in docs) / 1e6

In [12]:
data_train_size_mb = size_mb(data_train.data)
data_test_size_mb = size_mb(data_test.data)

In [13]:
print("%d documents - %0.3fMB (training set)" % (
    len(data_train.data), data_train_size_mb))
print("%d documents - %0.3fMB (test set)" % (
    len(data_test.data), data_test_size_mb))
print("%d categories" % len(categories))
print()

2034 documents - 3.980MB (training set)
1353 documents - 2.867MB (test set)
4 categories



In [14]:
y_train, y_test = data_train.target, data_test.target

In [15]:
data_train.data[0]



In [16]:
print("Extracting features from the training data using a sparse vectorizer")
t0 = time()
if opts.use_hashing:
    vectorizer = HashingVectorizer(stop_words='english', alternate_sign=False,
                                   n_features=opts.n_features)
    X_train = vectorizer.transform(data_train.data)
else:
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    X_train = vectorizer.fit_transform(data_train.data)
duration = time() - t0

Extracting features from the training data using a sparse vectorizer


Tf-idf means simply term frequency/document frequency and counts 

In [17]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
X_train = vectorizer.fit_transform(data_train.data)

In [18]:
x_test = vectorizer.transform(data_test.data)

In [19]:
x_test.data

array([ 0.15767991,  0.17519565,  0.18043963, ...,  0.13922915,
        0.17731702,  0.09762484])

In [20]:
print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()
X_test = vectorizer.transform(data_test.data)
duration = time() - t0
print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
print("n_samples: %d, n_features: %d" % X_test.shape)
print()

done in 0.785387s at 5.067MB/s
n_samples: 2034, n_features: 33809

Extracting features from the test data using the same vectorizer
done in 0.392156s at 7.312MB/s
n_samples: 1353, n_features: 33809

