In [2]:
# Reference: scikit-learn user guide, "Text feature extraction":
# https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# Vectorizer that is fitted on `corpus` further down to produce per-document token counts.
vectorizer = CountVectorizer(min_df=1)

In [5]:
# Bare expression: the notebook displays the vectorizer's repr, listing its parameters.
vectorizer

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
# Four short example documents used as the toy corpus throughout the notebook.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

In [8]:
# Fit on the corpus and build the document-term count matrix in one step;
# X is converted to a dense array with .toarray() in a later cell for display.
X = vectorizer.fit_transform(corpus)

In [14]:
# Print the learned feature names (the column order of X), then display the dense
# count matrix: one row per document in corpus order, one column per feature.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the Python 2-only print statement is a SyntaxError on Python 3.
# NOTE(review): recent scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version before switching.
print(vectorizer.get_feature_names())
X.toarray()

[u'and', u'document', u'first', u'is', u'one', u'second', u'the', u'third', u'this']


array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

In [15]:
# The mapping from feature name to column index is stored in the vectorizer's
# vocabulary_ attribute; 'and' maps to column 0 here.
vectorizer.vocabulary_.get('and')

0

In [16]:
# Words that were not seen in the training corpus are completely ignored by later
# calls to transform(), so this unseen sentence yields an all-zero row.
vectorizer.transform(['something completely new']).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [17]:
# Vectorizer emitting both unigrams and bigrams (ngram_range=(1, 2)); the custom
# token_pattern replaces the default one shown in the CountVectorizer repr earlier.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    min_df=1,
)

In [18]:
# Check the analyzer on a sample string: the expected tokens are lowercased, the
# hyphenated word is split in two, and the unigrams are followed by the bigrams.
analyze = bigram_vectorizer.build_analyzer()
analyze('Bi-grams are cool!') == [
    'bi', 'grams', 'are', 'cool',
    'bi grams', 'grams are', 'are cool',
]

True

In [20]:
# The vocabulary extracted by this vectorizer is much bigger and can now resolve
# ambiguities encoded in local positioning patterns.
# X_2 is the dense unigram+bigram count matrix for the same corpus.
X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
X_2

array([[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0],
       [0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0],
       [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1]])

In [22]:
# Column index of the bigram "is this" in X_2.
# Subscript lookup is used instead of .get(): a missing key raises KeyError right
# away, whereas .get() would return None and X_2[:, None] would silently insert a
# new axis rather than select a column.
feature_index = bigram_vectorizer.vocabulary_['is this']
# Per-document counts of "is this"; only the last document (the question) contains it.
X_2[:, feature_index]

array([0, 0, 0, 1])

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

In [24]:
# Create a TfidfTransformer with no arguments and display its repr to show the
# parameter values it ends up with.
transformer = TfidfTransformer()
transformer

TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,
         use_idf=True)

In [25]:
# Toy count matrix: six rows by three columns. The first column is non-zero in
# every row, while the other two are non-zero in only one or two rows.
counts = [
    [3, 0, 1],
    [2, 0, 0],
    [3, 0, 0],
    [4, 0, 0],
    [3, 2, 0],
    [3, 0, 2],
]

In [27]:
# Fit the transformer on the raw counts and convert them to tf-idf weights;
# the displayed repr shows the result is a 6x3 sparse matrix.
tfidf = transformer.fit_transform(counts)
tfidf

<6x3 sparse matrix of type '<type 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [28]:
# Dense view of the tf-idf weights; each row has unit L2 norm, matching the
# norm='l2' setting shown in the transformer's repr.
tfidf.toarray()

array([[ 0.85151335,  0.        ,  0.52433293],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.55422893,  0.83236428,  0.        ],
       [ 0.63035731,  0.        ,  0.77630514]])

In [29]:
# Per-column idf weights learned during fit; the column that is non-zero in every
# row of `counts` gets the smallest weight (1.0).
transformer.idf_

array([ 1.        ,  2.25276297,  1.84729786])

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# TfidfVectorizer is applied directly to the raw text documents in the next cell.
# NOTE(review): this rebinds `vectorizer`, which earlier cells bound to a
# CountVectorizer; re-running those cells after this one would use the tf-idf
# vectorizer instead. Consider a distinct name such as `tfidf_vectorizer`.
vectorizer = TfidfVectorizer(min_df=1)

In [32]:
# Fit on the raw documents and return the tf-idf matrix directly; the displayed
# repr shows a sparse 4x9 float64 matrix (4 documents, 9 terms).
vectorizer.fit_transform(corpus)

<4x9 sparse matrix of type '<type 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>