In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Text Feature Extraction

### Bag of words

* Tokenizing: Splitting strings into tokens
* Counting: Count the number of occurrences of each token in a document
* Normalizing: Normalizing and weighting tokens

Each token is treated as a feature

Documents can be treated as a matrix s.t.:

* Each column is a feature
* Each row is a document

Vectorization: Process of turning text documents into numeric feature vectors.

Bag of Words: The process of treating documents as token counts and normalizing while ignoring relative position of token in document.

### Sparsity

Sparse representations, such as those in scipy.sparse, are used to store these matrices in memory. This is because most features are absent from any given document: only a small subset of the words in the English dictionary will appear in each one.

### Count Vectorizer

Implements both tokenization and counting

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus of four short documents; later cells reuse it.
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and build the document-term count matrix in one step.
# The trailing bare name displays the resulting sparse matrix summary.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
X

<4x9 sparse matrix of type '<type 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

You can access the feature names (the word tokens) using the vectorizer's `get_feature_names` method (renamed `get_feature_names_out` in scikit-learn 1.0 and later)

In [3]:
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), is available from 1.0 onwards. Prefer the new
# accessor and fall back to the legacy one so this cell runs on both.
# (The new accessor returns a NumPy array instead of a list.)
feature_names_fn = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
feature_names_fn()

[u'and',
 u'document',
 u'first',
 u'is',
 u'one',
 u'second',
 u'the',
 u'third',
 u'this']

You can get the matrix of documents and features by using the toarray() method

In [4]:
# Densify the sparse count matrix for display: one row per document,
# one column per vocabulary token.
X.toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 2, 1, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 1, 0],
       [0, 1, 1, 1, 0, 0, 1, 0, 1]])

You can get the column index for a feature name via the `vocabulary_` attribute

In [5]:
# vocabulary_ maps each token to its column index in the count matrix.
for token in ('this', 'is'):
    print(vectorizer.vocabulary_.get(token))

8
3


You can encode new documents using transform. However, words that were not seen during fitting will be ignored

In [6]:
# Encode documents the vectorizer was not fitted on; only tokens already in
# the fitted vocabulary contribute to the resulting rows.
for new_document in ('Something completely new', 'This is not completely new'):
    print(vectorizer.transform([new_document]).toarray())

[[0 0 0 0 0 0 0 0 0]]
[[0 0 0 1 0 0 0 0 1]]


It's important to note that word ordering doesn't matter in these bag-of-words encodings

In [7]:
# Same tokens in a different order (and with different punctuation):
# both documents encode to the same row.
for document in ('Is this the first document?', 'This is the first document'):
    print(vectorizer.transform([document]).toarray())

[[0 1 1 1 0 0 1 0 1]]
[[0 1 1 1 0 0 1 0 1]]


### N-grams

Represents groups of n ordered words

In [8]:
# ngram_range=(2, 2) restricts the features to two-word sequences (bigrams).
bigram_vectorizer = CountVectorizer(ngram_range=(2,2),min_df=1)
# Fit on the corpus and display the dense bigram count matrix.
bigram_vectorizer.fit_transform(corpus).toarray()

array([[0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]])

In [9]:
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), is available from 1.0 onwards. Prefer the new
# accessor and fall back to the legacy one so this cell runs on both.
feature_names_fn = (
    getattr(bigram_vectorizer, 'get_feature_names_out', None)
    or bigram_vectorizer.get_feature_names
)
feature_names_fn()

[u'and the',
 u'first document',
 u'is the',
 u'is this',
 u'second document',
 u'second second',
 u'the first',
 u'the second',
 u'the third',
 u'third one',
 u'this is',
 u'this the']

In [10]:
# Unigram vocabulary again, for comparison with the bigram features.
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), is available from 1.0 onwards. Prefer the new
# accessor and fall back to the legacy one so this cell runs on both.
feature_names_fn = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)
feature_names_fn()

[u'and',
 u'document',
 u'first',
 u'is',
 u'one',
 u'second',
 u'the',
 u'third',
 u'this']

You can do this with as many n-grams as you'd like. Just use the ngram_range parameter to put in the range of n-grams you'd like.

In [11]:
# ngram_range=(1, 2) extracts single words AND two-word sequences.
# The model gets its own name: rebinding `bigram_vectorizer` here would
# silently change what the earlier bigram-only cells show when they are
# re-run after this one.
unigram_bigram_vectorizer = CountVectorizer(ngram_range=(1,2),min_df=1)
print(unigram_bigram_vectorizer.fit_transform(corpus).toarray())

# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), is available from 1.0 onwards. Prefer the new
# accessor and fall back to the legacy one so this cell runs on both.
feature_names_fn = (
    getattr(unigram_bigram_vectorizer, 'get_feature_names_out', None)
    or unigram_bigram_vectorizer.get_feature_names
)
feature_names_fn()

[[0 0 1 1 1 1 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0]
 [0 0 1 0 0 1 1 0 0 2 1 1 1 0 1 0 0 0 1 1 0]
 [1 1 0 0 0 0 0 0 1 0 0 0 1 0 0 1 1 1 0 0 0]
 [0 0 1 1 1 1 0 1 0 0 0 0 1 1 0 0 0 0 1 0 1]]


[u'and',
 u'and the',
 u'document',
 u'first',
 u'first document',
 u'is',
 u'is the',
 u'is this',
 u'one',
 u'second',
 u'second document',
 u'second second',
 u'the',
 u'the first',
 u'the second',
 u'the third',
 u'third',
 u'third one',
 u'this',
 u'this is',
 u'this the']

# TF-IDF

Allows you to weight words based on how frequently they occur in a given document and how frequently they occur across all documents.

This was popular in information retrieval, search engines. It has also become popular in text clustering and classification.

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
# The transformer is applied to count matrices in the cells below.
# Displaying the instance shows the parameters it was created with.
transformer = TfidfTransformer()
transformer

TfidfTransformer(norm=u'l2', smooth_idf=True, sublinear_tf=False,
         use_idf=True)

In [13]:
# Hand-written term counts: 6 documents (rows) by 3 terms (columns).
counts = [[3, 0, 1],
          [2, 0, 0],
          [3, 0, 0],
          [4, 0, 0],
          [3, 2, 0],
          [3, 0, 2]]

# Fit the transformer on the counts and get back the re-weighted matrix
# (returned in sparse form; the bare name displays its summary).
tfidf = transformer.fit_transform(counts)
tfidf

<6x3 sparse matrix of type '<type 'numpy.float64'>'
	with 9 stored elements in Compressed Sparse Row format>

In [14]:
# Dense view of the TF-IDF weights computed from the hand-written counts.
tfidf.toarray()

array([[ 0.85151335,  0.        ,  0.52433293],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 1.        ,  0.        ,  0.        ],
       [ 0.55422893,  0.83236428,  0.        ],
       [ 0.63035731,  0.        ,  0.77630514]])

In [15]:
# Raw unigram counts for the corpus, as a dense array.
# NOTE(review): this rebinds `counts`, which previously held the hand-written
# list of lists; re-running the earlier TF-IDF cell after this one would use
# the corpus counts unless its own `counts` assignment is re-run first.
onegram_vectorizer = CountVectorizer(min_df=1)
counts = onegram_vectorizer.fit_transform(corpus).toarray()
print(counts)

[[0 1 1 1 0 0 1 0 1]
 [0 1 0 1 0 2 1 0 1]
 [1 0 0 0 1 0 1 1 0]
 [0 1 1 1 0 0 1 0 1]]


In [16]:
# Refit the same transformer, this time on the corpus counts; `tfidf` is
# rebound to the new sparse result.
tfidf = transformer.fit_transform(counts)
tfidf

<4x9 sparse matrix of type '<type 'numpy.float64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [17]:
# Dense view of the TF-IDF weights for the four corpus documents.
tfidf.toarray()

array([[ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
         0.        ,  0.35872874,  0.        ,  0.43877674],
       [ 0.        ,  0.27230147,  0.        ,  0.27230147,  0.        ,
         0.85322574,  0.22262429,  0.        ,  0.27230147],
       [ 0.55280532,  0.        ,  0.        ,  0.        ,  0.55280532,
         0.        ,  0.28847675,  0.55280532,  0.        ],
       [ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
         0.        ,  0.35872874,  0.        ,  0.43877674]])

We can combine all steps by using the TfidfVectorizer

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer does the counting and the TF-IDF weighting in one estimator.
# It gets its own name: rebinding `vectorizer` would replace the fitted
# CountVectorizer that the earlier cells rely on, so re-running those cells
# afterwards would no longer show raw counts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform(corpus).toarray()

array([[ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
         0.        ,  0.35872874,  0.        ,  0.43877674],
       [ 0.        ,  0.27230147,  0.        ,  0.27230147,  0.        ,
         0.85322574,  0.22262429,  0.        ,  0.27230147],
       [ 0.55280532,  0.        ,  0.        ,  0.        ,  0.55280532,
         0.        ,  0.28847675,  0.55280532,  0.        ],
       [ 0.        ,  0.43877674,  0.54197657,  0.43877674,  0.        ,
         0.        ,  0.35872874,  0.        ,  0.43877674]])

You can use the binary parameter to turn counts into binary (present/absent) features. This can be useful when working with short documents or with certain algorithms.

In [19]:
# Vectorize the same one-document corpus twice: first recording only whether
# each token is present (binary), then with the actual occurrence counts.
for use_binary in (True, False):
    onegram_vectorizer = CountVectorizer(binary=use_binary, min_df=1)
    counts = onegram_vectorizer.fit_transform(['one two two']).toarray()
    print(counts)

[[1 1]]
[[1 2]]


You can use CountVectorizer's parameter analyzer='char_wb' to do the counts on character n-grams (built only from text inside word boundaries) instead of on words.

In [20]:
# analyzer='char_wb' with ngram_range=(2, 2): features are two-character
# sequences taken from within each word (word edges are padded with a space,
# as the feature names in the next cell show).
ngram_vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 2), min_df=1)
# Two words that differ in a single letter.
# NOTE(review): rebinds `counts`, this time to a sparse matrix.
counts = ngram_vectorizer.fit_transform(['words', 'wprds'])
counts.toarray()

array([[1, 1, 1, 0, 1, 1, 1, 0],
       [1, 1, 0, 1, 1, 1, 0, 1]])

In [21]:
# get_feature_names() was removed in scikit-learn 1.2; its replacement,
# get_feature_names_out(), is available from 1.0 onwards. Prefer the new
# accessor and fall back to the legacy one so this cell runs on both.
feature_names_fn = (
    getattr(ngram_vectorizer, 'get_feature_names_out', None)
    or ngram_vectorizer.get_feature_names
)
feature_names_fn()

[u' w', u'ds', u'or', u'pr', u'rd', u's ', u'wo', u'wp']

# Yelp Reviews

#### Grab only 5 and 1 star reviews

#### Split into training and test set

#### Create a count vectorizer

#### Fit a Naive Bayes Classifier

#### How well did it do?

0.91878669275929548

#### What did the confusion matrix look like?

#### Redo with stop words

#### Try again with different min_df

#### Let's do it with TF-IDF

#### Fit Naive Bayes

#### Pipeline Logistic Regression

0.92172211350293543

#### What does KNN do?