## DictVectorizer

In [5]:
# class DictVectorizer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin)
#  |  Transforms lists of feature-value mappings to vectors.
#  |  
#  |  This transformer turns lists of mappings (dict-like objects) of feature
#  |  names to feature values into Numpy arrays or scipy.sparse matrices for use
#  |  with scikit-learn estimators.

In [21]:
from sklearn.feature_extraction import DictVectorizer

# Each record maps feature names to values, mixing numeric fields
# (price, rooms) with a string-valued categorical field (neighborhood).
data = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'},
]

# sparse=False makes the vectorizer return a dense NumPy array instead of a
# scipy.sparse matrix; dtype=int stores every feature value as an integer.
vec = DictVectorizer(sparse=False, dtype=int)

# Turn the list of dicts into a numeric matrix: each distinct neighborhood
# gets its own 0/1 indicator column, while price and rooms keep their values.
vec.fit_transform(data)

array([[     0,      1,      0, 850000,      4],
       [     1,      0,      0, 700000,      3],
       [     0,      0,      1, 650000,      3],
       [     1,      0,      0, 600000,      2]])

## CountVectorizer

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents with an overlapping vocabulary.
sample = [
    'good of good this is good',
    'evil of queen problem',
    'horizon problem problem of',
]

# Learn the vocabulary from the corpus and count how often each word occurs
# in each document. The result is a sparse matrix with one row per document.
vec = CountVectorizer()
X = vec.fit_transform(sample)
X

<3x8 sparse matrix of type '<type 'numpy.int64'>'
	with 11 stored elements in Compressed Sparse Row format>

In [13]:
import pandas as pd

# `vec` and `X` come from the CountVectorizer cell: the fitted vectorizer and
# the sparse document-term count matrix it produced.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method only on
# installations that predate it, so the cell runs on both.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)

# Densify the sparse matrix and label each column with its vocabulary word.
pd.DataFrame(X.toarray(), columns=feature_names)


Unnamed: 0,evil,good,horizon,is,of,problem,queen,this
0,0,3,0,1,1,0,0,1
1,1,0,0,0,1,1,1,0
2,0,0,1,0,1,2,0,0


## TF-IDF
Convert a collection of raw documents to a matrix of TF-IDF features.
  
Equivalent to CountVectorizer followed by TfidfTransformer.

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reuses `sample` (the three-document corpus) and `pd` from the earlier cells.
# Fit a TF-IDF model on the corpus and compute the weighted document-term matrix.
vec = TfidfVectorizer()
X = vec.fit_transform(sample)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back to the old method only on
# installations that predate it, so the cell runs on both.
feature_names = (
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)

# Densify the sparse matrix and label each column with its vocabulary word.
pd.DataFrame(X.toarray(), columns=feature_names)

Unnamed: 0,evil,good,horizon,is,of,problem,queen,this
0,0.0,0.890524,0.0,0.296841,0.175319,0.0,0.0,0.296841
1,0.584483,0.0,0.0,0.0,0.345205,0.444514,0.584483,0.0
2,0.0,0.0,0.522535,0.0,0.308618,0.794803,0.0,0.0


In [25]:
# Print the class documentation for TfidfVectorizer (summary and parameter
# descriptions) as a reference for the options used above.
help(TfidfVectorizer)

Help on class TfidfVectorizer in module sklearn.feature_extraction.text:

class TfidfVectorizer(CountVectorizer)
 |  Convert a collection of raw documents to a matrix of TF-IDF features.
 |  
 |  Equivalent to CountVectorizer followed by TfidfTransformer.
 |  
 |  Read more in the :ref:`User Guide <text_feature_extraction>`.
 |  
 |  Parameters
 |  ----------
 |  input : string {'filename', 'file', 'content'}
 |      If 'filename', the sequence passed as an argument to fit is
 |      expected to be a list of filenames that need reading to fetch
 |      the raw content to analyze.
 |  
 |      If 'file', the sequence items must have a 'read' method (file-like
 |      object) that is called to fetch the bytes in memory.
 |  
 |      Otherwise the input is expected to be the sequence strings or
 |      bytes items are expected to be analyzed directly.
 |  
 |  encoding : string, 'utf-8' by default.
 |      If bytes or files are given to analyze, this encoding is used to
 |      decode.
 |