In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Tiny toy corpus: three short text messages, one string per document.
simple_train = [
    'call you tonight',
    'Call me a cab',
    'please call me.. please',
]

In [3]:
# Instantiate a CountVectorizer, which encodes each document as raw token counts.
# NOTE: TfidfVectorizer is not inherently "better" and does not add features:
# per the scikit-learn docs it is CountVectorizer followed by TfidfTransformer,
# so both learn the same vocabulary; TF-IDF only re-weights the counts.
vect = CountVectorizer()

<h3>Learn the 'vocabulary' of the training data (occurs in-place)</h3>

In [4]:
# Learn the vocabulary from the training documents. fit() updates `vect` in place;
# the echoed value below is the fitted vectorizer's own repr.
vect.fit(simple_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [5]:
# Re-display the raw training documents for comparison with the vocabulary shown next.
simple_train

['call you tonight', 'Call me a cab', 'please call me.. please']

In [6]:
# Examine the fitted vocabulary (the unique tokens learned by fit()).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns an ndarray, so wrap it
# in list() to display a plain Python list.
list(vect.get_feature_names_out())

['cab', 'call', 'me', 'please', 'tonight', 'you']

<h3>Transform training data into a 'document-term matrix'</h3>

In [7]:
# Encode every training document as a row of token counts over the fitted vocabulary.
simple_train_dtm = vect.transform(simple_train)
print(simple_train)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns an ndarray, so convert it to a list to keep the
# printed form a plain list.
print(list(vect.get_feature_names_out()))
# The document-term matrix is sparse; toarray() makes it dense for display.
simple_train_dtm.toarray()

['call you tonight', 'Call me a cab', 'please call me.. please']
['cab', 'call', 'me', 'please', 'tonight', 'you']


array([[0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]])

<h3>Refit on a larger corpus, then convert the sparse matrix to a dense matrix</h3>

In [8]:
# Rebuild the corpus with one extra document in front, then start over with a
# fresh vectorizer. Note this rebinds `simple_train`, `vect` and `simple_train_dtm`.
simple_train = [
    'call call please please',
    'call you tonight',
    'Call me a cab',
    'please call me.. please',
]
# fit() returns the vectorizer itself, so construction and fitting can be chained.
vect = CountVectorizer().fit(simple_train)
simple_train_dtm = vect.transform(simple_train)
simple_train

['call call please please',
 'call you tonight',
 'Call me a cab',
 'please call me.. please']

In [9]:
# Vocabulary learned from the refitted corpus.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) returns an ndarray, so wrap it in list() for a plain list.
list(vect.get_feature_names_out())

['cab', 'call', 'me', 'please', 'tonight', 'you']

In [17]:
# Show the document-term matrix as a dense array: one row per document,
# one column per vocabulary term.
simple_train_dtm.toarray()

array([[0, 2, 0, 2, 0, 0],
       [0, 1, 0, 0, 1, 1],
       [1, 1, 1, 0, 0, 0],
       [0, 1, 1, 2, 0, 0]])

<h3>Examine the vocabulary and document-term matrix together</h3>

In [18]:
# Label the dense document-term matrix with its vocabulary terms:
# pd.DataFrame(matrix, columns=columns).
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (scikit-learn >= 1.0) is its replacement and can be passed directly as column labels.
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,cab,call,me,please,tonight,you
0,0,2,0,2,0,0
1,0,1,0,0,1,1
2,1,1,1,0,0,0
3,0,1,1,2,0,0
