In [47]:
# Toy corpus: three short documents, one string per document.
text = [
    "This is a line",
    "This is another line",
    "Completely different line",
]

In [48]:
from sklearn.feature_extraction.text import CountVectorizer

In [49]:
# Bag-of-words with default settings: learn the vocabulary from `text` and
# count each term per document in a single call.
cv = CountVectorizer()
sparse_matrix = cv.fit_transform(text)  # sparse result: one row per document, one column per term

In [50]:
# Expand the sparse counts to a dense matrix so every entry is visible.
# Column j corresponds to the term whose index in cv.vocabulary_ is j.
sparse_matrix.todense()

matrix([[0, 0, 0, 1, 1, 1],
        [1, 0, 0, 1, 1, 1],
        [0, 1, 1, 0, 1, 0]])

In [53]:
# Mapping of term -> column index in the count matrix. Terms are lower-cased,
# and the one-letter word 'a' is absent (the default token pattern only keeps
# tokens of two or more characters).
cv.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}

## Adding **stop words**

In [62]:
import pandas as pd

In [54]:
# Rebinds `cv` to a new vectorizer that filters out scikit-learn's built-in
# English stop-word list before counting.
cv = CountVectorizer(stop_words='english')

In [58]:
# Learn the stop-word-filtered vocabulary from `text` and count terms per document.
res = cv.fit_transform(text)

In [79]:
# Dense view of the filtered counts; only three term columns remain once the
# stop words are removed.
res.todense()

matrix([[0, 0, 1],
        [0, 0, 1],
        [1, 1, 1]])

In [73]:
# Term -> column index for the stop-word-filtered vectorizer. The dict is not
# ordered by column index.
cv.vocabulary_

{'line': 2, 'completely': 0, 'different': 1}

In [76]:
# Re-order the vocabulary by column index so that iterating the dict yields
# the terms in the same left-to-right order as the matrix columns.
ordered_words = dict(sorted(cv.vocabulary_.items(), key=lambda item: item[1]))
ordered_words

{'completely': 0, 'different': 1, 'line': 2}

In [78]:
# Tabulate the stop-word-filtered counts: one row per document, one column per term.
# - toarray() returns a plain ndarray, whereas todense() returns the discouraged
#   np.matrix type.
# - get_feature_names_out() returns the terms already ordered by column index
#   (the same order as sorting cv.vocabulary_ by value), so the labels always
#   line up with the matrix columns.
pd.DataFrame(data=res.toarray(), columns=cv.get_feature_names_out())

Unnamed: 0,completely,different,line
0,0,0,1
1,0,0,1
2,1,1,1


## **TF-IDF Transformer** 
### **(Term Frequency - Inverse Document Frequency)** 

In [23]:
from sklearn.feature_extraction.text import TfidfTransformer

In [82]:
# Same three-document toy corpus as in the CountVectorizer section, repeated
# here so the TF-IDF section can be read on its own.
text = [
    "This is a line",
    "This is another line",
    "Completely different line",
]

In [84]:
# Rebinds `cv` to a default vectorizer (no stop-word filtering), replacing the
# stop-word vectorizer from the previous section.
cv = CountVectorizer()

In [85]:
# Raw term counts (bag of words); these are the input to the TF-IDF transformer.
sparse_matrix = cv.fit_transform(text)
sparse_matrix  # display the sparse repr: dtype, number of stored elements, shape

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 10 stored elements and shape (3, 6)>

In [86]:
# Transformer that re-weights an existing count matrix into TF-IDF scores.
tfidf = TfidfTransformer()

In [87]:
# Convert bag-of-words counts into TF-IDF (Term Frequency - Inverse Document
# Frequency) weights. Note: this rebinds `res`, replacing the stop-word count
# matrix assigned in the previous section.
res = tfidf.fit_transform(sparse_matrix)

In [88]:
# Dense view of the TF-IDF weights. 'line' (column 4) occurs in every document
# and therefore gets the smallest non-zero weight in each row.
res.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

## **TF-IDF Vectorizer**

In [95]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [96]:
# Vectorizer that goes from raw text straight to TF-IDF weights.
tv = TfidfVectorizer()

In [97]:
# Fit on the raw documents and produce the TF-IDF matrix in one call.
tv_res = tv.fit_transform(text)
tv_res  # display the sparse repr: dtype, number of stored elements, shape

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (3, 6)>

In [98]:
# Dense view of the TF-IDF weights; the values match the
# CountVectorizer + TfidfTransformer result shown earlier.
tv_res.todense()

matrix([[0.        , 0.        , 0.        , 0.61980538, 0.48133417,
         0.61980538],
        [0.63174505, 0.        , 0.        , 0.4804584 , 0.37311881,
         0.4804584 ],
        [0.        , 0.65249088, 0.65249088, 0.        , 0.38537163,
         0.        ]])

In [104]:
# Term -> column index learned by the TF-IDF vectorizer (not ordered by index).
tv.vocabulary_

{'this': 5, 'is': 3, 'line': 4, 'another': 0, 'completely': 1, 'different': 2}

In [102]:
# Re-order the vocabulary by column index so that iterating the dict yields
# the terms in the same left-to-right order as the TF-IDF matrix columns.
ordered_tv_words = dict(sorted(tv.vocabulary_.items(), key=lambda item: item[1]))
ordered_tv_words

{'another': 0, 'completely': 1, 'different': 2, 'is': 3, 'line': 4, 'this': 5}

In [103]:
# Tabulate the TF-IDF weights: one row per document, one column per term.
# - toarray() returns a plain ndarray, whereas todense() returns the discouraged
#   np.matrix type.
# - get_feature_names_out() returns the terms already ordered by column index
#   (the same order as sorting tv.vocabulary_ by value), so the labels always
#   line up with the matrix columns.
pd.DataFrame(data=tv_res.toarray(), columns=tv.get_feature_names_out())

Unnamed: 0,another,completely,different,is,line,this
0,0.0,0.0,0.0,0.619805,0.481334,0.619805
1,0.631745,0.0,0.0,0.480458,0.373119,0.480458
2,0.0,0.652491,0.652491,0.0,0.385372,0.0


In [None]:
# TfidfVectorizer combines CountVectorizer and TfidfTransformer into a single step, which is why its output matches the TfidfTransformer result above.