In [1]:
import pandas as pd
import numpy as np

In [28]:
# Toy corpus: four short sentences in `text`, each paired with a 0/1 value
# in `output`.
df = pd.DataFrame(
    dict(
        text=[
            'Bill watch movie',
            'Movie watch movie',
            'People write comment',
            'comment write people',
        ],
        output=[1, 1, 0, 0],
    )
)
df

Unnamed: 0,text,output
0,Bill watch movie,1
1,Movie watch movie,1
2,People write comment,0
3,comment write people,0


## Text Vectorization using Bag of Words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
# Bag-of-words vectorizer with all default settings (single-word features).
cv = CountVectorizer()

In [68]:
# Learn the vocabulary from the `text` column and build the document-term
# count matrix in a single step.
bow = cv.fit_transform(df['text'])

In [69]:
# Vocabulary learned by `cv`: maps each token to its column index in `bow`.
# NOTE(review): the saved output of this cell lists tokens such as 'this',
# 'is', 'good', 'place' and 'not', none of which occur in the `df` defined at
# the top of the notebook. It looks stale; re-run from a fresh kernel to confirm.

cv.vocabulary_

{'this': 6,
 'is': 2,
 'good': 1,
 'place': 5,
 'not': 3,
 'people': 4,
 'write': 7,
 'comment': 0}

In [70]:
# Show the dense count vectors of the first two documents, one per line.
for doc_idx in (0, 1):
    print(bow[doc_idx].toarray())

[[0 1 1 0 0 1 1 0]]
[[0 1 1 1 0 1 1 0]]


In [76]:
# Encode two unseen sentences with the already-fitted unigram vectorizer.
# In the saved output both produce the same vector, i.e. the extra
# 'Dont'/'dont' tokens leave no trace in the representation.
for sentence in (
    'Watch good people And Write Comment',
    'Dont Watch good people And dont Write Comment',
):
    print(cv.transform([sentence]).toarray())

[[1 1 0 0 1 0 0 1]]
[[1 1 0 0 1 0 0 1]]


- As we can see above, BoW with unigrams does not capture the meaning of the sentences well: both test sentences map to the same vector even though the second one is negated.
- To address this, we use BoW with n-grams, which can capture more of the semantic meaning of the sentences.

## Using N-grams

In [92]:
# Corpus for the n-gram experiments; the first two sentences differ only by
# the word 'not'. This rebinds `df`.
df = pd.DataFrame(
    dict(
        text=[
            'This is a good place',
            'This is not a good place',
            'People write comment',
            'Bill watch movie',
        ],
        output=[1, 1, 0, 0],
    )
)
df

Unnamed: 0,text,output
0,This is a good place,1
1,This is not a good place,1
2,People write comment,0
3,Bill watch movie,0


In [93]:
# BOW with bi-grams
# ngram_range=(2,2) sets both the minimum and maximum n to 2, so the
# features are two-word sequences only (see the printed vocabulary).

cvbi = CountVectorizer(ngram_range=(2,2))

In [94]:
# Fit the bi-gram vectorizer on the corpus. Note that this rebinds `bow`,
# replacing the count matrix produced by the earlier vectorizer.
bow = cvbi.fit_transform(df['text'])

In [95]:
# Bi-gram -> column-index mapping learned by `cvbi`.
print(cvbi.vocabulary_)

{'this is': 6, 'is good': 2, 'good place': 1, 'is not': 3, 'not good': 4, 'people write': 5, 'write comment': 8, 'bill watch': 0, 'watch movie': 7}


In [96]:
# Dense bi-gram count vectors of the first two documents, one per line.
for doc_idx in (0, 1):
    print(bow[doc_idx].toarray())

[[0 1 1 0 0 0 1 0 0]]
[[0 1 0 1 1 0 1 0 0]]


In [99]:
# Encode two new sentences with the fitted bi-gram vectorizer and print
# their dense count vectors.
for sentence in ('people write comment', 'people watch movie'):
    print(cvbi.transform([sentence]).toarray())

[[0 0 0 0 0 1 0 0 1]]
[[0 0 0 0 0 0 0 1 0]]


In [100]:
# BOW using tri-grams
# ngram_range=(3,3) sets both the minimum and maximum n to 3, so the
# features are three-word sequences only (see the printed vocabulary).

cvtri = CountVectorizer(ngram_range=(3,3))

In [101]:
# Fit the tri-gram vectorizer on the corpus. Note that this rebinds `bow`
# again, replacing the bi-gram count matrix.
bow = cvtri.fit_transform(df['text'])

In [102]:
# Tri-gram -> column-index mapping learned by `cvtri`.
print(cvtri.vocabulary_)

{'this is good': 5, 'is good place': 1, 'this is not': 6, 'is not good': 2, 'not good place': 3, 'people write comment': 4, 'bill watch movie': 0}


In [106]:
# Dense tri-gram count vectors of the first two documents, one per line.
for doc_idx in (0, 1):
    print(bow[doc_idx].toarray())

[[0 1 0 0 0 1 0]]
[[0 0 1 1 0 0 1]]


In [114]:
# Encode a positive and a negated sentence with the fitted tri-gram
# vectorizer. In the saved output the two vectors differ: the negated
# sentence activates the 'not' tri-grams instead of the plain ones.
for sentence in (
    'This is a good place and Bill watch movie',
    'This is not a good place and Bill dont watch movie',
):
    print(cvtri.transform([sentence]).toarray())

[[1 1 0 0 0 1 0]]
[[0 0 1 1 0 0 1]]


- N-grams are able to capture more of the semantic meaning of the sentences: the positive and the negated sentence above now get different vectors.

## Using TF-IDF

In [115]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [116]:
# TF-IDF vectorizer with all default settings.
tfidf = TfidfVectorizer()

In [118]:
# Fit on the corpus and display the TF-IDF matrix as a dense array
# (one row per document, one column per vocabulary term).
tfidf.fit_transform(df['text']).toarray()

array([[0.        , 0.        , 0.5       , 0.5       , 0.        ,
        0.        , 0.        , 0.5       , 0.5       , 0.        ,
        0.        ],
       [0.        , 0.        , 0.4222466 , 0.4222466 , 0.        ,
        0.53556627, 0.        , 0.4222466 , 0.4222466 , 0.        ,
        0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027],
       [0.57735027, 0.        , 0.        , 0.        , 0.57735027,
        0.        , 0.        , 0.        , 0.        , 0.57735027,
        0.        ]])

In [122]:
# Feature names in column order, followed by the per-feature IDF weights.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() converts it to plain Python strings and the printed list keeps
# the same format as before.
print(tfidf.get_feature_names_out().tolist())
# idf_ holds one weight per feature, in the same order as the names above;
# in the saved output, terms shared by two documents get the lower weight.
print(tfidf.idf_)

['bill', 'comment', 'good', 'is', 'movie', 'not', 'people', 'place', 'this', 'watch', 'write']
[1.91629073 1.91629073 1.51082562 1.51082562 1.91629073 1.91629073
 1.91629073 1.51082562 1.51082562 1.91629073 1.91629073]


