## Bag Of Words

In [20]:
import numpy as np
import pandas as pd

In [21]:
# Toy corpus: four short documents, each paired with a binary label.
df = pd.DataFrame(
    [
        ('people watch campusx', 1),
        ('campusx watch campusx', 1),
        ('people write comment', 0),
        ('campusx write comment', 0),
    ],
    columns=['text', 'output'],
)

In [22]:
# Display the toy corpus: four short documents with a binary label each.
df

Unnamed: 0,text,output
0,people watch campusx,1
1,campusx watch campusx,1
2,people write comment,0
3,campusx write comment,0


In [23]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer with default settings; the vocabulary it learns
# from this corpus (printed in a later cell) consists of single words.
cv = CountVectorizer()

In [24]:
# fit_transform does two things in one call:
#   1. fit: learn the vocabulary from the 'text' column
#   2. transform: convert each sentence into a vector of word counts
bow = cv.fit_transform(df['text'])

In [25]:
# Learned vocabulary: maps each word to its column index in the count matrix.
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'campusx': 0, 'write': 4, 'comment': 1}


In [26]:
# Show the count vectors of the first three documents, one per line.
for doc_idx in range(3):
    print(bow[doc_idx].toarray())

[[1 0 1 1 0]]
[[2 0 0 1 0]]
[[0 1 1 0 1]]


In [27]:
# Vectorize an unseen sentence with the already-fitted vocabulary.
# "Campus" (not "campusx") and "and" are not in the vocabulary, so they
# contribute nothing: the campusx column (index 0) stays 0 in the output.
cv.transform(["Campus watch and write comment"]).toarray()

array([[0, 1, 0, 1, 1]])

In [28]:
# Here "campusx" occurs twice, so its column (index 0) counts 2; the
# words "and" and "of" are not in the vocabulary and are ignored.
cv.transform(["campusx watch and write comment of campusx"]).toarray()

array([[2, 1, 0, 1, 1]])

For more information on how to use `CountVectorizer` for Bag of Words, see the scikit-learn documentation: https://scikit-learn.org/0.15/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

## N-grams

In [36]:
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2, 2): use only bigrams (pairs of adjacent words) as features.
# NOTE: this rebinds `cv`, replacing the single-word vectorizer created earlier.
cv = CountVectorizer(ngram_range=(2, 2))

In [37]:
# Learn the bigram vocabulary and build the bigram count matrix
# (this rebinds `bow`, replacing the single-word count matrix).
bow = cv.fit_transform(df['text'])

In [38]:
# Number of distinct bigrams found in the corpus.
print(len(cv.vocabulary_))

6


In [39]:
# Each bigram mapped to its column index in the count matrix.
print(cv.vocabulary_)

{'people watch': 2, 'watch campusx': 4, 'campusx watch': 0, 'people write': 3, 'write comment': 5, 'campusx write': 1}


In [40]:
# Show the bigram count vectors of the first three documents, one per line.
for doc_idx in range(3):
    print(bow[doc_idx].toarray())

[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]
[[0 0 0 1 0 1]]


In [41]:
# Only the bigram "write comment" (index 5) is in the vocabulary; pairs such as
# "campus watch", "watch and" and "and write" were never seen during fitting.
cv.transform(["Campus watch and write comment"]).toarray()

array([[0, 0, 0, 0, 0, 1]])

In [42]:
# Two known bigrams match here: "campusx watch" (index 0) and
# "write comment" (index 5); every other pair is out of vocabulary.
cv.transform(["campusx watch and write comment of campusx"]).toarray()

array([[1, 0, 0, 0, 0, 1]])

Likewise, we can set `ngram_range=(3, 3)` to group three consecutive words together (trigrams only), or `ngram_range=(1, 3)` to include single words, bigrams, and trigrams all at once, which makes the vocabulary larger. We can check the vocabulary size with `len(cv.vocabulary_)`.

## Tf-Idf

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
# Fit a TF-IDF vectorizer on the corpus and show the dense matrix:
# one row per document, one column per vocabulary word.
# Each row of the output has unit Euclidean length (e.g. the third row is
# three equal values of 0.57735027 = 1/sqrt(3)).
tfidf = TfidfVectorizer()
tfidf.fit_transform(df['text']).toarray()

array([[0.49681612, 0.        , 0.61366674, 0.61366674, 0.        ],
       [0.8508161 , 0.        , 0.        , 0.52546357, 0.        ],
       [0.        , 0.57735027, 0.57735027, 0.        , 0.57735027],
       [0.49681612, 0.61366674, 0.        , 0.        , 0.61366674]])

In [45]:
# Inverse document frequency per vocabulary word. The first value is the
# lowest (1.223) because "campusx" occurs in three of the four documents;
# every other word occurs in two documents and gets 1.511.
# These values match ln((1 + n_docs) / (1 + doc_freq)) + 1, scikit-learn's smoothed IDF.
print(tfidf.idf_)

[1.22314355 1.51082562 1.51082562 1.51082562 1.51082562]


In [47]:
# Vocabulary words in column order, matching the columns of the TF-IDF matrix and idf_.
print(tfidf.get_feature_names_out())

['campusx' 'comment' 'people' 'watch' 'write']
