# Some common terms to remember:
### 1. Corpus
### 2. Vocabulary
### 3. Document
### 4. Word

# Bag of words

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy corpus: four short documents in "text", each with a binary label in "output".
df = pd.DataFrame(
    {
        "text": [
            "people watch dswithbappy",
            "dswithbappy watch dswithbappy",
            "people write comment",
            "dswithbappy write comment",
        ],
        "output": [1, 1, 0, 0],
    }
)

# Last expression of the cell, so the frame is rendered as the cell output.
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [4]:
# CountVectorizer is scikit-learn's implementation of the bag-of-words technique:
# it learns a vocabulary and counts how often each term occurs in every document.
from sklearn.feature_extraction.text import CountVectorizer
# Default settings; with these, the vocabulary printed further down consists of single words.
cv = CountVectorizer()

In [5]:
# Learn the vocabulary from the "text" column and encode every document in one step.
# The result is a sparse matrix: one row per document, one column per vocabulary term.
bow = cv.fit_transform(df['text'])

In [6]:
# vocabulary_ maps each learned term to the index of its column in the count matrix.
print(cv.vocabulary_)

{'people': 2, 'watch': 3, 'dswithbappy': 1, 'write': 4, 'comment': 0}


In [8]:
# Print the "term: index" pairs ordered by column index (all on one line), so
# they line up left-to-right with the columns of the matrix shown below.
ordered_vocab = sorted(cv.vocabulary_.items(), key=lambda pair: pair[1])
print("".join(f"{term}: {column} " for term, column in ordered_vocab), end="")
# Dense view of the document-term counts: one row per document.
bow.toarray()

comment: 0 dswithbappy: 1 people: 2 watch: 3 write: 4 

array([[0, 1, 1, 1, 0],
       [0, 2, 0, 1, 0],
       [1, 0, 1, 0, 1],
       [1, 1, 0, 0, 1]])

In [9]:
# Displaying the sparse matrix itself shows only a summary (format, dtype,
# number of stored elements, shape), not the counts; use .toarray() to see the values.
bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 11 stored elements and shape (4, 5)>

In [10]:
# Dense count vector of each of the first three documents, one per line.
for doc_index in range(3):
    print(bow[doc_index].toarray())

[[0 1 1 1 0]]
[[0 2 0 1 0]]
[[1 0 1 0 1]]


In [11]:
# Encode a new, unseen document with the already-fitted vectorizer.
# Use transform() rather than fit_transform(): the vocabulary learned above is
# reused, so there is no need to fit again.
# "Bappy" is not in that vocabulary, so it contributes nothing to the vector.
cv.transform(['Bappy watch dswithbappy']).toarray()

array([[0, 1, 0, 1, 0]])

In [12]:
# Feature matrix: dense bag-of-words counts, one row per document.
X = bow.toarray()
# Target: the label stored in the "output" column.
# NOTE(review): X and y are not used in the visible part of the notebook —
# presumably intended as inputs for a downstream model; confirm or remove.
y = df['output']

# N-grams
Same as bag of words; the difference is that with the `ngram_range` parameter each token is a sequence of n consecutive words rather than a single word.

In [13]:
# Same four-document toy corpus, rebuilt here so the N-gram section starts
# from a known frame: documents in "text", binary labels in "output".
df = pd.DataFrame(
    {
        "text": [
            "people watch dswithbappy",
            "dswithbappy watch dswithbappy",
            "people write comment",
            "dswithbappy write comment",
        ],
        "output": [1, 1, 0, 0],
    }
)

# Render the frame as the cell output.
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [14]:
# Bigrams: ngram_range=(2, 2) sets both the minimum and the maximum n-gram
# size to 2, so every token is a pair of consecutive words.
# CountVectorizer was already imported in the bag-of-words section, so it is
# not imported again here. This rebinds `cv`, replacing the single-word vectorizer.
cv = CountVectorizer(ngram_range=(2, 2))

In [15]:
# Fit the bigram vectorizer on the corpus; `bow` is rebound and now holds
# bigram counts instead of single-word counts.
bow = cv.fit_transform(df['text'])

In [16]:
# Each key is now a two-word token; the value is its column index in `bow`.
print(cv.vocabulary_)

{'people watch': 2, 'watch dswithbappy': 4, 'dswithbappy watch': 0, 'people write': 3, 'write comment': 5, 'dswithbappy write': 1}


In [18]:
# "bigram: index" pairs ordered by column index, printed on a single line.
ordered_vocab = sorted(cv.vocabulary_.items(), key=lambda pair: pair[1])
print("".join(f"{ngram}: {column} " for ngram, column in ordered_vocab), end="")
# Ends the vocabulary line and leaves one blank line before the vectors.
print("\n")
# Dense bigram-count vector of each of the first three documents.
for doc_index in range(3):
    print(bow[doc_index].toarray())

dswithbappy watch: 0 dswithbappy write: 1 people watch: 2 people write: 3 watch dswithbappy: 4 write comment: 5 

[[0 0 1 0 1 0]]
[[1 0 0 0 1 0]]
[[0 0 0 1 0 1]]


In [19]:
# Trigrams: ngram_range=(3, 3) makes every token a run of three consecutive words.
# Each document in this corpus has exactly three words, so each one yields a
# single trigram.
# CountVectorizer was already imported in the bag-of-words section, so it is
# not imported again here. This rebinds `cv`, replacing the bigram vectorizer.
cv = CountVectorizer(ngram_range=(3, 3))

In [20]:
# Fit the trigram vectorizer on the corpus; `bow` is rebound and now holds
# trigram counts.
bow = cv.fit_transform(df['text'])

In [21]:
# Each key is now a three-word token; the value is its column index in `bow`.
print(cv.vocabulary_)

{'people watch dswithbappy': 2, 'dswithbappy watch dswithbappy': 0, 'people write comment': 3, 'dswithbappy write comment': 1}


In [23]:
# "trigram: index" pairs ordered by column index, printed on a single line.
ordered_vocab = sorted(cv.vocabulary_.items(), key=lambda pair: pair[1])
print("".join(f"{ngram}: {column} " for ngram, column in ordered_vocab), end="")
# Ends the vocabulary line and leaves one blank line before the vectors.
print("\n")
# Dense trigram-count vector of each of the first three documents.
for doc_index in range(3):
    print(bow[doc_index].toarray())

dswithbappy watch dswithbappy: 0 dswithbappy write comment: 1 people watch dswithbappy: 2 people write comment: 3 

[[0 0 1 0]]
[[1 0 0 0]]
[[0 0 0 1]]


# TF-IDF (Term frequency - Inverse document frequency)
### More frequent within a document, more weight; more common across documents, less weight
Still does not capture the semantic meaning of the text

In [24]:
# Same four-document toy corpus, rebuilt here so the TF-IDF section starts
# from a known frame: documents in "text", binary labels in "output".
df = pd.DataFrame(
    {
        "text": [
            "people watch dswithbappy",
            "dswithbappy watch dswithbappy",
            "people write comment",
            "dswithbappy write comment",
        ],
        "output": [1, 1, 0, 0],
    }
)

# Render the frame as the cell output.
df

Unnamed: 0,text,output
0,people watch dswithbappy,1
1,dswithbappy watch dswithbappy,1
2,people write comment,0
3,dswithbappy write comment,0


In [25]:
# TfidfVectorizer produces TF-IDF weights per term and document instead of raw counts.
from sklearn.feature_extraction.text import TfidfVectorizer
# Default settings.
# NOTE(review): `tfid` looks like a typo for `tfidf`; the name is kept because
# later cells reference it.
tfid= TfidfVectorizer()

In [26]:
# Fit TF-IDF on the corpus and convert the result to a dense array:
# one row per document, one column per vocabulary term.
arr = tfid.fit_transform(df['text']).toarray()

In [27]:
# Display the dense TF-IDF matrix; a 0 means the term does not occur in that document.
arr

array([[0.        , 0.49681612, 0.61366674, 0.61366674, 0.        ],
       [0.        , 0.8508161 , 0.        , 0.52546357, 0.        ],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027],
       [0.61366674, 0.49681612, 0.        , 0.        , 0.61366674]])

In [28]:
# idf_ holds one inverse-document-frequency weight per vocabulary term, in column order.
# "dswithbappy" (column 1) occurs in three of the four documents, so it gets the
# lowest weight; every other term occurs in two documents and shares a higher one.
# With scikit-learn's default smoothing: idf = ln((1 + n_docs) / (1 + doc_freq)) + 1.
print(tfid.idf_)

[1.51082562 1.22314355 1.51082562 1.51082562 1.51082562]
