Bag of Words

In [21]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer # a tool which converts text to matrix of "token count"

In [22]:
# Toy corpus: six short, unrelated English sentences, one per line.
# Each line becomes one "document" for the vectorizers below.
sentences = """\
The cat jumped onto the windowsill to watch the rain.
I forgot my umbrella and got completely soaked.
She baked a chocolate cake for her friend's birthday.
The movie was better than I expected, honestly.
They went hiking despite the storm warnings.
He listens to jazz while working on his paintings.
""".splitlines()


In [23]:
# Bag-of-words vectorizer with default settings.
countvec = CountVectorizer()
# fit_transform does two things in one call:
#   * fit       -> learns the vocabulary (the set of unique tokens) from the corpus
#   * transform -> encodes every sentence as a row of token counts (sparse matrix)
countvec_fit = countvec.fit_transform(raw_documents=sentences)


In [24]:
# Printing the sparse matrix lists only its non-zero entries as "(row, column)  value":
# row = sentence index, column = vocabulary index, value = word count in that sentence.
print(countvec_fit)
# Dense view of the same matrix: one row per sentence, one column per unique word
# in the whole corpus (shape (6, 43) for this corpus).
# Each entry is the number of times the word occurs in the sentence -- a count, not
# just a 0/1 presence flag (e.g. "the" occurs 3 times in the first sentence).
print(countvec_fit.toarray())

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 46 stored elements and shape (6, 43)>
  Coords	Values
  (0, 32)	3
  (0, 5)	1
  (0, 20)	1
  (0, 25)	1
  (0, 41)	1
  (0, 34)	1
  (0, 38)	1
  (0, 27)	1
  (1, 11)	1
  (1, 23)	1
  (1, 35)	1
  (1, 0)	1
  (1, 13)	1
  (1, 7)	1
  (1, 29)	1
  (2, 28)	1
  (2, 1)	1
  (2, 6)	1
  (2, 4)	1
  (2, 10)	1
  (2, 15)	1
  (2, 12)	1
  (2, 3)	1
  (3, 32)	1
  (3, 22)	1
  (3, 37)	1
  (3, 2)	1
  (3, 31)	1
  (3, 9)	1
  (3, 18)	1
  (4, 32)	1
  (4, 33)	1
  (4, 39)	1
  (4, 16)	1
  (4, 8)	1
  (4, 30)	1
  (4, 36)	1
  (5, 34)	1
  (5, 14)	1
  (5, 21)	1
  (5, 19)	1
  (5, 40)	1
  (5, 42)	1
  (5, 24)	1
  (5, 17)	1
  (5, 26)	1
[[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 3 0 1 0
  0 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1
  0 0 0 0 0 0 0]
 [0 1 0 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
  0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0

In [25]:
# Wrap the dense count matrix in a labelled table:
#   * each row is one sentence (index 0-5)
#   * each column is one vocabulary token, named via get_feature_names_out()
bag_of_words = pd.DataFrame(
    data=countvec_fit.toarray(),
    columns=countvec.get_feature_names_out(),
)
print(bag_of_words)

   and  baked  better  birthday  cake  cat  chocolate  completely  despite  \
0    0      0       0         0     0    1          0           0        0   
1    1      0       0         0     0    0          0           1        0   
2    0      1       0         1     1    0          1           0        0   
3    0      0       1         0     0    0          0           0        0   
4    0      0       0         0     0    0          0           0        1   
5    0      0       0         0     0    0          0           0        0   

0         0  ...     0   1         0         0    0      1     0      0   
1         0  ...     0   0         1         0    0      0     0      0   
2         0  ...     0   0         0         0    0      0     0      0   
3         1  ...     0   0         0         0    1      0     0      0   
4         0  ...     1   0         0         1    0      0     1      0   
5         0  ...     0   1         0         0    0      0     0      1   

  

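TF-IDF: term frequency × inverse document frequency. TF = (occurrences of a word in a sentence) / (total number of words in that sentence); IDF down-weights words that appear in many sentences, so common words such as "the" count for less. Note: scikit-learn's `TfidfVectorizer` uses the raw count as TF, a smoothed IDF, and by default L2-normalizes each sentence's row.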

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
# TF-IDF vectorizer; the arguments below are the library defaults, written out
# so the weighting scheme is visible: smoothed IDF, rows scaled to unit L2 norm.
tfidvec = TfidfVectorizer(use_idf=True, smooth_idf=True, norm="l2")

In [33]:
# Learn the vocabulary and IDF weights, then encode every sentence as a row of TF-IDF weights.
tfidvec_fit = tfidvec.fit_transform(raw_documents=sentences)
# NOTE: this rebinds `bag_of_words`; the count-based table built earlier in the
# notebook is replaced by the TF-IDF table from here on.
bag_of_words = pd.DataFrame(
    data=tfidvec_fit.toarray(),
    columns=tfidvec.get_feature_names_out(),
)

In [34]:
# At this point `bag_of_words` holds the TF-IDF weights (rows = sentences, columns = tokens).
# print() shows a plain-text table; wide frames are wrapped and truncated with "...".
print(bag_of_words)

        and     baked    better  birthday      cake       cat  chocolate  \
0  0.000000  0.000000  0.000000  0.000000  0.000000  0.301702   0.000000   
1  0.377964  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
2  0.000000  0.353553  0.000000  0.353553  0.353553  0.000000   0.353553   
3  0.000000  0.000000  0.392858  0.000000  0.000000  0.000000   0.000000   
4  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   
5  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   0.000000   

   completely   despite  expected  ...      they        to  umbrella  \
0    0.000000  0.000000  0.000000  ...  0.000000  0.247400  0.000000   
1    0.377964  0.000000  0.000000  ...  0.000000  0.000000  0.377964   
2    0.000000  0.000000  0.000000  ...  0.000000  0.000000  0.000000   
3    0.000000  0.000000  0.392858  ...  0.000000  0.000000  0.000000   
4    0.000000  0.392858  0.000000  ...  0.392858  0.000000  0.000000   
5    0.000000  0.000000  0.000000  