In [14]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd
import numpy as np

In [None]:
# Toy corpus for the vectorizer comparison: four short sentences with
# overlapping vocabulary, so some terms recur across documents and others do not.
sentences = [
    "Natural language processing is powerful",
    "Machine learning is powerful",
    "Natural language processing helps machines",
    "Machine learning uses data",
]

In [10]:
# Bag-of-words: learn the vocabulary from the corpus first, then encode every
# sentence as a sparse row of raw term counts.
bow_vectorizer = CountVectorizer().fit(sentences)
bow = bow_vectorizer.transform(sentences)

In [11]:
# Print a title line followed by the count matrix as a labelled table
# (one row per sentence, one column per vocabulary term).
print(
    "BoW Matrix:",
    pd.DataFrame(bow.toarray(), columns=bow_vectorizer.get_feature_names_out()),
    sep="\n",
)

BoW Matrix:
   data  helps  is  language  learning  machine  machines  natural  powerful  \
0     0      0   1         1         0        0         0        1         1   
1     0      0   1         0         1        1         0        0         1   
2     0      1   0         1         0        0         1        1         0   
3     1      0   0         0         1        1         0        0         0   

   processing  uses  
0           1     0  
1           0     0  
2           1     0  
3           0     1  


In [12]:
# TF-IDF: learn the vocabulary and IDF weights from the corpus first, then
# encode every sentence as a sparse row of TF-IDF scores.
tfidf_vectorizer = TfidfVectorizer().fit(sentences)
tfidf = tfidf_vectorizer.transform(sentences)

In [13]:
# Print a blank line and title, then the TF-IDF matrix as a labelled table
# (one row per sentence, one column per vocabulary term).
print(
    "\nTF-IDF Matrix:",
    pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out()),
    sep="\n",
)


TF-IDF Matrix:
       data     helps        is  language  learning   machine  machines  \
0  0.000000  0.000000  0.447214  0.447214  0.000000  0.000000  0.000000   
1  0.000000  0.000000  0.500000  0.000000  0.500000  0.500000  0.000000   
2  0.000000  0.508672  0.000000  0.401043  0.000000  0.000000  0.508672   
3  0.555283  0.000000  0.000000  0.000000  0.437791  0.437791  0.000000   

    natural  powerful  processing      uses  
0  0.447214  0.447214    0.447214  0.000000  
1  0.000000  0.500000    0.000000  0.000000  
2  0.401043  0.000000    0.401043  0.000000  
3  0.000000  0.000000    0.000000  0.555283  


In [15]:
# Vocabulary terms, in the column order of the BoW matrix.
features = bow_vectorizer.get_feature_names_out()

# `features` is later used to label the column totals of BOTH the BoW and the
# TF-IDF matrices. That is only valid while the two vectorizers produce the
# same vocabulary in the same order, so fail loudly if their settings ever
# diverge instead of silently attaching scores to the wrong words.
assert np.array_equal(features, tfidf_vectorizer.get_feature_names_out()), (
    "BoW and TF-IDF vocabularies differ; their column totals cannot share one word list"
)

In [17]:
# Collapse each dense document-term matrix along axis 0 (over the sentences),
# leaving one total per vocabulary term.
bow_sum = np.sum(bow.toarray(), axis=0)
tfidf_sum = np.sum(tfidf.toarray(), axis=0)

In [18]:
# Side-by-side summary: one row per vocabulary term, holding its total raw
# count and its total TF-IDF score summed over all sentences.
comparison = pd.DataFrame(
    {
        "Word": features,
        "BoW Total Count": bow_sum,
        "TF-IDF Total Score": tfidf_sum,
    }
)

In [19]:
# Rank the terms from highest to lowest summed TF-IDF score and print the table.
print(
    comparison.sort_values(
        by="TF-IDF Total Score",
        ascending=False,
    )
)

          Word  BoW Total Count  TF-IDF Total Score
2           is                2            0.947214
8     powerful                2            0.947214
4     learning                2            0.937791
5      machine                2            0.937791
7      natural                2            0.848256
9   processing                2            0.848256
3     language                2            0.848256
10        uses                1            0.555283
0         data                1            0.555283
1        helps                1            0.508672
6     machines                1            0.508672
