In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# Toy corpus shared by the vectorizer demos below: four short sentences with
# heavily overlapping vocabulary. The first and last sentences use the same
# words in a different order.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Bag of Words

In [3]:
# Bag-of-words demo: build the vocabulary from `documents` and encode every
# document as a row of per-term counts.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(documents)  # learn vocabulary and encode in one call

# Vocabulary terms as reported by the fitted vectorizer.
feature_names = vectorizer.get_feature_names_out()

# Array form of X so the full matrix can be printed.
dense_matrix = X.toarray()

# Single print call; sep="\n" places each item on its own line, and the
# leading "\n" in the second label yields the blank line between sections.
print(
    "Bag of Words Matrix:",
    dense_matrix,
    "\nFeature Names:",
    feature_names,
    sep="\n",
)


Bag of Words Matrix:
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

Feature Names:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']


# TF-IDF

In [4]:
# TF-IDF demo: same corpus as the Bag of Words cell, but each document is
# encoded with TF-IDF weights instead of raw counts.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)  # fit and encode in one call

# Vocabulary terms as reported by the fitted vectorizer.
# NOTE: this rebinds `feature_names`, which the Bag of Words cell also assigns.
feature_names = tfidf_vectorizer.get_feature_names_out()

# Array form of the TF-IDF matrix so every weight can be printed.
tfidf_array = tfidf_matrix.toarray()

# Single print call; sep="\n" places each item on its own line, and the
# leading "\n" in the second label yields the blank line between sections.
print(
    "TF-IDF Representation:",
    tfidf_array,
    "\nFeature Names:",
    feature_names,
    sep="\n",
)


TF-IDF Representation:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

Feature Names:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
