In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Toy corpus: three short, raw English sentences with deliberately
# overlapping vocabulary, so that shared terms and document-specific terms
# get visibly different weights in the matrices computed from this list.
documents = [
    "Natural Language Processing is a field of Artificial Intelligence.",
    "Machine Learning is a part of Artificial Intelligence.",
    "Natural Language Processing and Machine Learning are closely related.",
]

# Bag-of-words term counts.
# fit_transform() learns the vocabulary from `documents` and returns a sparse
# document-term matrix: one row per document, one column per vocabulary term,
# each cell holding how many times that term occurs in that document.
# With the default settings used here, text is lower-cased and one-character
# tokens (e.g. "a") are not part of the vocabulary.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)

# The matrix is sparse; toarray() densifies it so it prints as a plain grid.
# Column j of the grid corresponds to entry j of the feature-name array.
print("Term Frequency (TF) Matrix:", count_matrix.toarray(), sep="\n")
print("Feature Names:", count_vectorizer.get_feature_names_out())

# TF-IDF weights.
# Same row/column layout as the count matrix, but each count is re-weighted by
# inverse document frequency, so terms that appear in fewer documents score
# higher than terms shared across the corpus. With the default settings used
# here each document row is additionally scaled to unit (L2) length.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Leading "\n" keeps a blank line between this section and the previous one.
print("\nTF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

# Analysis: for every document, list each vocabulary term next to its raw
# count (TF) and its TF-IDF weight.
#
# The vocabulary arrays and the dense copies of both sparse matrices do not
# change between iterations, so they are computed once here instead of being
# rebuilt for every document inside the loop.
tf_terms = count_vectorizer.get_feature_names_out()
tf_rows = count_matrix.toarray()
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_rows = tfidf_matrix.toarray()

print("\nFeature Analysis:")
for i, doc in enumerate(documents):
    # Row i of each matrix belongs to documents[i]; show 1-based numbering.
    print(f"\nDocument {i + 1}: {doc}")
    print("TF Scores:")
    for term, score in zip(tf_terms, tf_rows[i]):
        print(f"  {term}: {score}")
    print("TF-IDF Scores:")
    for term, score in zip(tfidf_terms, tfidf_rows[i]):
        # Four decimals is enough to tell the weights apart.
        print(f"  {term}: {score:.4f}")

Term Frequency (TF) Matrix:
[[0 0 1 0 1 1 1 1 0 0 1 1 0 1 0]
 [0 0 1 0 0 1 1 0 1 1 0 1 1 0 0]
 [1 1 0 1 0 0 0 1 1 1 1 0 0 1 1]]
Feature Names: ['and' 'are' 'artificial' 'closely' 'field' 'intelligence' 'is' 'language'
 'learning' 'machine' 'natural' 'of' 'part' 'processing' 'related']

TF-IDF Matrix:
[[0.         0.         0.33846987 0.         0.44504721 0.33846987
  0.33846987 0.33846987 0.         0.         0.33846987 0.33846987
  0.         0.33846987 0.        ]
 [0.         0.         0.35970039 0.         0.         0.35970039
  0.35970039 0.         0.35970039 0.35970039 0.         0.35970039
  0.47296278 0.         0.        ]
 [0.38091445 0.38091445 0.         0.38091445 0.         0.
  0.         0.28969526 0.28969526 0.28969526 0.28969526 0.
  0.         0.28969526 0.38091445]]
Feature Names: ['and' 'are' 'artificial' 'closely' 'field' 'intelligence' 'is' 'language'
 'learning' 'machine' 'natural' 'of' 'part' 'processing' 'related']

Feature Analysis:

Document 1: Natural