In [1]:
# Toy corpus: each string is treated as one document by the cells that follow.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

In [2]:
import math
from collections import Counter

# Manual TF-IDF, computed from scratch:
#   tf(word, doc) = occurrences of word in doc / number of tokens in doc
#   idf(word)     = ln(number of documents / number of documents containing word)

# Lowercase each document and split on whitespace (no punctuation handling).
tokenized_corpus = [doc.lower().split() for doc in corpus]
n_docs = len(tokenized_corpus)

# Sorted so the column order of the resulting matrix is deterministic.
vocab = sorted({word for doc in tokenized_corpus for word in doc})
print("Vocabulary:", vocab)

# Document frequency: how many documents contain each word at least once.
# Membership is checked against per-document sets rather than token lists.
doc_term_sets = [set(doc) for doc in tokenized_corpus]
df = {word: sum(1 for terms in doc_term_sets if word in terms) for word in vocab}
print("\nDocument Frequencies:", df)

# IDF depends only on the word, so compute it once instead of once per
# (document, word) pair. df[word] >= 1 for every vocab word because the
# vocabulary is built from these same documents, so the division is safe.
idf_by_word = {word: math.log(n_docs / df[word]) for word in vocab}

tfidf_manual = []

for doc in tokenized_corpus:
    word_count = Counter(doc)
    doc_len = len(doc)
    tfidf_vector = []

    for word in vocab:
        # An empty or whitespace-only document has no tokens; give it a term
        # frequency of zero instead of raising ZeroDivisionError.
        tf = word_count[word] / doc_len if doc_len else 0.0
        idf = idf_by_word[word]
        tfidf = tf * idf
        # Rounded to 4 decimals for display; the stored matrix is rounded too.
        tfidf_vector.append(round(tfidf, 4))

    tfidf_manual.append(tfidf_vector)

print("\nManual TF-IDF Matrix:")
for row in tfidf_manual:
    print(row)

Vocabulary: ['a', 'and', 'are', 'bodies', 'celestial', 'is', 'moon', 'satellite', 'star', 'sun', 'the']

Document Frequencies: {'a': 2, 'and': 1, 'are': 1, 'bodies': 1, 'celestial': 1, 'is': 2, 'moon': 2, 'satellite': 1, 'star': 1, 'sun': 2, 'the': 3}

Manual TF-IDF Matrix:
[0.0811, 0.0, 0.0, 0.0, 0.0, 0.0811, 0.0, 0.0, 0.2197, 0.0811, 0.0]
[0.0811, 0.0, 0.0, 0.0, 0.0, 0.0811, 0.0811, 0.2197, 0.0, 0.0, 0.0]
[0.0, 0.1569, 0.1569, 0.1569, 0.1569, 0.0, 0.0579, 0.0, 0.0, 0.0579, 0.0]


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Raw term counts with scikit-learn's default settings. The default tokenizer
# keeps only tokens of two or more word characters, which is why the
# single-letter word 'a' does not appear in this vocabulary.
count_vect = CountVectorizer()
X_count = count_vect.fit_transform(corpus)
sk_feature_names = count_vect.get_feature_names_out()
count_dense = X_count.toarray()
print("Vocabulary:", sk_feature_names)
print("\nCount Vectorizer Matrix:\n", count_dense)

# TF-IDF with default settings. The defaults smooth the IDF term and
# L2-normalize each row, so these values are not expected to match an
# unsmoothed, unnormalized tf * ln(N / df) computation.
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(corpus)
tfidf_dense = X_tfidf.toarray()
print("\nTF-IDF Vectorizer Matrix:\n", tfidf_dense)

Vocabulary: ['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']

Count Vectorizer Matrix:
 [[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]

TF-IDF Vectorizer Matrix:
 [[0.         0.         0.         0.         0.4804584  0.
  0.         0.63174505 0.4804584  0.37311881]
 [0.         0.         0.         0.         0.4804584  0.4804584
  0.63174505 0.         0.         0.37311881]
 [0.4261835  0.4261835  0.4261835  0.4261835  0.         0.32412354
  0.         0.         0.32412354 0.25171084]]
