In [1]:
import math
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
# Toy corpus: three short documents used by every cell below.
corpus = [
    "the sun is a star",
    "the moon is a satellite",
    "the sun and moon are celestial bodies",
]

In [3]:
# Lowercase each document and split it on whitespace to get its token list.
tokenized_corpus = [sentence.lower().split() for sentence in corpus]


In [4]:
# Alphabetically sorted list of the distinct tokens seen anywhere in the corpus.
vocab = sorted({token for tokens in tokenized_corpus for token in tokens})


In [5]:
def compute_tf(doc, vocabulary=None):
    """Return the term frequency of every vocabulary word in one document.

    Term frequency is the number of times a word occurs in ``doc`` divided
    by the total number of tokens in ``doc``.

    Args:
        doc: List of tokens for a single document.
        vocabulary: Iterable of words to score. When omitted, the
            notebook-level ``vocab`` is used, so existing
            ``compute_tf(doc)`` calls behave as before.

    Returns:
        dict mapping each vocabulary word to its relative frequency in
        ``doc``. Every value is ``0.0`` when ``doc`` is empty.
    """
    if vocabulary is None:
        vocabulary = vocab

    total_tokens = len(doc)
    if total_tokens == 0:
        # An empty document (e.g. a blank string in the corpus) has no
        # tokens to normalise by; report zero frequency instead of
        # raising ZeroDivisionError.
        return {word: 0.0 for word in vocabulary}

    return {word: doc.count(word) / total_tokens for word in vocabulary}

# One term-frequency dict per document, in corpus order.
tf_list = list(map(compute_tf, tokenized_corpus))

In [6]:
def compute_df(documents=None, vocabulary=None):
    """Count, for every vocabulary word, how many documents contain it.

    Args:
        documents: Iterable of tokenized documents (each an iterable of
            hashable tokens). When omitted, the notebook-level
            ``tokenized_corpus`` is used.
        vocabulary: Iterable of words to count. When omitted, the
            notebook-level ``vocab`` is used.

    Returns:
        dict mapping each vocabulary word to the number of documents in
        which it appears at least once (an ``int``, possibly 0).
    """
    if documents is None:
        documents = tokenized_corpus
    if vocabulary is None:
        vocabulary = vocab

    # Build each document's token set once so every membership test below
    # is a hash lookup rather than a rescan of the token list.
    doc_token_sets = [set(doc) for doc in documents]

    return {
        word: sum(word in tokens for tokens in doc_token_sets)
        for word in vocabulary
    }

# Document frequency: for each vocabulary word, the number of corpus
# documents that contain it (a plain dict, not a DataFrame).
df = compute_df()

In [7]:
# Unsmoothed inverse document frequency: ln(N / df). A word that occurs in
# every document therefore scores 0; a word with df == 0 is also given 0.0.
N = len(corpus)
idf = {
    word: (math.log(N / df[word]) if df[word] else 0.0)
    for word in vocab
}

In [8]:
# TF-IDF per document: tf * idf for every vocabulary word, rounded to 4 dp.
tfidf_manual = [
    {word: round(tf[word] * idf[word], 4) for word in vocab}
    for tf in tf_list
]

print("Manual TF-IDF:")
for doc_number, scores in enumerate(tfidf_manual, start=1):
    print(f"Doc {doc_number}: {scores}")

Manual TF-IDF:
Doc 1: {'a': 0.0811, 'and': 0.0, 'are': 0.0, 'bodies': 0.0, 'celestial': 0.0, 'is': 0.0811, 'moon': 0.0, 'satellite': 0.0, 'star': 0.2197, 'sun': 0.0811, 'the': 0.0}
Doc 2: {'a': 0.0811, 'and': 0.0, 'are': 0.0, 'bodies': 0.0, 'celestial': 0.0, 'is': 0.0811, 'moon': 0.0811, 'satellite': 0.2197, 'star': 0.0, 'sun': 0.0, 'the': 0.0}
Doc 3: {'a': 0.0, 'and': 0.1569, 'are': 0.1569, 'bodies': 0.1569, 'celestial': 0.1569, 'is': 0.0, 'moon': 0.0579, 'satellite': 0.0, 'star': 0.0, 'sun': 0.0579, 'the': 0.0}


In [9]:
# Bag-of-words counts from scikit-learn, fitted on the raw (untokenized) corpus.
# NOTE: 'a' is missing from the feature names printed below, unlike the manual
# vocab — presumably because the default token_pattern keeps only tokens of
# two or more characters; confirm against the scikit-learn docs.
cv = CountVectorizer()
cv_matrix = cv.fit_transform(corpus)

print(
    "\nscikit-learn CountVectorizer:",
    cv.get_feature_names_out(),
    cv_matrix.toarray(),
    sep="\n",
)


scikit-learn CountVectorizer:
['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
[[0 0 0 0 1 0 0 1 1 1]
 [0 0 0 0 1 1 1 0 0 1]
 [1 1 1 1 0 1 0 0 1 1]]


In [10]:
# TF-IDF from scikit-learn with default settings.
# NOTE: these numbers are not expected to match the manual table above —
# presumably the defaults smooth the idf and L2-normalise each row, whereas
# the manual version uses plain ln(N / df) with no normalisation; confirm
# against the scikit-learn docs.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(corpus)

print(
    "\nscikit-learn TfidfVectorizer:",
    tfidf.get_feature_names_out(),
    tfidf_matrix.toarray().round(3),
    sep="\n",
)


scikit-learn TfidfVectorizer:
['and' 'are' 'bodies' 'celestial' 'is' 'moon' 'satellite' 'star' 'sun'
 'the']
[[0.    0.    0.    0.    0.48  0.    0.    0.632 0.48  0.373]
 [0.    0.    0.    0.    0.48  0.48  0.632 0.    0.    0.373]
 [0.426 0.426 0.426 0.426 0.    0.324 0.    0.    0.324 0.252]]
