In [5]:
# Toy corpus used throughout: one document per string, three documents total.
corpus = [
    'the sun is a star',
    'the moon is a satellite',
    'the sun and moon are celestial bodies'
]

In [6]:
import math
from collections import defaultdict
import pandas as pd

def tokenize(doc):
    """Turn a document string into a list of lower-cased tokens.

    The text is lower-cased first and then split on runs of whitespace, so
    punctuation stays attached to the neighbouring word.
    """
    lowered = doc.lower()
    return lowered.split()

def tf(corpus_tokenized):
    """Compute per-document term frequencies.

    Args:
        corpus_tokenized: iterable of token lists, one list per document.

    Returns:
        A list with one dict per document mapping each distinct token to
        ``occurrences / number of tokens in that document``. Keys appear in
        order of first occurrence; an empty document yields an empty dict.
    """
    frequencies = []
    for doc_tokens in corpus_tokenized:
        n_tokens = len(doc_tokens)

        # Tally raw occurrences, keeping first-seen order of the terms.
        occurrences = {}
        for term in doc_tokens:
            occurrences[term] = occurrences.get(term, 0) + 1

        # Normalise each tally by the document length.
        doc_frequencies = {}
        for term, n_hits in occurrences.items():
            doc_frequencies[term] = n_hits / n_tokens
        frequencies.append(doc_frequencies)
    return frequencies

def df(corpus_tokenized):
    """Count, for every word, the number of documents that contain it.

    Args:
        corpus_tokenized: iterable of token lists, one list per document.

    Returns:
        A ``defaultdict(int)`` mapping word -> document frequency. A word that
        occurs several times in one document is counted once for that document.
    """
    doc_counts = defaultdict(int)
    for doc_tokens in corpus_tokenized:
        # Deduplicate first so repeats inside a document do not inflate the count.
        for term in set(doc_tokens):
            doc_counts[term] = doc_counts[term] + 1
    return doc_counts

def idf(df, N):
    """Compute the inverse document frequency ``log(N / df)`` for every word.

    Args:
        df: mapping of word -> number of documents containing the word.
        N: total number of documents.

    Returns:
        A dict mapping each word in ``df`` to ``math.log(N / freq)`` (natural
        logarithm). A word present in every document therefore scores 0.0.
    """
    return {term: math.log(N / doc_count) for term, doc_count in df.items()}

def tfidf(tf, idf):
    """Weight every term frequency by the matching inverse document frequency.

    Args:
        tf: list of per-document dicts mapping word -> term frequency.
        idf: dict mapping word -> inverse document frequency; it must contain
            every word that appears in ``tf`` (a missing word raises KeyError).

    Returns:
        A list of dicts, parallel to ``tf``, mapping word -> ``tf * idf``.
    """
    return [
        {term: freq * idf[term] for term, freq in term_freqs.items()}
        for term_freqs in tf
    ]

# Run the hand-written TF-IDF pipeline on `corpus`, one stage per line.
corpus_tokenized = [tokenize(doc) for doc in corpus]
# NOTE(review): the next three assignments rebind `tf`, `df` and `idf` from the
# functions defined earlier in this cell to their return values. After this
# point the functions are no longer reachable under those names, so these lines
# cannot be re-executed without re-running the `def`s first.
tf = tf(corpus_tokenized)
df = df(corpus_tokenized)
idf = idf(df, len(corpus))  # N = number of documents in the corpus
tfidf_manual = tfidf(tf, idf)

# One row per document, one column per word; a word that a document does not
# contain is missing from that document's dict and is filled with 0 here.
tfidf_df = pd.DataFrame(tfidf_manual).fillna(0)
print(tfidf_df)



   the       sun        is         a      star      moon  satellite       and  \
0  0.0  0.081093  0.081093  0.081093  0.219722  0.000000   0.000000  0.000000   
1  0.0  0.000000  0.081093  0.081093  0.000000  0.081093   0.219722  0.000000   
2  0.0  0.057924  0.000000  0.000000  0.000000  0.057924   0.000000  0.156945   

        are  celestial    bodies  
0  0.000000   0.000000  0.000000  
1  0.000000   0.000000  0.000000  
2  0.156945   0.156945  0.156945  


In [12]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Raw term counts: fit the vectorizer on `corpus`, then encode the same
# documents and show them as a document x vocabulary table.
vectorizer = CountVectorizer()
vectorizer.fit(corpus)
vector = vectorizer.transform(corpus)
cv_vocabulary = vectorizer.get_feature_names_out()
cv_df = pd.DataFrame(vector.toarray(), columns=cv_vocabulary)
print(cv_df)

# TF-IDF weights as computed by scikit-learn with its default settings.
# NOTE(review): `tfidf` is also the name of the hand-written function and
# `tfidf_df` the name of the manual result table earlier in this file; both
# are overwritten here.
# NOTE(review): the printed weights differ from the manual table (e.g. "the"
# is non-zero here) - presumably because TfidfVectorizer's defaults smooth the
# IDF and L2-normalise each row; confirm against the scikit-learn docs.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(corpus)
tfidf_vocabulary = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(result.toarray(), columns=tfidf_vocabulary)
print(tfidf_df)

   and  are  bodies  celestial  is  moon  satellite  star  sun  the
0    0    0       0          0   1     0          0     1    1    1
1    0    0       0          0   1     1          1     0    0    1
2    1    1       1          1   0     1          0     0    1    1
        and       are    bodies  celestial        is      moon  satellite  \
0  0.000000  0.000000  0.000000   0.000000  0.480458  0.000000   0.000000   
1  0.000000  0.000000  0.000000   0.000000  0.480458  0.480458   0.631745   
2  0.426184  0.426184  0.426184   0.426184  0.000000  0.324124   0.000000   

       star       sun       the  
0  0.631745  0.480458  0.373119  
1  0.000000  0.000000  0.373119  
2  0.000000  0.324124  0.251711  
