In [43]:
import numpy as np
from collections import defaultdict
from math import log

In [44]:
# 3 sentence sample corpus
sample_corpus = [
    "In the magical forest, the wizard cast a spell to protect the enchanted tree",
    "The enchanted tree glowed brightly, illuminating the entire forest with its magical light",
    "As the forest thrived, the wizard watched over the enchanted tree, ensuring its magical essence remained strong"
]

In [45]:
# Data preprocessing

def data_preprocessing(text):
    return [word.lower() for word in text.split() if word.isalpha()]


In [46]:
# Compute Term Freaquency

def compute_term_frequency(corpus):
    tf = []
    for document in corpus:
        word_count = defaultdict(int)
        words = data_preprocessing(document)
        for word in words:
            word_count[word] += 1
        tf.append(
            {word:count/len(words) for word , count in word_count.items()}
        )
    return tf

In [47]:
# Compute Inverse Document Frequency (IDF)

def compute_inverse__document_frequency(corpus):
    idf = {}
    total_documents = len(corpus)
    all_words = set(word for document in corpus for word in data_preprocessing(document))
    for word in all_words:
        containing_docs = sum(1 for document in corpus if word in data_preprocessing(document))
        idf[word] = log(total_documents + (1/containing_docs)) + 1
    return idf

In [48]:
# Compute TF-IDF

def compute_tf_idf(tf , idf):
    tf_idf = []
    for document_tf in tf:
        document_tf_idf = {word: tf_value *idf[word] for word, tf_value in document_tf.items()}
        tf_idf.append(document_tf_idf)
    return tf_idf

In [49]:
tf = compute_term_frequency(corpus = sample_corpus)
print(tf)

[{'in': 0.07692307692307693, 'the': 0.23076923076923078, 'magical': 0.07692307692307693, 'wizard': 0.07692307692307693, 'cast': 0.07692307692307693, 'a': 0.07692307692307693, 'spell': 0.07692307692307693, 'to': 0.07692307692307693, 'protect': 0.07692307692307693, 'enchanted': 0.07692307692307693, 'tree': 0.07692307692307693}, {'the': 0.16666666666666666, 'enchanted': 0.08333333333333333, 'tree': 0.08333333333333333, 'glowed': 0.08333333333333333, 'illuminating': 0.08333333333333333, 'entire': 0.08333333333333333, 'forest': 0.08333333333333333, 'with': 0.08333333333333333, 'its': 0.08333333333333333, 'magical': 0.08333333333333333, 'light': 0.08333333333333333}, {'as': 0.06666666666666667, 'the': 0.2, 'forest': 0.06666666666666667, 'wizard': 0.06666666666666667, 'watched': 0.06666666666666667, 'over': 0.06666666666666667, 'enchanted': 0.06666666666666667, 'ensuring': 0.06666666666666667, 'its': 0.06666666666666667, 'magical': 0.06666666666666667, 'essence': 0.06666666666666667, 'remaine

In [50]:
idf = compute_inverse__document_frequency(corpus=sample_corpus)
print(idf)

{'spell': 2.386294361119891, 'strong': 2.386294361119891, 'enchanted': 2.203972804325936, 'essence': 2.386294361119891, 'its': 2.252762968495368, 'over': 2.386294361119891, 'a': 2.386294361119891, 'to': 2.386294361119891, 'wizard': 2.252762968495368, 'protect': 2.386294361119891, 'illuminating': 2.386294361119891, 'entire': 2.386294361119891, 'in': 2.386294361119891, 'the': 2.203972804325936, 'forest': 2.252762968495368, 'watched': 2.386294361119891, 'light': 2.386294361119891, 'tree': 2.252762968495368, 'cast': 2.386294361119891, 'magical': 2.203972804325936, 'as': 2.386294361119891, 'with': 2.386294361119891, 'ensuring': 2.386294361119891, 'glowed': 2.386294361119891, 'remained': 2.386294361119891}


In [51]:
tf_idf = compute_tf_idf(tf = tf , idf = idf )
print(tf_idf)

[{'in': 0.18356110470153006, 'the': 0.5086091086906006, 'magical': 0.16953636956353355, 'wizard': 0.17328945911502833, 'cast': 0.18356110470153006, 'a': 0.18356110470153006, 'spell': 0.18356110470153006, 'to': 0.18356110470153006, 'protect': 0.18356110470153006, 'enchanted': 0.16953636956353355, 'tree': 0.17328945911502833}, {'the': 0.3673288007209893, 'enchanted': 0.18366440036049464, 'tree': 0.187730247374614, 'glowed': 0.19885786342665757, 'illuminating': 0.19885786342665757, 'entire': 0.19885786342665757, 'forest': 0.187730247374614, 'with': 0.19885786342665757, 'its': 0.187730247374614, 'magical': 0.18366440036049464, 'light': 0.19885786342665757}, {'as': 0.15908629074132605, 'the': 0.4407945608651872, 'forest': 0.1501841978996912, 'wizard': 0.1501841978996912, 'watched': 0.15908629074132605, 'over': 0.15908629074132605, 'enchanted': 0.14693152028839573, 'ensuring': 0.15908629074132605, 'its': 0.1501841978996912, 'magical': 0.14693152028839573, 'essence': 0.15908629074132605, 'rem

In [52]:
# The results

for i , document_tf_idf in enumerate(tf_idf):
    print(f"Document {i+1} TF-IDF is : ")
    for word , score in document_tf_idf.items():
        print(f"{word} : {score}")
    print("*"*50)


Document 1 TF-IDF is : 
in : 0.18356110470153006
the : 0.5086091086906006
magical : 0.16953636956353355
wizard : 0.17328945911502833
cast : 0.18356110470153006
a : 0.18356110470153006
spell : 0.18356110470153006
to : 0.18356110470153006
protect : 0.18356110470153006
enchanted : 0.16953636956353355
tree : 0.17328945911502833
**************************************************
Document 2 TF-IDF is : 
the : 0.3673288007209893
enchanted : 0.18366440036049464
tree : 0.187730247374614
glowed : 0.19885786342665757
illuminating : 0.19885786342665757
entire : 0.19885786342665757
forest : 0.187730247374614
with : 0.19885786342665757
its : 0.187730247374614
magical : 0.18366440036049464
light : 0.19885786342665757
**************************************************
Document 3 TF-IDF is : 
as : 0.15908629074132605
the : 0.4407945608651872
forest : 0.1501841978996912
wizard : 0.1501841978996912
watched : 0.15908629074132605
over : 0.15908629074132605
enchanted : 0.14693152028839573
ensuring : 0.15908