In [1]:
import math
import re
from collections import Counter

# Sample corpus: four short English sentences, one string per document.
# Some words recur across documents ("sun", "cat", "the", "a", "is") and
# "shining" appears three times in the first sentence, so both the
# within-document and the across-document frequencies vary.
docs = [
    "The sun is shining brightly shining shining today.",
    "A bright sun brings happiness.",
    "The cat is sleeping on the mat.",
    "A cat and dog are good friends."
]

# Preprocess: lowercase + remove punctuation
def preprocess(text):
    """Return the lowercase word tokens of ``text`` in order of appearance.

    Tokens are runs of regex word characters (letters, digits, underscore);
    punctuation and whitespace only separate tokens and are dropped.
    """
    lowered = text.lower()
    return [match.group() for match in re.finditer(r'\b\w+\b', lowered)]

# Tokenize every document once; each entry is that document's token list.
processed = list(map(preprocess, docs))

# Compute TF (term frequency)
def tf(words):
    """Return each distinct token's share of the document's tokens.

    Args:
        words: Token list for a single document.

    Returns:
        Dict mapping token -> (occurrences of token) / (number of tokens).
        An empty token list yields an empty dict.
    """
    total = len(words)
    frequencies = {}
    for token, occurrences in Counter(words).items():
        frequencies[token] = occurrences / total
    return frequencies

# Compute IDF (inverse document frequency)
def idf(processed):
    """Return a smoothed inverse document frequency for every corpus term.

    Args:
        processed: List of tokenized documents, each a list of term strings.

    Returns:
        Dict mapping each distinct term to ``log(N / (1 + df))``, where ``N``
        is the number of documents and ``df`` is the number of documents
        that contain the term. An empty corpus yields an empty dict.

    Note:
        Because of the ``1 +`` in the denominator, a term found in ``N - 1``
        documents scores exactly 0 and a term found in every document
        scores below 0.
    """
    N = len(processed)
    # Deduplicate each document so a term counts at most once per document;
    # one Counter pass then replaces scanning every document list for
    # every vocabulary term.
    doc_freq = Counter(w for doc in processed for w in set(doc))
    return {w: math.log(N / (1 + df)) for w, df in doc_freq.items()}

# Compute TF-IDF
def tfidf(processed):
    """Return one ``{term: tf * idf}`` dict per document, in corpus order.

    Args:
        processed: List of tokenized documents, each a list of term strings.

    Returns:
        List with one dict per input document; each dict maps the terms of
        that document to their term frequency multiplied by the corpus-wide
        inverse document frequency.
    """
    idf_scores = idf(processed)
    weighted = []
    for doc in processed:
        tf_doc = tf(doc)
        weighted.append({w: freq * idf_scores[w] for w, freq in tf_doc.items()})
    return weighted

# Run and display
# Score the corpus, then list each document's terms from the highest to the
# lowest TF-IDF weight.
results = tfidf(processed)
print("\n--- TF-IDF Scores ---")
for i, doc in enumerate(results, start=1):
    print(f"\nDocument {i}:")
    ranked = sorted(doc.items(), key=lambda pair: pair[1], reverse=True)
    for w, s in ranked:
        print(f" {w:<10} : {s:.4f}")



--- TF-IDF Scores ---

Document 1:
 shining    : 0.2599
 brightly   : 0.0866
 today      : 0.0866
 the        : 0.0360
 sun        : 0.0360
 is         : 0.0360

Document 2:
 bright     : 0.1386
 brings     : 0.1386
 happiness  : 0.1386
 a          : 0.0575
 sun        : 0.0575

Document 3:
 sleeping   : 0.0990
 on         : 0.0990
 mat        : 0.0990
 the        : 0.0822
 cat        : 0.0411
 is         : 0.0411

Document 4:
 and        : 0.0990
 dog        : 0.0990
 are        : 0.0990
 good       : 0.0990
 friends    : 0.0990
 a          : 0.0411
 cat        : 0.0411
