# In [1]:
import math

# 1. Our Dataset: two short documents that share most of their vocabulary
docA = "the cat sat on my face"
docB = "the dog sat on my bed"

# Tokenise each document on single spaces
bowA, bowB = docA.split(" "), docB.split(" ")
# Vocabulary: every distinct word that occurs in either document
wordSet = set(bowA) | set(bowB)

# --- Step 1: Compute TF (Term Frequency) ---
def computeTF(wordDict, bow):
    """Compute the term frequency of every word for one document.

    Args:
        wordDict: Mapping of word -> raw occurrence count in the document.
        bow: The document's token list; only its length is used, as the
            normalising total.

    Returns:
        A new dict mapping each key of ``wordDict`` to
        ``count / len(bow)``. If ``bow`` is empty, every frequency is 0.0.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document has no terms; return zeros instead of letting
        # the division below raise ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

# Count words in each document.
# Every vocabulary word starts at zero so both dicts share the same keys.
wordDictA = {term: 0 for term in wordSet}
wordDictB = {term: 0 for term in wordSet}

# Tally each document's tokens into its own count dict
for counts, tokens in ((wordDictA, bowA), (wordDictB, bowB)):
    for token in tokens:
        counts[token] += 1

# Normalise the raw counts by document length
tfA = computeTF(wordDictA, bowA)
tfB = computeTF(wordDictB, bowB)

# --- Step 2: Compute IDF (Inverse Document Frequency) ---
def computeIDF(docList):
    """Compute the inverse document frequency of every word.

    Args:
        docList: List of dicts, one per document, each mapping
            word -> raw count of that word in the document.

    Returns:
        A dict mapping each word that appears as a key in any document
        to ``log(N / df)``, where ``N`` is the number of documents and
        ``df`` is the number of documents whose count for the word is
        greater than zero. Words with ``df == 0`` map to 0. An empty
        ``docList`` yields an empty dict.
    """
    N = len(docList)

    # Count how many documents contain each word. The vocabulary is built
    # from every document, not just the first, so a word that is only a
    # key in a later document is counted instead of raising KeyError.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # Apply the IDF formula: log(Total Docs / Docs with word)
    idfDict = {}
    for word, freq in docFreq.items():
        if freq > 0:
            idfDict[word] = math.log(N / float(freq))
        else:
            # Word is a key but never occurs; avoid dividing by zero.
            idfDict[word] = 0

    return idfDict

# IDF is computed once over the raw count dicts of both documents
idfs = computeIDF(docList=[wordDictA, wordDictB])

# --- Step 3: Compute TF-IDF ---
def computeTFIDF(tfBow, idfs):
    """Weight one document's term frequencies by the IDF table.

    Args:
        tfBow: Mapping of word -> term frequency for a single document.
        idfs: Mapping of word -> inverse document frequency; it must
            contain every key of ``tfBow`` (a missing word raises KeyError).

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: tf * idfs[term] for term, tf in tfBow.items()}

# Weight each document's term frequencies by the shared IDF table
tfidfA, tfidfB = (computeTFIDF(tf, idfs) for tf in (tfA, tfB))

# Print Results
import pandas as pd
# One row per document, one column per vocabulary word
df = pd.DataFrame([tfidfA, tfidfB])
print(df)

# Sample output (column order follows set iteration order and may differ between runs):
#
#     my      face       bed  sat       dog   on       cat  the
# 0  0.0  0.115525  0.000000  0.0  0.000000  0.0  0.115525  0.0
# 1  0.0  0.000000  0.115525  0.0  0.115525  0.0  0.000000  0.0
