# TF-IDF Workflow (Flow Chart)

```text
Start
  │
  ▼
Step 1: Input Documents
  ["Doc1 text", "Doc2 text", "Doc3 text"]
  │
  ▼
Step 2: Preprocessing
  - Lowercase
  - Tokenize (split on whitespace; punctuation is not removed)
  │
  ▼
Step 3: Vocabulary Creation
  - Collect all unique words across documents
  │
  ▼
Step 4: Term Frequency (TF)
  For each document:
    TF(word, doc) = count(word in doc) / total words in doc
  │
  ▼
Step 5: Inverse Document Frequency (IDF)
  For each word:
    IDF(word) = log( N / (1 + number_of_docs_containing_word) ) + 1
  - Rare words → high IDF
  - Common words → low IDF
  │
  ▼
Step 6: TF × IDF
  For each word in each doc:
    TF-IDF(word, doc) = TF * IDF
  │
  ▼
Step 7: Results
  - Each document now has TF-IDF scores for words
  - Higher score = more important word in that document
```

# Step 1: Define sample documents

In [None]:
# Small sample corpus used by every later step.
documents = [
    "Natural language processing makes computers understand text",
    "TF IDF is a technique in information retrieval",
    "Language models are a part of natural language processing",
]

# Echo each document with a 1-based label so the corpus is easy to inspect.
for number, text in enumerate(documents, start=1):
    print(f"Document {number}: {text}")


# Step 2: Preprocess (lowercase + split into words)

In [None]:
def preprocess(text):
    """Lowercase ``text`` and split it into whitespace-separated tokens.

    Args:
        text: The raw document string.

    Returns:
        A list of lowercase tokens. Punctuation is left attached to the
        tokens, and an empty or all-whitespace string yields ``[]``.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Apply the preprocessing step to every document in the corpus.
tokenized_docs = list(map(preprocess, documents))

# Show the token list produced for each document, labelled from 1.
print("Tokenized Documents:")
for number, tokens in enumerate(tokenized_docs, start=1):
    print(f"Doc {number}:", tokens)


# Step 3: Build vocabulary (all unique words)

In [None]:
# Collect every distinct token in first-seen order.
# dict.fromkeys de-duplicates in a single pass while preserving insertion
# order, which avoids rescanning a growing list for every token.
vocab = list(
    dict.fromkeys(word for doc in tokenized_docs for word in doc)
)

print("Vocabulary:", vocab)

# Step 4: Compute Term Frequency (TF)

In [None]:
def compute_tf(doc, vocab):
    """Compute the term frequency of each vocabulary word in one document.

    Args:
        doc: List of tokens for a single document.
        vocab: Iterable of words to score.

    Returns:
        A dict mapping every word in ``vocab`` to
        ``count(word in doc) / len(doc)``. Words that do not occur in
        ``doc`` map to ``0.0``; if ``doc`` is empty, every word maps to
        ``0.0``.
    """
    from collections import Counter

    total_words = len(doc)
    if total_words == 0:
        # An empty document contains no terms; avoid dividing by zero.
        return {word: 0.0 for word in vocab}

    # Count all tokens in one pass instead of rescanning the document for
    # every vocabulary word. Missing words read as 0 from the Counter.
    counts = Counter(doc)
    return {word: counts[word] / total_words for word in vocab}

# One TF dictionary per tokenized document, in corpus order.
tfs = [
    compute_tf(tokens, vocab)
    for tokens in tokenized_docs
]

# Preview the term frequencies of the first document.
print("TF for Document 1:", tfs[0], sep="\n")


# Step 5: Compute Inverse Document Frequency (IDF)

In [None]:
import math
def compute_idf(docs, vocab):
    """Compute the inverse document frequency of each vocabulary word.

    Uses ``IDF(word) = log(N / (1 + df)) + 1``, where ``N`` is the number
    of documents and ``df`` is the number of documents containing the word.

    Args:
        docs: Sequence of tokenized documents (each a list of tokens).
        vocab: Iterable of words to score.

    Returns:
        A dict mapping every word in ``vocab`` to its IDF value.

    Raises:
        ValueError: Raised by ``math.log`` when ``docs`` is empty and
            ``vocab`` is not, because the ratio inside the log is zero.
    """
    # Build each document's token set once so the membership test below is
    # a hash lookup rather than a list scan repeated for every word.
    doc_sets = [set(doc) for doc in docs]
    N = len(doc_sets)
    idf = {}
    for word in vocab:
        containing = sum(1 for tokens in doc_sets if word in tokens)
        idf[word] = math.log(N / (1 + containing)) + 1
    return idf

# IDF is a corpus-wide quantity, so it is computed once for all documents.
idf = compute_idf(tokenized_docs, vocab)

# Print the heading and the IDF table on separate lines.
print("IDF values:", idf, sep="\n")


# Step 6: Compute TF-IDF

In [None]:

def compute_tfidf(tf, idf):
    """Multiply each term frequency by the matching IDF weight.

    Args:
        tf: Dict mapping words to their term frequency in one document.
        idf: Dict mapping words to their inverse document frequency; it
            must contain every key of ``tf``.

    Returns:
        A dict with the same keys as ``tf`` whose values are ``tf * idf``.

    Raises:
        KeyError: If a word in ``tf`` has no entry in ``idf``.
    """
    return {term: frequency * idf[term] for term, frequency in tf.items()}

# Weight every document's TF table by the shared IDF values.
tfidf_docs = [
    compute_tfidf(doc_tf, idf)
    for doc_tf in tfs
]

# Preview the TF-IDF scores of the first document.
print("TF-IDF for Document 1:", tfidf_docs[0], sep="\n")


# Step 7: Print results nicely

In [None]:
# Report only the words that score above zero in each document.
for number, scores in enumerate(tfidf_docs, start=1):
    print(f"\nDocument {number}:")
    for word, score in scores.items():
        if not score > 0:
            # Skip vocabulary words that carry no weight in this document.
            continue
        print(f"{word:15} -> {score:.4f}")
