In [7]:
from collections import defaultdict
import re

In [8]:
 # Toy corpus used throughout the notebook: document ID -> raw text.
 documents = dict(
     D1="Programmers program with Python and Java",
     D2="Python is popular for machine learning",
     D3="Java is used for backend programming",
     D4="Machine learning and deep learning are related",
 )

In [9]:
# Step 1: Normalize text into lowercase word tokens (punctuation is dropped)
def preprocess(text):
    """Tokenize ``text`` into a list of lowercase words.

    The text is lowercased first; then every maximal run of regex word
    characters is collected in order of appearance. Anything else
    (spaces, punctuation) only separates tokens and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase token strings; empty if ``text`` contains
        no word characters.
    """
    return re.findall(r'\w+', text.lower())

In [10]:
# Step 2: Build inverted index (token -> set of IDs of the documents containing it)
inverted_index = defaultdict(set)
for doc_id in documents:
    # A set is used per token, so repeated words within one document
    # record that document only once.
    for token in preprocess(documents[doc_id]):
        inverted_index[token].add(doc_id)

In [11]:
# Step 3: Retrieval function (AND search)
def search(query):
    """Return the IDs of the documents that contain every term of ``query``.

    The query is tokenized with ``preprocess`` (the same normalization
    that was applied when the index was built), and the document-ID sets
    of all query terms are intersected.

    Args:
        query: Free-text query string.

    Returns:
        A list of matching document IDs in ascending (``sorted``) order,
        so the result is reproducible across kernel restarts. The list is
        empty when the query yields no tokens or when any query term is
        absent from the index.
    """
    query_words = preprocess(query)
    if not query_words:
        return []

    # Look terms up with .get() instead of indexing: inverted_index is a
    # defaultdict, and indexing would insert an empty entry for every
    # unknown query term.
    postings = [inverted_index.get(word, set()) for word in query_words]

    # set.intersection returns a new set, so the sets stored in the index
    # are never mutated.
    result_docs = set.intersection(*postings)

    # Set iteration order is arbitrary for strings; sort for stable output.
    return sorted(result_docs)


In [12]:
# Output: dump the index, then run a few sample AND queries
print("Inverted Index:")
for term in inverted_index:
    print(term, ":", inverted_index[term])

print("\nSearch Results:")
# Each pair is (pre-aligned label to print, query string to run).
for label, query in (
    ("Query: 'python'            →", "python"),
    ("Query: 'machine learning'  →", "machine learning"),
    ("Query: 'java backend'      →", "java backend"),
):
    print(label, search(query))

Inverted Index:
programmers : {'D1'}
program : {'D1'}
with : {'D1'}
python : {'D1', 'D2'}
and : {'D4', 'D1'}
java : {'D1', 'D3'}
is : {'D3', 'D2'}
popular : {'D2'}
for : {'D3', 'D2'}
machine : {'D4', 'D2'}
learning : {'D4', 'D2'}
used : {'D3'}
backend : {'D3'}
programming : {'D3'}
deep : {'D4'}
are : {'D4'}
related : {'D4'}

Search Results:
Query: 'python'            → ['D1', 'D2']
Query: 'machine learning'  → ['D4', 'D2']
Query: 'java backend'      → ['D3']
