IR pr 2

Implement a program for retrieval of documents using inverted files.

In [6]:
from collections import defaultdict
import re

# Sample document collection
documents = {
    "D1": "Programmers program with Python and Java",
    "D2": "Python is popular for machine learning",
    "D3": "Java is used for backend programming",
    "D4": "Machine learning and deep learning are related"
}


In [7]:
# Preprocess text: lowercase + tokenization + remove punctuation
def preprocess(text):
    text = text.lower()
    words = re.findall(r'\w+', text)  # keep only words
    return words


In [8]:
inverted_index = defaultdict(set)

for doc_id, text in documents.items():
    words = preprocess(text)
    for word in words:
        inverted_index[word].add(doc_id)


In [9]:
def search(query):
    query_words = preprocess(query)
    if not query_words:
        return []

    # Start with docs for the first word
    result_docs = inverted_index.get(query_words[0], set()).copy()

    # Intersect with docs for remaining words
    for word in query_words[1:]:
        result_docs &= inverted_index.get(word, set())

    return list(result_docs)


In [10]:
print("Inverted Index:")
for term, docs in inverted_index.items():
    print(term, ":", docs)

print("\nSearch Results:")
print("Query: 'python'            →", search("python"))
print("Query: 'machine learning'  →", search("machine learning"))
print("Query: 'java backend'      →", search("java backend"))


Inverted Index:
programmers : {'D1'}
program : {'D1'}
with : {'D1'}
python : {'D2', 'D1'}
and : {'D4', 'D1'}
java : {'D3', 'D1'}
is : {'D3', 'D2'}
popular : {'D2'}
for : {'D3', 'D2'}
machine : {'D4', 'D2'}
learning : {'D4', 'D2'}
used : {'D3'}
backend : {'D3'}
programming : {'D3'}
deep : {'D4'}
are : {'D4'}
related : {'D4'}

Search Results:
Query: 'python'            → ['D2', 'D1']
Query: 'machine learning'  → ['D4', 'D2']
Query: 'java backend'      → ['D3']
