# Implement a program for retrieval of documents using inverted files.

In [1]:
# **Imports:**
# - `defaultdict` from `collections`: A dictionary subclass that provides a default value for non-existing keys (in this case, a list).
# - `nltk`: Natural Language Toolkit for text processing.
# - `word_tokenize`: Tokenizes text into words.
# - `stopwords`: Contains common words like 'is', 'and', 'the', etc., to be filtered out.

from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# **Download NLTK resources:**
# - 'punkt': Data needed for tokenizing text (used by older NLTK releases).
# - 'punkt_tab': Tokenizer data that recent NLTK releases load in place of 'punkt';
#   without it `word_tokenize` raises LookupError on those versions.
# - 'stopwords': List of stopwords for English language.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

_STOP_WORDS_BY_LANGUAGE = {}  # Cache: language name -> frozenset of its stopwords.

def _load_stop_words(language="english"):
    """
    Returns the NLTK stopword list for a language, building the set only once.

    Parameters:
    - language (str): Name passed to `stopwords.words()`.

    Returns:
    - frozenset of str: The stopwords for that language.
    """
    if language not in _STOP_WORDS_BY_LANGUAGE:
        _STOP_WORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_BY_LANGUAGE[language]

def preprocess_text(text):
    """
    Preprocesses text by tokenizing, converting to lowercase, and removing stopwords.

    Parameters:
    - text (str): The text to preprocess.

    Returns:
    - List of tokens after processing (filtered, lowercased, alphanumeric).
      Tokens containing any non-alphanumeric character (punctuation,
      hyphenated words, ...) are dropped by the `isalnum()` filter.
    """
    # The stopword set is the same for every call, so it is fetched from a
    # cache instead of being rebuilt for each document and each query.
    stop_words = _load_stop_words()
    return [
        token
        for token in word_tokenize(text.lower())  # Lowercase first, then tokenize.
        if token.isalnum() and token not in stop_words
    ]

def build_inverted_index(documents):
    """
    Builds an inverted index from a list of documents.

    Each document's ID is its position in `documents` (0-based).

    Parameters:
    - documents (list of str): List where each item is a document.

    Returns:
    - dict: Inverted index (a `defaultdict(list)`) mapping terms (tokens) to
      lists of document IDs where they appear. Each list is in ascending ID
      order and holds every ID at most once.
    """
    inverted_index = defaultdict(list)  # Missing terms start with an empty posting list.
    for doc_id, text in enumerate(documents):
        # dict.fromkeys() removes repeated tokens within this document while
        # keeping first-occurrence order, so every remaining token can be
        # appended directly without scanning the posting list for duplicates.
        for token in dict.fromkeys(preprocess_text(text)):
            inverted_index[token].append(doc_id)
    return inverted_index

def search_documents(query, inverted_index):
    """
    Searches for documents that contain all terms in the query.

    The query is preprocessed with `preprocess_text()` before matching.

    Parameters:
    - query (str): The search query containing terms to find.
    - inverted_index (dict): The inverted index mapping terms to document IDs.

    Returns:
    - list of int: Sorted list of document IDs that contain all query terms.
      Empty if the query has no usable terms after preprocessing, or if any
      query term does not occur in the index.
    """
    query_tokens = preprocess_text(query)  # Preprocess the search query.
    if not query_tokens:  # If no valid query tokens, return an empty list.
        return []

    posting_sets = []
    for token in query_tokens:
        # .get() never inserts a key, so a defaultdict index is left unchanged.
        postings = inverted_index.get(token)
        if not postings:
            # A term that appears in no document means no document can contain
            # *all* terms; skipping it would wrongly match on the other terms.
            return []
        posting_sets.append(set(postings))

    # Intersect the sets to find documents that contain all query terms.
    return sorted(set.intersection(*posting_sets))

# **Example Usage:**

# Small in-memory corpus; a document's ID is its position in this list.
documents = [
    "Artificial Intelligence is transforming industries.",
    "Machine learning allows computers to learn without programming.",
    "Artificial Intelligence can diagnose diseases.",
    "Machine learning and AI are revolutionizing technology.",
]

# Step 1: Index the corpus and show it as a plain dict (term -> document IDs).
inverted_index = build_inverted_index(documents)
print("Inverted Index:", {term: doc_ids for term, doc_ids in inverted_index.items()})

# Step 2: Look up the documents matching a sample query.
query = "Artificial Intelligence"
result = search_documents(query, inverted_index)
print(f"\nDocuments containing '{query}':", result)

# **Explanation:**
# - The inverted index is a dictionary where each token maps to a list of document IDs where it appears.
# - The `search_documents()` function finds documents containing all terms from the query by intersecting document ID sets.
# - This method is commonly used in search engines and information retrieval systems for efficient text search and indexing.


Inverted Index: {'artificial': [0, 2], 'intelligence': [0, 2], 'transforming': [0], 'industries': [0], 'machine': [1, 3], 'learning': [1, 3], 'allows': [1], 'computers': [1], 'learn': [1], 'without': [1], 'programming': [1], 'diagnose': [2], 'diseases': [2], 'ai': [3], 'revolutionizing': [3], 'technology': [3]}

Documents containing 'Artificial Intelligence': [0, 2]


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khata\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khata\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
