# Implement a program for retrieval of documents using inverted files.

# In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Download the NLTK resources this script depends on
nltk.download('punkt')  # Punkt tokenizer models used by word_tokenize on older NLTK releases
# Newer NLTK releases (3.9 and later) make word_tokenize load the 'punkt_tab'
# resource instead of the pickled 'punkt' models; without it the first call to
# word_tokenize raises LookupError on those versions.
nltk.download('punkt_tab')
nltk.download('stopwords')  # Stop-word lists, including the English list used below

# nltk.download('punkt'): downloads a pre-trained tokenizer model that splits text
# into sentences and words, which makes the text easier to process.

# nltk.download('stopwords'): downloads lists of common, low-value words
# (e.g. "the", "is") that are excluded from analysis to focus on more meaningful content.

def preprocess_text(text):
    """
    Turn raw text into a list of normalized index terms.

    The text is lowercased and split with NLTK's ``word_tokenize``. Tokens that
    are not purely alphanumeric (punctuation, for instance) and tokens found in
    NLTK's English stop-word list are discarded.

    Parameters:
    - text (str): The text to preprocess.

    Returns:
    - list of str: The surviving tokens, in their original order.
    """
    lowered = text.lower()
    words = word_tokenize(lowered)
    english_stops = set(stopwords.words("english"))

    kept = []
    for word in words:
        # Skip anything containing non-alphanumeric characters
        if not word.isalnum():
            continue
        # Skip stop words
        if word in english_stops:
            continue
        kept.append(word)
    return kept

def build_inverted_index(documents):
    """
    Builds an inverted index from a list of documents.

    A document's ID is its position in ``documents`` (starting at 0). Each
    document is run through ``preprocess_text`` and every distinct resulting
    term is mapped to the IDs of the documents it occurs in.

    Parameters:
    - documents (list of str): List where each item is a document.

    Returns:
    - dict: Inverted index mapping terms to lists of document IDs. Each list
      is in ascending order and contains no duplicates; terms appear in the
      dict in the order they were first encountered.
    """
    inverted_index = {}
    for doc_id, text in enumerate(documents):
        # dict.fromkeys de-duplicates the document's tokens while keeping their
        # first-seen order, so each doc_id is appended at most once per term
        # without a linear "doc_id in posting_list" scan for every token.
        for token in dict.fromkeys(preprocess_text(text)):
            inverted_index.setdefault(token, []).append(doc_id)
    return inverted_index

def search_documents(query, inverted_index):
    """
    Searches for documents that contain all terms in the query.

    The query is normalized with ``preprocess_text`` (the same preprocessing
    applied when the index was built), then the posting lists of all remaining
    terms are intersected.

    Parameters:
    - query (str): The search query containing terms to find.
    - inverted_index (dict): The inverted index mapping terms to lists of
      document IDs.

    Returns:
    - list of int: Sorted document IDs that contain all query terms. Empty if
      the query has no usable terms after preprocessing, or if any query term
      does not occur in the index.
    """
    query_tokens = preprocess_text(query)
    if not query_tokens:
        return []

    result_docs = None
    for token in query_tokens:
        postings = inverted_index.get(token)
        # A term that is absent from the index cannot be contained in any
        # document, so an "all terms" query has no matches. (Previously such
        # terms were silently dropped from the query.)
        if not postings:
            return []

        if result_docs is None:
            result_docs = set(postings)
        else:
            result_docs.intersection_update(postings)

        # Once the running intersection is empty it can never grow again
        if not result_docs:
            return []

    return sorted(result_docs)

# Sample corpus: a document's ID is its position in this list (starting at 0)
documents = [
    "Artificial Intelligence is transforming industries.",
    "Machine learning allows computers to learn without programming.",
    "Artificial Intelligence can diagnose diseases.",
    "Machine learning and AI are revolutionizing technology.",
]

# Step 1: Index the corpus and show the term -> document-ID mapping
inverted_index = build_inverted_index(documents)
print(f"Inverted Index: {inverted_index}")

# Step 2: Look up the documents matching a sample query
query = "Artificial Intelligence"
result = search_documents(query, inverted_index)
print(f"\nDocuments containing '{query}': {result}")


# # Example 

# Documents:
# "the cat sat"
# "the dog sat"
# "the dog barked"
# "the cat meowed"



# Step 1: Tokenize the Documents

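# First, we tokenize each document into words (ignoring punctuation for simplicity).
# Note: this walkthrough numbers documents from 1 and keeps "the" to stay simple; the code
# above numbers documents from 0 and removes stop words such as "the".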

# Document 1: ["the", "cat", "sat"]
# Document 2: ["the", "dog", "sat"]
# Document 3: ["the", "dog", "barked"]
# Document 4: ["the", "cat", "meowed"]




# Step 2: Build the Inverted Index

# We now create an inverted index where each word (or term) in the documents is mapped to a list of document
#  IDs where the word appears.

# "the" appears in all documents (1, 2, 3, 4).
# "cat" appears in documents 1 and 4.
# "sat" appears in documents 1 and 2.
# "dog" appears in documents 2 and 3.
# "barked" appears in document 3.
# "meowed" appears in document 4.


# {
#     "the": [1, 2, 3, 4],
#     "cat": [1, 4],
#     "sat": [1, 2],
#     "dog": [2, 3],
#     "barked": [3],
#     "meowed": [4]
# }





# Step 3: Searching with the Inverted Index

# When you perform a search query (for example, the term "dog"), 
# the inverted index allows you to quickly find that 
# "dog" appears in documents 2 and 3 by looking up the term "dog" in the index:

# Query: "dog"
# Inverted Index Lookup: "dog": [2, 3]
# This tells you that "dog" appears in documents 2 and 3. 


# Output:
# Inverted Index: {'artificial': [0, 2], 'intelligence': [0, 2], 'transforming': [0], 'industries': [0], 'machine': [1, 3], 'learning': [1, 3], 'allows': [1], 'computers': [1], 'learn': [1], 'without': [1], 'programming': [1], 'diagnose': [2], 'diseases': [2], 'ai': [3], 'revolutionizing': [3], 'technology': [3]}
#
# Documents containing 'Artificial Intelligence': [0, 2]
#
#
# [nltk_data] Downloading package punkt to
# [nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
# [nltk_data]   Package punkt is already up-to-date!
# [nltk_data] Downloading package stopwords to
# [nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
# [nltk_data]   Package stopwords is already up-to-date!
