In [1]:
# Import necessary modules
from nltk.tokenize import word_tokenize
import nltk

# Download resources for nltk (only needed once)
nltk.download('punkt')

# Define the documents
document1 = "The quick brown fox jumped over the lazy dog ."
document2 = "The lazy dog slept in the sun ."

# Step 1: Tokenize the documents
# Convert each document to lowercase and split it into words
tokens1 = word_tokenize(document1.lower())
tokens2 = word_tokenize(document2.lower())
#tokens1 = document1.lower().split()
#tokens2 = document2.lower().split()

# Remove only full stops (".") from tokens, but keep all other words intact
#tokens1 = [token for token in tokens1 if token != "."]
#tokens2 = [token for token in tokens2 if token != "."]

# Combine the tokens into a list of unique terms
terms = list(set(tokens1 + tokens2))

# Step 2: Build the inverted index
# Create an empty dictionary to store the inverted index
inverted_index = {}

# For each term, find the documents that contain it
for term in terms:
    documents = []
    if term in tokens1:
        documents.append("Document 1")
    if term in tokens2:
        documents.append("Document 2")
    inverted_index[term] = documents

# Step 3: Print the inverted index
print("Inverted Index:")
for term, documents in inverted_index.items():
    print(term, "->", ", ".join(documents))

# Step 4: Search Query
query = input("\nEnter your search query: ").lower()  # Get the search query from the user
query_terms = word_tokenize(query)  # Tokenize query into individual terms
#query_terms = query.split()  (# Split query into individual terms)

# Find the documents for the query
result_docs = set()  # To store the matching documents

# Iterate over the query terms and retrieve documents
for term in query_terms:
    if term in inverted_index:
        result_docs.update(inverted_index[term])  # Add documents that contain the query term

# Step 5: Display the results
if result_docs:
    print("\nDocuments matching the query:")
    for doc in result_docs:
        print(doc)
else:
    print("\nNo documents found for the query.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Inverted Index:
slept -> Document 2
over -> Document 1
lazy -> Document 1, Document 2
quick -> Document 1
the -> Document 1, Document 2
in -> Document 2
jumped -> Document 1
dog -> Document 1, Document 2
fox -> Document 1
sun -> Document 2
. -> Document 1, Document 2
brown -> Document 1

Enter your search query: lazy dog

Documents matching the query:
Document 1
Document 2
