In [10]:
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/dj-sha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
class InvertedIndex:
    def __init__(self):
        self.index = defaultdict(list)
        self.stemmer = PorterStemmer()
    
    def add_document(self, doc_id, text):
        # Tokenize and normalize to lowercase
        words = word_tokenize(text.lower())
        # Apply stemming
        stemmed_words = [self.stemmer.stem(word) for word in words if word.isalpha()]
        unique_words = set(stemmed_words)  # Use unique words to avoid duplicate entries in index
        for word in unique_words:
            self.index[word].append(doc_id)
    
    def search(self, query):
        query_words = word_tokenize(query.lower())
        stemmed_query = [self.stemmer.stem(word) for word in query_words if word.isalpha()]
        result_sets = []
        
        for word in stemmed_query:
            if word in self.index:
                result_sets.append(set(self.index[word]))
            else:
                result_sets.append(set())
        
        if not result_sets:
            return set()
        
        # Return the intersection of results for all query words
        return set.intersection(*result_sets)
    
    def print_index(self):
        print("Inverted Index (Stemmed Words -> Document IDs):")
        for word, doc_ids in self.index.items():
            print(f"{word}: {doc_ids}")


In [16]:
def main():
    documents = {
    1: "The quick brown fox jumps over the lazy dog",
    2: "Never jump over the lazy dog quickly",
    3: "The quick brown fox is quick and very quick",
    4: "A fast brown dog outpaces a quick fox",
    5: "Lazy dogs are not a match for quick foxes",
    6: "Foxes are generally quicker than dogs",
    7: "In the race between a fox and a dog, the fox wins quickly"
}

    
    inverted_index = InvertedIndex()
    
    # Build the inverted index
    for doc_id, text in documents.items():
        inverted_index.add_document(doc_id, text)
    
    # Query the inverted index
    query = "fox dog"
    result = inverted_index.search(query)
    
    if result:
        print(f"Documents matching '{query}': {result}")
    else:
        print(f"No documents found for query: {query}")  

if __name__ == "__main__":
    main()

Documents matching 'fox dog': {1, 4, 5, 6, 7}
Inverted Index (Stemmed Words -> Document IDs):
dog: [1, 2, 4, 5, 6, 7]
fox: [1, 3, 4, 5, 6, 7]
jump: [1, 2]
quick: [1, 3, 4, 5]
over: [1, 2]
lazi: [1, 2, 5]
brown: [1, 3, 4]
the: [1, 2, 3, 7]
never: [2]
quickli: [2, 7]
is: [3]
and: [3, 7]
veri: [3]
a: [4, 5, 7]
fast: [4]
outpac: [4]
match: [5]
not: [5]
for: [5]
are: [5, 6]
quicker: [6]
than: [6]
gener: [6]
win: [7]
between: [7]
in: [7]
race: [7]
