In [7]:
import numpy as np
import pandas as pd

class BooleanRetrieval:
    """Small in-memory Boolean retrieval engine.

    Text is tokenised by lower-casing and splitting on whitespace. The engine
    keeps an inverted index (term -> set of document ids) and can optionally
    build a binary document-term incidence matrix for display.
    """

    def __init__(self):
        # Inverted index: term -> set of doc_ids whose text contains the term.
        self.index = {}
        # Binary (documents x terms) matrix; None until create_documents_matrix().
        self.documents_matrix = None
        # Row labels of documents_matrix, in row order.
        self.doc_ids = []
        # Column labels captured when the matrix was built, so the printed table
        # stays aligned even if more documents are indexed afterwards.
        self._matrix_terms = None
        # Every doc_id passed to index_document(); used as the universe for NOT,
        # so searching works without building the matrix first.
        self._indexed_doc_ids = set()

    def index_document(self, doc_id, text):
        """Add one document to the inverted index.

        Args:
            doc_id: Hashable identifier of the document.
            text: Raw document text; lower-cased and split on whitespace.

        Side effects:
            Prints the document id and its token list.
        """
        terms = text.lower().split()
        print("Document -", doc_id, terms)
        self._indexed_doc_ids.add(doc_id)
        for term in terms:
            self.index.setdefault(term, set()).add(doc_id)

    def create_documents_matrix(self, documents):
        """Build the binary document-term matrix for ``documents``.

        Rows follow the iteration order of ``documents``; columns follow the
        insertion order of the terms currently in the index. Tokens that are
        not in the index are ignored.

        Args:
            documents: Mapping of doc_id -> raw text.
        """
        self.doc_ids = list(documents.keys())
        terms = list(self.index.keys())
        # One dict lookup per token instead of a linear list.index() scan.
        column_of = {term: col for col, term in enumerate(terms)}
        self._matrix_terms = terms

        self.documents_matrix = np.zeros((len(self.doc_ids), len(terms)), dtype=int)

        for row, text in enumerate(documents.values()):
            # set(): a repeated token only needs to be marked once.
            for term in set(text.lower().split()):
                col = column_of.get(term)
                if col is not None:
                    self.documents_matrix[row, col] = 1

    def print_documents_matrix_table(self):
        """Print the document-term matrix as a labelled pandas DataFrame."""
        # Prefer the column labels captured at build time; fall back to the
        # live index when no matrix has been built yet.
        if self._matrix_terms is not None:
            columns = self._matrix_terms
        else:
            columns = list(self.index.keys())
        df = pd.DataFrame(self.documents_matrix, columns=columns, index=self.doc_ids)
        print(df)

    def print_all_terms(self):
        """Print every indexed term in insertion order."""
        print("All terms in the documents:")
        print(list(self.index.keys()))

    def boolean_search(self, query):
        """Evaluate a flat Boolean query and return the matching doc ids.

        The query is lower-cased, split on whitespace and evaluated strictly
        left to right (no precedence, no parentheses). ``and`` / ``or`` select
        how the next term is combined with the running result (``and`` when
        omitted); ``not`` negates the next term relative to the set of all
        known documents, so ``a or not b`` means ``a | (all - b)``. The words
        ``and``, ``or`` and ``not`` are always treated as operators and cannot
        be searched for as terms.

        Args:
            query: Query string, e.g. ``"information and not deals"``.

        Returns:
            set: Matching document ids. A query with no terms returns all
            known documents; unknown terms match nothing.
        """
        # All known documents: everything indexed plus the matrix row labels.
        universe = self._indexed_doc_ids | set(self.doc_ids)
        result_set = set(universe)  # Start with all documents
        connective = "and"
        negate = False

        for token in query.lower().split():
            if token == "not":
                negate = not negate
            elif token in ("and", "or"):
                connective = token
            else:
                # Documents containing the term (complemented when negated).
                operand = self.index.get(token, set())
                if negate:
                    operand = universe - operand
                if connective == "or":
                    result_set = result_set | operand
                else:
                    result_set = result_set & operand
                # Operators apply to a single term only.
                connective = "and"
                negate = False

        return result_set

In [16]:
# Demo driver: index three sample documents, show the incidence matrix and
# the vocabulary, then answer one Boolean query read from standard input.
if __name__ == "__main__":
    documents = {
        1: "Python is a programming language",
        2: "Information retrieval deals with finding information",
        3: "Boolean models are used in information retrieval",
    }

    # Build the inverted index one document at a time.
    indexer = BooleanRetrieval()
    for doc_id, text in documents.items():
        indexer.index_document(doc_id, text)

    # Display the document-term matrix followed by the vocabulary.
    indexer.create_documents_matrix(documents)
    indexer.print_documents_matrix_table()
    indexer.print_all_terms()

    # Run a single interactive query; an empty result set gets its own message.
    query = input("Enter your boolean query: ")
    results = indexer.boolean_search(query)
    print(
        f"Results for '{query}': {results}"
        if results
        else "No results found for the query."
    )

Document - 1 ['python', 'is', 'a', 'programming', 'language']
Document - 2 ['information', 'retrieval', 'deals', 'with', 'finding', 'information']
Document - 3 ['boolean', 'models', 'are', 'used', 'in', 'information', 'retrieval']
   python  is  a  programming  language  information  retrieval  deals  with  \
1       1   1  1            1         1            0          0      0     0   
2       0   0  0            0         0            1          1      1     1   
3       0   0  0            0         0            1          1      0     0   

   finding  boolean  models  are  used  in  
1        0        0       0    0     0   0  
2        1        0       0    0     0   0  
3        0        1       1    1     1   1  
All terms in the documents:
['python', 'is', 'a', 'programming', 'language', 'information', 'retrieval', 'deals', 'with', 'finding', 'boolean', 'models', 'are', 'used', 'in']
Enter your boolean query: not deals
Results for 'not deals': {1, 3}
