In [12]:
from collections import defaultdict
import os
import re

class BSBI:
    """In-memory inverted index over a document collection with Boolean search.

    The index maps each lowercased word (``\\w+`` token) to the list of
    document ids whose text contains it. Despite the class name, the index is
    built in a single in-memory pass over ``documents``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, documents):
        """Store the collection; call ``build_index()`` before searching.

        Args:
            documents: Mapping of document id -> document text.
        """
        # Mapping of document id -> raw text, kept by reference.
        self.documents = documents
        # term -> list of document ids containing that term.
        self.index = defaultdict(list)
        # Snapshot of the ids taken at construction time; this is the
        # "universe" that NOT is evaluated against.
        self.doc_ids = list(documents.keys())

    def build_index(self):
        """(Re)build the inverted index from ``self.documents``.

        The index is rebuilt from scratch on every call, so running this more
        than once (e.g. re-executing a notebook cell) does not duplicate
        postings.
        """
        fresh_index = defaultdict(list)
        for doc_id, text in self.documents.items():
            # set(): each document appears at most once per term.
            for term in set(re.findall(r'\b\w+\b', text.lower())):
                fresh_index[term].append(doc_id)
        self.index = fresh_index

    def boolean_search(self, query):
        """Evaluate a Boolean query and return the matching document ids.

        The query is lowercased and split into word tokens. ``and``, ``or``
        and ``not`` are reserved operator words (so they cannot be searched
        for as terms); every other token is a search term. Evaluation is
        strictly left to right with no operator precedence and no
        parentheses:

        * adjacent terms with no operator between them are ANDed;
        * ``not`` negates the next term and may follow ``and`` / ``or``
          (``a and not b``, ``a or not b``) or start the query (``not a``);
        * operators with no term after them (a dangling ``and`` / ``or`` /
          ``not`` at the end of the query) are ignored;
        * an empty query matches every document.

        Args:
            query: Query string, e.g. ``"cranfield and not retrieval"``.

        Returns:
            A ``set`` of document ids.
        """
        tokens = re.findall(r'\b\w+\b', query.lower())
        all_docs = set(self.doc_ids)
        # Start from the full universe so the first term behaves like
        # "ALL and term" and a leading "not term" yields the complement.
        result = set(all_docs)

        operator = 'and'   # operator to apply to the next term
        negate = False     # whether the next term is negated

        for token in tokens:
            if token in ('and', 'or'):
                operator = token
                continue
            if token == 'not':
                # Toggle so that "not not x" cancels out.
                negate = not negate
                continue

            postings = set(self.index.get(token, ()))
            if operator == 'or':
                result |= (all_docs - postings) if negate else postings
            elif negate:
                result -= postings
            else:
                result &= postings

            # Operator and negation apply to a single term only.
            operator = 'and'
            negate = False

        return result

def read_files_in_folder(folder_path, encoding="utf-8", errors="replace"):
    """Load every ``.txt`` file directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).
        encoding: Text encoding used to decode each file. Set explicitly so
            the result does not depend on the platform's default encoding.
        errors: Decoding error policy passed to ``open``. The default
            ``"replace"`` substitutes U+FFFD for undecodable bytes instead of
            aborting the whole load on one bad byte.

    Returns:
        dict mapping filename (not the full path) -> file contents, inserted
        in sorted filename order.

    Raises:
        OSError: If ``folder_path`` cannot be listed or a file cannot be read.
    """
    documents = {}
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(folder_path, filename)
        # Skip directories (or other non-files) that happen to end in ".txt".
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding=encoding, errors=errors) as file:
            documents[filename] = file.read()
    return documents

# Directory (relative to the notebook's working directory) holding the .txt documents
folder_path = "sample dataset 20 newsgroup"

# Load the documents as a {filename: text} mapping
documents = read_files_in_folder(folder_path)

# Wrap the collection in a BSBI object, then populate its inverted index
bsbi = BSBI(documents)
bsbi.build_index()



In [13]:
# Sample Boolean queries covering AND, AND NOT, OR and a leading NOT
query1 = "information and retrieval"
query2 = "cranfield and not retrieval"
query3 = "boolean or dataset"
query4 = "not Marc"

# Run the queries, in order, against the index built above
result1, result2, result3, result4 = map(
    bsbi.boolean_search, (query1, query2, query3, query4)
)

# Show each query with its matching documents, separated by a blank line
print(
    *(
        f"Query: {query}\nResult: {result}"
        for query, result in (
            (query1, result1),
            (query2, result2),
            (query3, result3),
            (query4, result4),
        )
    ),
    sep="\n\n",
)


Query: information and retrieval
Result: {'sci.crypt.txt', 'comp.sys.ibm.pc.hardware.txt', 'sci.space.txt', 'misc.forsale.txt', 'comp.windows.x.txt', 'sci.med.txt', 'comp.os.ms-windows.misc.txt'}

Query: cranfield and not retrieval
Result: set()

Query: boolean or dataset
Result: {'sci.crypt.txt', 'sci.electronics.txt', 'comp.windows.x.txt', 'sci.med.txt', 'comp.os.ms-windows.misc.txt', 'comp.graphics.txt'}

Query: not Marc
Result: {'talk.religion.misc.txt', 'rec.autos.txt', 'talk.politics.guns.txt', 'alt.atheism.txt'}
