## 2. a) Create an **_Inverted Index_** for the given documents

In [6]:
class InvertedIndex:
    """Inverted index mapping each term to the documents that contain it.

    Terms are the lower-cased, whitespace-delimited tokens of each document.
    Every postings list is kept in ascending document-id order with no
    duplicates, which is what merge-style AND/OR intersection code expects.
    """

    def __init__(self):
        # term -> ascending list of ids of the documents containing the term
        self.index = {}

    def buildIndex(self, data_dir="Data", doc_ids=range(1, 5)):
        """Read ``doc<id>.txt`` for every id and add its terms to the index.

        Args:
            data_dir: Directory holding the document files. Defaults to
                ``"Data"``, the location the notebook originally used.
            doc_ids: Iterable of integer document ids to read. Defaults to
                documents 1 through 4.

        Raises:
            OSError: If one of the document files cannot be opened.
        """
        # Imported locally so the class stays self-contained in its cell.
        import os

        # Start from what is already indexed so repeated calls merge postings
        # instead of discarding earlier ones.
        postings = {term: set(ids) for term, ids in self.index.items()}

        for doc_id in doc_ids:
            # os.path.join avoids the "\d" invalid escape sequence and the
            # Windows-only separator of the literal "Data\doc{i}.txt".
            path = os.path.join(data_dir, f"doc{doc_id}.txt")
            with open(path, "r") as f:
                terms = f.read().lower().split()
            for term in terms:
                postings.setdefault(term, set()).add(doc_id)

        # sorted() rather than list(): set iteration order is not guaranteed,
        # and the query merges rely on ascending document ids.
        for term, ids in postings.items():
            self.index[term] = sorted(ids)

In [7]:
# Minimal LIFO stack built on top of a Python list
class Stack():
    """Last-in, first-out container.

    ``pop`` and ``peek`` return ``None`` instead of raising when the stack
    holds no items.
    """

    def __init__(self):
        # The end of the list is the top of the stack.
        self._stack = []

    def size(self):
        """Return the number of stored items."""
        return len(self._stack)

    def isEmpty(self):
        """Return True when the stack holds no items."""
        return len(self._stack) == 0

    def push(self, item):
        """Place ``item`` on top of the stack."""
        self._stack.append(item)

    def peek(self):
        """Return the top item without removing it, or None if empty."""
        return None if self.isEmpty() else self._stack[-1]

    def pop(self):
        """Remove and return the top item, or return None if empty."""
        return None if self.isEmpty() else self._stack.pop()

    def __str__(self):
        """Render the items bottom-to-top, each followed by one space."""
        return "".join(f"{el} " for el in self._stack)

In [8]:
# infix to postfix converter
class InfixToPostfix:
    """Convert an infix boolean query into postfix order.

    Recognised operators are ``and``, ``or``, ``not`` and parentheses, with
    precedence ``not`` > ``and`` > ``or``. Tokens are separated by
    whitespace. Consecutive operand words are merged into one hyphenated
    token (``"new york"`` becomes ``"new-york"``).
    """

    def __init__(self, infix):
        self.infix = infix
        self.postfix = []
        self.stack = Stack()
        self.precedence = {"(": 0, "or": 1, "and": 2, "not": 3}
        self.operators = ["and", "or", "not", "(", ")"]

    def _merge_operand_runs(self, words):
        """Join each run of adjacent non-operator words with hyphens."""
        tokens = []
        run = []
        for word in words:
            if word in self.operators:
                if run:
                    tokens.append("-".join(run))
                    run = []
                tokens.append(word)
            else:
                run.append(word)
        if run:
            tokens.append("-".join(run))
        return tokens

    def convert(self):
        """Return the query as a list of tokens in postfix order.

        ``self.infix`` is rewritten to the normalised, space-joined token
        string and ``self.postfix`` is rebuilt from scratch on every call.

        Returns:
            list[str]: operands and operators in postfix order.

        Raises:
            ValueError: If a ``)`` has no matching ``(``.
        """
        # split() with no argument collapses repeated whitespace, so stray
        # spaces no longer produce empty or dangling-hyphen tokens.
        words = self.infix.split()

        # Backward compatibility: an empty query has always produced a single
        # empty-string operand, which evaluators resolve to "no documents".
        tokens = self._merge_operand_runs(words) or [""]
        self.infix = " ".join(tokens)

        # Start fresh so calling convert() twice does not duplicate output.
        self.postfix = []

        for token in tokens:
            if token not in self.operators:
                self.postfix.append(token)
            elif token == "(":
                self.stack.push(token)
            elif token == ")":
                while not self.stack.isEmpty() and self.stack.peek() != "(":
                    self.postfix.append(self.stack.pop())
                if self.stack.isEmpty():
                    # Previously this looped forever appending None.
                    raise ValueError("Unbalanced parentheses: ')' without matching '('")
                self.stack.pop()  # discard the matching "("
            elif token == "not":
                # Prefix unary operator: it binds to the operand that follows,
                # so nothing already on the stack may be emitted before it
                # (otherwise "not not a" would output "not" with no operand).
                self.stack.push(token)
            else:
                while (not self.stack.isEmpty()
                       and self.precedence[self.stack.peek()] >= self.precedence[token]):
                    self.postfix.append(self.stack.pop())
                self.stack.push(token)

        while not self.stack.isEmpty():
            pending = self.stack.pop()
            # An unclosed "(" is treated as closed at the end of the query
            # rather than being emitted as if it were an operand.
            if pending != "(":
                self.postfix.append(pending)
        return self.postfix

## 2. b) Process boolean queries

In [9]:
class Query:
    """Evaluate a boolean query (``and`` / ``or`` / ``not`` with parentheses).

    The query is case-folded and tokenised, converted to postfix order with
    ``InfixToPostfix`` and evaluated against an ``InvertedIndex`` that is
    built when the object is constructed. Postings lists are ascending lists
    of document ids.
    """

    # Ids of every document in the collection; ``not`` complements against
    # these. Must match the documents read by InvertedIndex.buildIndex().
    DOC_IDS = tuple(range(1, 5))

    def __init__(self, query):
        """Normalise ``query`` and build the inverted index.

        Args:
            query: Infix boolean query string, e.g.
                ``"(established and many) or not lorem"``.
        """
        self.query = self._normalize(query)
        self.inv = InvertedIndex()
        self.inv.buildIndex()

    @staticmethod
    def _normalize(query):
        """Lower-case the query and isolate parentheses as separate tokens."""
        # The index stores lower-cased terms, so the query must be folded the
        # same way. Padding both sides of each parenthesis handles input such
        # as "a and(b or c)"; split/join then collapses the extra spaces.
        spaced = query.lower().replace('(', ' ( ').replace(')', ' ) ')
        return ' '.join(spaced.split())

    def and_operator(self, postings1, postings2):
        """Return the ids present in both ascending postings lists."""
        p1 = 0
        p2 = 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                p1 += 1
            else:
                p2 += 1
        return result

    def or_operator(self, postings1, postings2):
        """Return the ascending union of two ascending postings lists."""
        p1 = 0
        p2 = 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
            else:
                result.append(postings2[p2])
                p2 += 1

        # At most one of the two lists still has unread ids.
        result.extend(postings1[p1:])
        result.extend(postings2[p2:])
        return result

    def not_operator(self, postings):
        """Return, in ascending order, the document ids not in ``postings``."""
        excluded = set(postings)
        # Sorted so the result can be fed straight into the ordered merges.
        return [doc_id for doc_id in sorted(self.DOC_IDS) if doc_id not in excluded]

    # Original (misspelled) name kept so existing callers keep working.
    not_opreator = not_operator

    def solve(self, postings, operator):
        """Apply one boolean operator.

        Args:
            postings: For ``'and'`` / ``'or'``, a two-element sequence of
                postings lists; for any other operator, a single postings
                list that is complemented.
            operator: ``'and'``, ``'or'`` or ``'not'``.

        Returns:
            list: The resulting postings list.
        """
        if operator == 'and':
            return self.and_operator(postings[0], postings[1])
        elif operator == 'or':
            return self.or_operator(postings[0], postings[1])
        else:
            return self.not_operator(postings)

    def process_query(self):
        """Evaluate the stored query and return the matching document ids.

        Returns:
            list: Ascending ids of matching documents; ``[]`` when the query
            contains no tokens or nothing matches.
        """
        postfix = InfixToPostfix(self.query).convert()  # convert query to postfix
        operands = []

        # Solve the postfix boolean expression with an operand stack.
        for token in postfix:
            if token == 'not':
                operands.append(self.solve(operands.pop(), 'not'))
            elif token in ('and', 'or'):
                right = operands.pop()
                left = operands.pop()
                operands.append(self.solve([left, right], token))
            else:
                # Unknown terms match no document. sorted() guarantees the
                # ascending order the merges need and avoids handing out the
                # index's own list object.
                operands.append(sorted(self.inv.index.get(token, [])))
        return operands[0] if operands else []

In [13]:
# Example: documents containing both "established" and "many",
# or documents that do not contain "lorem".
query = "(established and many) or not lorem"
q = Query(query)  # constructing the Query also builds the inverted index from the document files
ans = q.process_query()  # evaluate the boolean query; yields the matching document ids
print(ans)

[1, 4]
