## 2. Build an inverted index and process Boolean queries

In [2]:
import os

In [1]:
class InvertedIndex:
    """In-memory inverted index over a small collection of text files.

    Both attributes are filled in by ``buildIndex``:

    * ``index`` maps each lower-cased, whitespace-delimited term to the
      ascending list of document IDs whose text contains that term.
    * ``docs`` holds the lower-cased text of every indexed document; the
      document with ID ``k`` is stored at ``docs[k - 1]``.
    """

    def __init__(self):
        self.index = {}
        self.docs = []

    def buildIndex(self, data_dir='Cranfield Data Set', num_docs=20):
        """Read up to ``num_docs`` files from ``data_dir`` and index them.

        Files are taken in sorted name order and numbered 1, 2, ... so that
        document IDs are reproducible from run to run (``os.listdir`` itself
        returns entries in arbitrary order). Sub-directories are ignored.
        Any previously built index is discarded first, which makes the
        method safe to call more than once.

        Args:
            data_dir: Directory containing one plain-text file per document.
            num_docs: Maximum number of documents to index. If the directory
                holds fewer regular files, all of them are indexed.

        Raises:
            OSError: If ``data_dir`` or one of its files cannot be read.
        """
        self.index = {}
        self.docs = []

        names = sorted(
            name for name in os.listdir(data_dir)
            if os.path.isfile(os.path.join(data_dir, name))
        )
        # enumerate(..., start=1) keeps the 1-based IDs the query code expects
        # while still reading the *first* file (the previous ``files[i]`` for
        # i in 1..20 never touched ``files[0]``).
        for doc_id, name in enumerate(names[:num_docs], start=1):
            # os.path.join instead of a hand-written backslash: portable, and
            # avoids the invalid "\{" escape sequence inside the f-string.
            with open(os.path.join(data_dir, name), "r") as f:
                text = f.read().lower()
            self.docs.append(text)

            # dict.fromkeys de-duplicates the terms of this document while
            # preserving first-seen order, so each ID is appended only once
            # and no linear "already in postings?" scan is needed.
            for term in dict.fromkeys(text.split()):
                self.index.setdefault(term, []).append(doc_id)

In [3]:
# defines the operations of stack data structure
class Stack():
    """Minimal last-in/first-out stack backed by a Python list."""

    def __init__(self):
        self._stack = []

    def push(self, item):
        """Place ``item`` on top of the stack."""
        self._stack.append(item)

    def isEmpty(self):
        """Return True when the stack holds no items."""
        return len(self._stack) == 0

    def pop(self):
        """Remove and return the top item, or None if the stack is empty."""
        return None if self.isEmpty() else self._stack.pop()

    def peek(self):
        """Return the top item without removing it, or None if empty."""
        return None if self.isEmpty() else self._stack[-1]

    def size(self):
        """Return the number of items currently stored."""
        return len(self._stack)

    def __str__(self):
        # Bottom-to-top listing; every element is followed by one space.
        return "".join(f"{item} " for item in self._stack)

In [4]:
# infix to postfix converter
class InfixToPostfix:
    """Convert an infix Boolean query into postfix (reverse Polish) order.

    The recognised operator tokens are the lower-case words ``and``, ``or``
    and ``not`` plus ``(`` and ``)``; each must be its own
    whitespace-separated token. Runs of consecutive non-operator words are
    merged into one hyphen-joined operand (``boundary layer`` becomes
    ``boundary-layer``).
    """

    def __init__(self, infix):
        self.infix = infix
        self.postfix = []
        self.stack = Stack()
        self.precedence = {"(": 0, "or": 1, "and": 2, "not": 3}
        self.operators = ["and", "or", "not", "(", ")"]

    def _merge_operands(self, words):
        """Join each run of adjacent non-operator words with hyphens."""
        merged = []
        run = []
        for word in words:
            if word in self.operators:
                if run:
                    merged.append("-".join(run))
                    run = []
                merged.append(word)
            else:
                run.append(word)
        if run:
            merged.append("-".join(run))
        return merged

    def convert(self):
        """Return the query as a list of postfix tokens.

        ``self.infix`` is rewritten to the normalised, hyphen-joined form of
        the query and ``self.postfix`` holds the returned list. An empty or
        whitespace-only query yields an empty list.

        Raises:
            ValueError: If the parentheses in the query are unbalanced.
        """
        # Start clean so that calling convert() twice does not append a
        # second copy of the output or reuse operators left on the stack.
        self.postfix = []
        self.stack = Stack()

        # split() with no argument collapses repeated whitespace; splitting
        # on a single space produced empty tokens that turned into stray
        # hyphens in the operands.
        tokens = self._merge_operands(self.infix.split())
        self.infix = " ".join(tokens)

        for token in tokens:
            if token not in self.operators:
                self.postfix.append(token)
            elif token == "(":
                self.stack.push(token)
            elif token == ")":
                while not self.stack.isEmpty() and self.stack.peek() != "(":
                    self.postfix.append(self.stack.pop())
                # Without this check an unmatched ")" looped forever, since
                # peek()/pop() on an empty stack just return None.
                if self.stack.isEmpty():
                    raise ValueError("unbalanced parentheses: unexpected ')'")
                self.stack.pop()  # discard the matching "("
            else:
                current = self.precedence[token]
                # "not" is a prefix operator, so it must not pop an earlier
                # "not" of equal precedence ("not not a" -> a not not).
                right_assoc = token == "not"
                while not self.stack.isEmpty():
                    top = self.precedence[self.stack.peek()]
                    if top > current or (top == current and not right_assoc):
                        self.postfix.append(self.stack.pop())
                    else:
                        break
                self.stack.push(token)

        while not self.stack.isEmpty():
            leftover = self.stack.pop()
            if leftover == "(":
                raise ValueError("unbalanced parentheses: missing ')'")
            self.postfix.append(leftover)
        return self.postfix

In [15]:
class Query:
    """Evaluate a Boolean query (and / or / not, parentheses) on the index.

    Creating a ``Query`` builds a fresh ``InvertedIndex``; ``process_query``
    then converts the query to postfix form and evaluates it over the
    posting lists.
    """

    def __init__(self, query):
        # The index stores lower-cased terms, so the query is lower-cased
        # too. Parentheses are padded on both sides so that "or(" or ")and"
        # still split into separate tokens, and whitespace is collapsed to
        # single spaces.
        padded = query.lower().replace('(', ' ( ').replace(')', ' ) ')
        self.query = ' '.join(padded.split())
        self.inv = InvertedIndex()
        self.inv.buildIndex()

    def and_operator(self, postings1, postings2):
        """Return the IDs present in both ascending posting lists."""
        p1 = 0
        p2 = 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                p1 += 1
            else:
                p2 += 1
        return result

    def or_operator(self, postings1, postings2):
        """Return the ascending union of two ascending posting lists."""
        p1 = 0
        p2 = 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
            else:
                result.append(postings2[p2])
                p2 += 1

        # At most one of the two lists still has unread entries.
        result.extend(postings1[p1:])
        result.extend(postings2[p2:])
        return result

    def not_opreator(self, postings):
        """Return the IDs of all indexed documents absent from ``postings``.

        The universe is every ID from 1 to the number of indexed documents.
        The result is sorted because the merge-based ``and_operator`` and
        ``or_operator`` need ascending input, and set iteration order is
        not guaranteed.
        """
        universe = range(1, len(self.inv.docs) + 1)
        return sorted(set(universe) - set(postings))

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers continue to work.
    not_operator = not_opreator

    def solve(self, postings, operator):
        """Apply one Boolean operator.

        For ``'and'`` / ``'or'``, ``postings`` is a pair of posting lists;
        for any other operator it is a single posting list that is negated.
        """
        if operator == 'and':
            return self.and_operator(postings[0], postings[1])
        elif operator == 'or':
            return self.or_operator(postings[0], postings[1])
        else:
            return self.not_opreator(postings)

    def process_query(self):
        """Evaluate the query, print each matching document, return the IDs.

        Returns:
            The ascending list of matching document IDs (empty when nothing
            matches or the query has no terms).

        Raises:
            ValueError: If an operator lacks operands or operands are left
                over once evaluation finishes.
        """
        postfix = InfixToPostfix(self.query).convert()
        boolean_operators = ('and', 'or', 'not')
        operands = []

        # Standard postfix evaluation: terms push their posting list,
        # operators pop their arguments and push the combined list.
        for token in postfix:
            if token not in boolean_operators:
                operands.append(self.inv.index.get(token, []))
            elif token == 'not':
                if not operands:
                    raise ValueError("malformed query: 'not' has no operand")
                operands.append(self.solve(operands.pop(), 'not'))
            else:
                if len(operands) < 2:
                    raise ValueError(
                        f"malformed query: '{token}' needs two operands")
                right = operands.pop()
                left = operands.pop()
                operands.append(self.solve([left, right], token))

        if len(operands) > 1:
            raise ValueError("malformed query: missing operator between terms")
        # Copy so callers cannot mutate a posting list held by the index.
        matches = list(operands[0]) if operands else []

        print("Documents retrieved: ")
        for doc_id in matches:
            # IDs are 1-based while ``docs`` is a 0-based list. Looping over
            # range(len(matches)) printed the first N documents instead of
            # the ones that actually matched.
            print(self.inv.docs[doc_id - 1])
            print()
        return matches

In [17]:
# Read a Boolean query from the user and evaluate it against the index.
query = input("Enter the query: ")
q = Query(query)
ans = q.process_query()
# process_query prints the retrieved documents itself; only echo its return
# value when there is one, so the cell never prints a bare "None".
if ans is not None:
    print(ans)

Enter the query:  boundary or (layer and transition)


Documents retrieved: 
in the study of high speed viscous flow past a two dimensional body it is usually necessary to consider a curved shock wave emitting from the nose or leading edge of the body   consequently there exists an inviscid rotational flow region between the shock wave and the boundary layer   such a situation arises for instance in the study of the hypersonic viscous flow past a flat plate   the situation is somewhat different from prandtl's classical boundary layer problem  in prandtl's original problem the inviscid free stream outside the boundary layer is irrotational while in a hypersonic boundary layer problem the inviscid free stream must be considered as rotational   the possible effects of vorticity have been recently discussed by ferri and libby   in the present paper the simple shear flow past a flat plate in a fluid of small viscosity is investigated   it can be shown that this problem can again be treated by the boundary layer approximation the only novel feat