# Query

------

Run the previous notebooks first so that the classes they define are available for query processing.

In [11]:
%run InfixtoPostfix.ipynb
%run InvertedIndex.ipynb
%run PositionalIndex.ipynb
%run BiwordIndex.ipynb

-----

### A Query class that accepts a query and processes it

In [2]:
class Query:
    """Evaluate a boolean or phrase query against the indexes.

    The constructor stores the query text (with parentheses padded by a
    space) and builds an inverted index, a biword index and a positional
    index. The evaluation methods read those indexes through their ``index``
    attribute.

    Postings lists handled by ``and_operator`` / ``or_operator`` are merged
    with ``<`` comparisons, so they are assumed to be sorted in ascending
    order without duplicates.
    """

    # Document ids that ``not`` complements against when the caller does not
    # supply an explicit universe (same ids as the original ``range(1, 5)``).
    DEFAULT_DOC_IDS = (1, 2, 3, 4)

    def __init__(self, query):
        """Store ``query`` and build the three indexes.

        Args:
            query: the raw query string.
        """
        self.query = query

        # add a single space after '(' and before ')' so that the
        # parentheses are separated from the neighbouring words
        self.query = self.query.replace('(', '( ')
        self.query = self.query.replace(')', ' )')

        # build the inverted, biword and positional indexes
        self.inv = InvertedIndex()
        self.bwd = Biword()
        self.pos = PositionalIndex()
        self.inv.buildIndex()
        self.bwd.buildIndex()
        self.pos.buildIndex()

    def and_operator(self, postings1, postings2):
        """Return the intersection of two ascending postings lists."""
        p1 = 0
        p2 = 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                p1 += 1
            else:
                p2 += 1
        return result

    def or_operator(self, postings1, postings2):
        """Return the union of two ascending postings lists."""
        p1 = 0
        p2 = 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
            else:
                result.append(postings2[p2])
                p2 += 1

        # at most one of the two lists still has unread entries
        result.extend(postings1[p1:])
        result.extend(postings2[p2:])
        return result

    def not_opreator(self, postings, universe=None):
        """Return the document ids that are NOT in ``postings``.

        Args:
            postings: document ids to exclude.
            universe: iterable of all document ids; defaults to
                ``DEFAULT_DOC_IDS``.

        Returns:
            A list sorted in ascending order, so that it can be fed straight
            into ``and_operator`` / ``or_operator``.
        """
        if universe is None:
            universe = self.DEFAULT_DOC_IDS
        return sorted(set(universe) - set(postings))

    def solve(self, postings, operator):
        """Apply ``operator`` to ``postings``.

        For ``'and'`` and ``'or'``, ``postings`` is a pair of postings lists.
        Any other operator is treated as ``'not'`` and ``postings`` is a
        single postings list.
        """
        if operator == 'and':
            return self.and_operator(postings[0], postings[1])
        if operator == 'or':
            return self.or_operator(postings[0], postings[1])
        return self.not_opreator(postings)

    def _biword_postings(self, words):
        """Intersect the biword postings of every adjacent pair in ``words``.

        A pair that is missing from the biword index contributes an empty
        postings list. Fewer than two words yields an empty list.
        """
        result = None
        for first, second in zip(words, words[1:]):
            biword = first + ' ' + second
            if biword in self.bwd.index:
                postings = self.bwd.index[biword]
            else:
                postings = []
            if result is None:
                result = postings
            else:
                # fold left to right; never mutate a list while indexing it
                result = self.and_operator(result, postings)
        return [] if result is None else result

    def processBiword(self):
        """Evaluate the first postfix token as a hyphen-joined phrase.

        Returns:
            The documents containing every adjacent word pair of the phrase,
            or an empty list when there is no token or no word pair.
        """
        postfix = InfixToPostfix(self.query).convert()
        if not postfix:
            return []
        return self._biword_postings(postfix[0].split('-'))

    def processPositional(self):
        """Evaluate the query as an exact phrase using the positional index.

        ``self.pos.index[term][docId][1]`` is read as the list of positions
        of ``term`` in ``docId``. A document matches when some position ``x``
        of the first word has word ``i`` of the query at position ``x + i``
        for every later word.

        Returns:
            Matching document ids, in the iteration order of the first
            word's entry in the positional index.
        """
        terms = self.query.split()
        if not terms or terms[0] not in self.pos.index:
            return []

        # index entries of the words after the first; a word that never
        # occurs anywhere means the phrase cannot match
        later_entries = []
        for term in terms[1:]:
            if term not in self.pos.index:
                return []
            later_entries.append(self.pos.index[term])

        ans = []
        for docId, vals in self.pos.index[terms[0]].items():
            if any(docId not in entry for entry in later_entries):
                continue
            later_positions = [set(entry[docId][1]) for entry in later_entries]
            # a single-word query matches every document listing the word
            if not later_positions or any(
                all(start + offset in positions
                    for offset, positions in enumerate(later_positions, 1))
                for start in vals[1]
            ):
                ans.append(docId)
        return ans

    def process_query(self):
        """Evaluate the boolean query and return the resulting postings list.

        The query is converted to postfix and evaluated with a stack: plain
        words are looked up in the inverted index, hyphen-joined words are
        resolved through the biword index, and ``and`` / ``or`` / ``not``
        combine the postings lists on top of the stack.

        Returns:
            The postings list at the bottom of the stack, or an empty list
            when the postfix expression is empty.
        """
        postfix = InfixToPostfix(self.query).convert()  # convert query to postfix
        boolean_operators = ('and', 'or', 'not')
        query_tokens = []  # operand stack of postings lists

        for p in postfix:
            if p not in boolean_operators:
                q = p.split('-')  # more than one part means a biword phrase
                if len(q) == 1:
                    if q[0] in self.inv.index:
                        query_tokens.append(self.inv.index[q[0]])
                    else:
                        query_tokens.append([])
                else:
                    query_tokens.append(self._biword_postings(q))
            elif p == 'not':
                # unary operator: consumes one operand
                query_tokens.append(self.solve(query_tokens.pop(), 'not'))
            else:
                # binary operator: consumes two operands
                term = [query_tokens.pop(), query_tokens.pop()]
                query_tokens.append(self.solve(term, p))

        return query_tokens[0] if query_tokens else []

In [3]:
def preprocess(query):
    """Rewrite a phrase as its adjacent word pairs joined by ``and``.

    For example ``"new york city"`` becomes ``"new york and york city"``.

    Args:
        query: whitespace-separated phrase.

    Returns:
        The rewritten query string. A query with fewer than two words has no
        word pairs, so its words are returned as they are (an empty string
        for an empty query) instead of raising ``IndexError``.
    """
    words = query.split()
    if len(words) < 2:
        return ' '.join(words)
    pairs = [first + ' ' + second for first, second in zip(words, words[1:])]
    return ' and '.join(pairs)