# Biword Index

In [1]:
# defines the operations of stack data structure
class Stack():
    """Minimal LIFO stack backed by a Python list.

    ``pop`` and ``peek`` return ``None`` instead of raising when the stack
    is empty.
    """

    def __init__(self):
        # The top of the stack is the end of the list.
        self._stack = []

    def push(self, item):
        """Place *item* on top of the stack."""
        self._stack.append(item)

    def isEmpty(self):
        """Return True when the stack holds no items."""
        return len(self._stack) == 0

    def pop(self):
        """Remove and return the top item, or None if the stack is empty."""
        if not self.isEmpty():
            return self._stack.pop()
        return None

    def peek(self):
        """Return the top item without removing it, or None if empty."""
        if not self.isEmpty():
            return self._stack[-1]
        return None

    def size(self):
        """Return the number of items currently stored."""
        return len(self._stack)

    def __str__(self):
        # Bottom-to-top listing; every element is followed by one space.
        return "".join(f"{element} " for element in self._stack)

In [2]:
# infix to postfix converter
class InfixToPostfix:
    """Convert a boolean phrase query from infix to postfix notation.

    The query is a string whose tokens are separated by single spaces.
    The tokens ``and``, ``or``, ``not``, ``(`` and ``)`` are operators;
    every run of consecutive non-operator words is treated as one phrase
    and merged into a single hyphen-joined operand
    (``"new york and city"`` -> ``"new-york"``, ``"and"``, ``"city"``).
    """

    def __init__(self, infix):
        self.infix = infix
        self.postfix = []
        self.stack = Stack()
        # "(" has the lowest precedence so no operator ever pops it.
        self.precedence = {"(": 0, "or": 1, "and": 2, "not": 3}
        self.operators = ["and", "or", "not", "(", ")"]

    def _merge_phrases(self, raw_tokens):
        """Join each run of consecutive non-operator words with hyphens."""
        merged = []
        words = []
        for tok in raw_tokens:
            if tok not in self.operators:
                words.append(tok)
                continue
            if words:
                merged.append("-".join(words))
                words = []
            merged.append(tok)
        # A trailing phrase is kept only when it is non-empty.
        trailing = "-".join(words)
        if trailing:
            merged.append(trailing)
        return merged

    def convert(self):
        """Return the query as a list of postfix tokens.

        Side effects: ``self.infix`` is replaced by the phrase-merged
        expression and ``self.postfix`` by the returned list. Output and
        operator stack are reset on every call, so calling ``convert``
        again does not accumulate tokens from the previous call.

        An unmatched ``(`` is emitted into the output unchanged.

        Raises:
            ValueError: if a ``)`` has no matching ``(``.
        """
        self.postfix = []
        self.stack = Stack()

        self.infix = " ".join(self._merge_phrases(self.infix.split(" ")))

        # Re-splitting the joined string (rather than reusing the list) keeps
        # the established result for an empty expression: one empty operand.
        for token in self.infix.split(" "):
            if token not in self.operators:
                self.postfix.append(token)
            elif token == "(":
                self.stack.push(token)
            elif token == ")":
                while not self.stack.isEmpty() and self.stack.peek() != "(":
                    self.postfix.append(self.stack.pop())
                if self.stack.isEmpty():
                    # Without this check peek() keeps returning None and the
                    # loop above would never terminate.
                    raise ValueError(
                        "Unbalanced parentheses: ')' without matching '('"
                    )
                self.stack.pop()  # discard the matching "("
            else:
                current = self.precedence[token]
                # "not" is a unary prefix operator and therefore
                # right-associative: it must not pop an equal-precedence
                # "not" already on the stack.
                right_assoc = token == "not"
                while not self.stack.isEmpty():
                    top = self.precedence[self.stack.peek()]
                    if top > current or (top == current and not right_assoc):
                        self.postfix.append(self.stack.pop())
                    else:
                        break
                self.stack.push(token)
        while not self.stack.isEmpty():
            self.postfix.append(self.stack.pop())
        return self.postfix

In [3]:
class Biword:
    """Biword (adjacent word-pair) inverted index over numbered text files."""

    def __init__(self):
        # Maps "word1 word2" -> ascending list of ids of the documents that
        # contain that pair of adjacent words.
        self.index = {}

    def buildIndex(self, doc_ids=range(1, 5), path_template="doc{}.txt"):
        """Read the documents and add their biwords to ``self.index``.

        Each file is split on whitespace and every pair of adjacent words is
        recorded under the key ``"word1 word2"``.

        Args:
            doc_ids: ids of the documents to index. Defaults to 1..4.
            path_template: ``str.format`` template turning a document id
                into a file path. Defaults to ``"doc{}.txt"``.

        Raises:
            OSError: if a document file cannot be opened.
        """
        found = {}
        for doc_id in doc_ids:
            with open(path_template.format(doc_id), "r") as f:
                words = f.read().split()
            for first, second in zip(words, words[1:]):
                found.setdefault(first + ' ' + second, set()).add(doc_id)

        for biword, ids in found.items():
            ids.update(self.index.get(biword, ()))
            # Postings must be in ascending order: they are intersected with
            # a linear merge, and set iteration order is not guaranteed.
            self.index[biword] = sorted(ids)
    

In [4]:
class Query:
    """Phrase query evaluated against a biword index.

    Constructing a ``Query`` creates a ``Biword`` index and calls its
    ``buildIndex()``, so every instance re-reads the documents.
    """

    def __init__(self, query):
        # Surround parentheses with spaces so they become standalone tokens,
        # then trim and collapse whitespace: the infix converter splits on
        # single spaces, so stray or doubled spaces would otherwise turn
        # into empty words inside a phrase.
        spaced = query.replace('(', ' ( ').replace(')', ' ) ')
        self.query = ' '.join(spaced.split())
        self.bwd = Biword()
        self.bwd.buildIndex()

    def and_operator(self, postings1, postings2):
        """Return the intersection of two postings lists.

        Both lists are walked in lockstep using ``<`` comparisons, so they
        must be sorted in ascending order.
        """
        p1 = 0
        p2 = 0
        result = []
        while p1 < len(postings1) and p2 < len(postings2):
            if postings1[p1] == postings2[p2]:
                result.append(postings1[p1])
                p1 += 1
                p2 += 1
            elif postings1[p1] < postings2[p2]:
                p1 += 1
            else:
                p2 += 1
        return result

    def processBiword(self):
        """Return the ids of the documents matching the phrase query.

        The phrase is broken into its consecutive word pairs, each pair is
        looked up in the biword index, and the postings are intersected.

        NOTE: only the first token of the postfix expression is evaluated;
        boolean operators in the query are not applied.

        Returns:
            A list of document ids. Empty when the query is empty, has
            fewer than two words, or some word pair is not in the index.
        """
        if self.query == "":
            return []
        postfix = InfixToPostfix(self.query).convert()
        if not postfix:
            return []
        # The converter joins the words of a phrase with hyphens.
        words = postfix[0].split('-')
        index = self.bwd.index
        postings = [
            index.get(first + ' ' + second, [])
            for first, second in zip(words, words[1:])
        ]
        if not postings:
            # A single word forms no biword, so nothing can match.
            return []
        result = postings[0]
        for other in postings[1:]:
            result = self.and_operator(result, other)
        return result

In [5]:
# Example: run a three-word phrase query against the biword index.
query = "have suffered alteration"
# Constructing the Query also builds the biword index from the document files.
q = Query(query)
# processBiword() intersects the postings of each consecutive word pair of
# the phrase and returns the resulting document ids.
ans = q.processBiword()
print(ans)

[1, 3]
