## 2. a) Create an **_Inverted Index_** for the given documents.

In [19]:
# One entry of the inverted index: the term itself, the number of times it
# occurs across all documents, and the set of documents that contain it.
class term:
    """Inverted-index entry for a single term.

    Attributes:
        s: the term text.
        freq: number of occurrences of the term across all documents.
        postings: set of document numbers in which the term is present.
    """

    def __init__(self, s, freq, postings=None):
        """Store the term, its frequency and its postings set.

        `postings` defaults to a fresh empty set per instance. A literal
        `set()` default would be created once at definition time and then be
        shared (and mutated) by every instance that omitted the argument.
        """
        self.s = s
        self.freq = freq
        # A caller-supplied set is stored as-is (not copied).
        self.postings = set() if postings is None else postings

In [20]:
def invertedIndex(docs, n):
    """Build and print an inverted index over the first `n` documents.

    Every whitespace-separated token is lower-cased. Documents are numbered
    from 1. For each distinct term a `term` object is created holding the
    term, its total number of occurrences and the set of document numbers
    containing it.

    Returns a tuple `(terms, termPostings)`: the lexicographically sorted
    list of distinct terms and the list of `term` objects in the same order.
    """
    # term -> document number of every occurrence (duplicates kept so the
    # list length is the overall frequency of the term)
    occurrences = {}
    for docNo in range(1, n + 1):
        for word in docs[docNo - 1].split():
            occurrences.setdefault(word.lower(), []).append(docNo)

    # distinct terms in lexicographic order
    terms = sorted(occurrences)

    # one term object per distinct term, in the same order as `terms`
    termPostings = [
        term(word, len(occurrences[word]), set(occurrences[word]))
        for word in terms
    ]

    # printing the inverted index
    for entry in termPostings:
        print(entry.s, entry.freq, entry.postings)
    return (terms, termPostings)

In [14]:
# Read the number of documents, then the text of each one, and build the index.
n = int(input("Enter the number of documents: "))
print("Enter the contents of each document: ")
docs = [input(f"\tDocument {k+1}: ") for k in range(n)]
terms, termPostings = invertedIndex(docs, n)

Enter the number of documents:  4


Enter the contents of each document: 


	Document 1:  breakthrough drug for schizophrenia
	Document 2:  new schizophrenia drug
	Document 3:  new approach for treatment of schizophrenia
	Document 4:  new hopes for schizophrenia patients


approach 1 {3}
breakthrough 1 {1}
drug 2 {1, 2}
for 3 {1, 3, 4}
hopes 1 {4}
new 3 {2, 3, 4}
of 1 {3}
patients 1 {4}
schizophrenia 4 {1, 2, 3, 4}
treatment 1 {3}


--- 

## 2. b) Process boolean queries

In [21]:
def binOp(termPostings, op, n):
    """Apply one boolean operator to posting sets and return the result.

    Args:
        termPostings: list of posting sets; one operand for 'NOT', two
            operands (left, right) for 'AND' and 'OR'.
        op: one of 'NOT', 'AND', 'OR'.
        n: number of documents; documents are numbered 1..n.

    Returns:
        A new set of document numbers.

    Raises:
        ValueError: if `op` is not a supported operator.
    """
    if op == 'NOT':
        # Complement: every document 1..n that is NOT in the operand.
        # (Subtracting the universe from the operand is always empty.)
        return set(range(1, n + 1)).difference(termPostings[0])
    if op == 'AND':
        return termPostings[0].intersection(termPostings[1])
    if op == 'OR':
        return termPostings[0].union(termPostings[1])
    raise ValueError(f"unsupported boolean operator: {op!r}")

In [22]:
def andor(query, n):
    """Evaluate a flat query that contains only operands, AND and OR.

    The query is evaluated strictly left to right (AND and OR have the same
    precedence), so chains such as `a AND b OR c` are supported.

    Args:
        query: list whose items are posting sets separated by the strings
            "AND" / "OR".
        n: number of documents. Not needed for AND / OR; kept so the
            signature stays the same for existing callers.

    Returns:
        The resulting set of document numbers. An empty query gives an empty
        set, and a single operand is returned as-is.

    Raises:
        ValueError: if an operator has no left operand, or two operands are
            adjacent with no operator between them.
    """
    result = None   # running value; None until the first operand is seen
    pending = None  # operator waiting for its right-hand operand
    for item in query:
        if isinstance(item, str) and item in ("AND", "OR"):
            pending = item
            continue

        if result is None:
            if pending is not None:
                raise ValueError(f"operator {pending} has no left operand")
            result = item
        elif pending == "AND":
            result = result.intersection(item)
        elif pending == "OR":
            result = result.union(item)
        else:
            raise ValueError("two operands without an operator between them")
        pending = None

    # A trailing operator with no right operand is ignored.
    return set() if result is None else result

In [23]:
def processQ(query, n):
    """Evaluate a tokenised boolean query and return the matching documents.

    Evaluation order: sub-queries in parentheses (innermost first), then NOT
    operators, then the remaining AND / OR chain via andor().

    Args:
        query: list of tokens. Operands are posting sets; operators and
            parentheses are the strings "(", ")", "NOT", "AND", "OR".
            The caller's list is not modified.
        n: number of documents; documents are numbered 1..n.

    Returns:
        Whatever andor() returns for the fully reduced query.

    Raises:
        ValueError: on unbalanced parentheses or a NOT without an operand.
    """
    # Work on a copy so the caller's token list is left untouched.
    query = list(query)

    # --- parentheses: repeatedly reduce the innermost group ---------------
    while '(' in query:
        firstOpen = query.index('(')
        # The first ')' after the first '(' closes the innermost group ...
        closeIdx = query.index(')', firstOpen)
        # ... whose opening parenthesis is the last '(' before that ')'.
        openIdx = max(i for i in range(firstOpen, closeIdx) if query[i] == '(')

        # Evaluate the group recursively (this also handles NOT and AND / OR
        # inside it) and replace "( ... )" with the single evaluated value.
        query[openIdx:closeIdx + 1] = [processQ(query[openIdx + 1:closeIdx], n)]

    if ')' in query:
        raise ValueError("unbalanced parentheses: ')' without matching '('")

    # --- NOT: evaluate right to left so that "NOT NOT x" works -------------
    for idx in range(len(query) - 1, -1, -1):
        token = query[idx]
        if not (isinstance(token, str) and token == "NOT"):
            continue
        if idx + 1 >= len(query) or isinstance(query[idx + 1], str):
            raise ValueError("NOT operator is missing its operand")
        # Replace the operator and its operand with the evaluated value.
        query[idx:idx + 2] = [binOp([query[idx + 1]], "NOT", n)]

    # --- only operands, AND and OR are left --------------------------------
    return andor(query, n)

In [18]:
# Read the documents and a boolean query, build the index, then evaluate the
# query and print the matching documents.
n = int(input("Enter number of documents: "))
docs = []
print("Enter the contents of each document: ")
for i in range(n):
    docs.append(input(f"\tDocument {i+1}: "))
query = input("Enter the boolean query:").split()
terms, termPostings = invertedIndex(docs, n)

# Replace every query term by its posting set. Operators and parentheses are
# kept as strings. Terms are lower-cased because the index stores lower-cased
# terms; a term that is not in the index gets an empty posting set.
postingsByTerm = {entry.s: entry.postings for entry in termPostings}
for i, token in enumerate(query):
    if token in ("(", ")", "NOT", "AND", "OR"):
        continue
    query[i] = postingsByTerm.get(token.lower(), set())

# Use the actual number of documents rather than a hard-coded count.
ans = processQ(query, n)
print("The request query is processed and the documents are: ")
# Sorted so the documents are always listed in ascending order.
for i in sorted(ans):
    print(f"Document{i}: ", docs[i-1])

Enter number of documents:  4


Enter the contents of each document: 


	Document 1:  breakthrough drug for schizophrenia
	Document 2:  new schizophrenia drug
	Document 3:  new approach for treatment of schizophrenia
	Document 4:  new hopes for schizophrenia patients
Enter the boolean query: asdf AND new


approach 1 {3}
breakthrough 1 {1}
drug 2 {1, 2}
for 3 {1, 3, 4}
hopes 1 {4}
new 3 {2, 3, 4}
of 1 {3}
patients 1 {4}
schizophrenia 4 {1, 2, 3, 4}
treatment 1 {3}
The request query is processed and the documents are: 
