### 1. a) Create a **_Term Document Incidence Matrix_** for the given documents

In [2]:
import numpy as np
import pandas as pd
from IPython.display import display, HTML

In [63]:
def termDocMatrix(docs, n):
    """Build and display the term-document incidence matrix for the first n documents.

    Each document is split on whitespace. The vocabulary is the sorted set of
    all resulting tokens. Entry ``mat[i, j]`` is 1 when ``terms[i]`` occurs as a
    whole token in ``docs[j]`` and 0 otherwise.

    Parameters
    ----------
    docs : sequence of str
        Document contents. Only ``docs[0] .. docs[n-1]`` are read.
    n : int
        Number of documents to index.

    Returns
    -------
    tuple
        ``(terms, mat)``: ``terms`` is the sorted list of unique tokens and
        ``mat`` is an integer NumPy array of shape ``(len(terms), n)``.

    Side effect: the matrix is rendered as an HTML table through ``display``.
    """
    # tokenise every document exactly once; a set per document gives a
    # whole-token membership test below
    docTokens = [set(docs[j].split()) for j in range(n)]

    # vocabulary: every distinct token, sorted
    terms = sorted(set().union(*docTokens))

    # a matrix to store 1 if the term is present in the document and 0 if not
    mat = np.zeros((len(terms), n), dtype=int)

    for i, term in enumerate(terms):
        for j, tokens in enumerate(docTokens):
            # test against the token set, not the raw string: a substring test
            # would report "form" as present in "information retrieval"
            if term in tokens:
                mat[i, j] = 1

    # just for printing the matrix as a dataframe. Not important
    data = {"Terms": terms}
    for j in range(n):
        data[f"Doc{j+1}"] = mat[:, j]
    df = pd.DataFrame(data)
    display(HTML(df.to_html(index=False)))
    return (terms, mat)

In [13]:
# ask the user how many documents will be entered
n = int(input("Enter the number of documents: "))

# prompt for each document in turn and collect the answers, in order, in docs
print(f"Enter the contents of {n} documents: ")
docs = [input(f"\tDocument {i+1}: ") for i in range(n)]

# build (and display) the incidence matrix for the entered documents
terms, mat = termDocMatrix(docs, n)

Enter the number of documents:  4


Enter the contents of 4 documents: 


	Document 1:  breakthrough drug for schizophrenia
	Document 2:  new schizophrenia drug
	Document 3:  new approach for treatment of schizophrenia
	Document 4:  new hopes for schizophrenia patients


Terms,Doc1,Doc2,Doc3,Doc4
approach,0,0,1,0
breakthrough,1,0,0,0
drug,1,1,0,0
for,1,0,1,1
hopes,0,0,0,1
new,0,1,1,1
of,0,0,1,0
patients,0,0,0,1
schizophrenia,1,1,1,1
treatment,0,0,1,0


## 1. b) Process Boolean queries

In [17]:
def binOp(vals, op, n):
    """Apply one Boolean operator position by position to incidence rows.

    `vals` is a list of operand rows taken from the term incidence matrix:
    one row for 'NOT', two rows for 'AND' and 'OR'. Positions 0 .. n-1 of each
    row are read. An operator other than these three yields an empty list.
    """
    out = []
    if op == 'AND':
        for k in range(n):
            out.append(vals[0][k] & vals[1][k])
    elif op == 'OR':
        for k in range(n):
            out.append(vals[0][k] | vals[1][k])
    elif op == 'NOT':
        for k in range(n):
            # a 0 entry becomes 1, anything else becomes 0
            out.append(1 if vals[0][k] == 0 else 0)
    return out

In [60]:
def andor(query, n):
    """Evaluate a flat query of operands joined by AND / OR, strictly left to right.

    `query` is a list whose items are either operands (rows of 0/1 values taken
    from the term incidence matrix) or the operator strings "AND" / "OR".
    There is no operator precedence: ``a OR b AND c`` is evaluated as
    ``(a OR b) AND c``.

    Parameters
    ----------
    query : list
        Alternating operands and "AND" / "OR" strings.
    n : int
        Number of documents, i.e. how many positions of each row are combined.

    Returns
    -------
    list
        The combined row. A query holding a single operand returns that
        operand unchanged; an empty query returns ``[]``.

    Raises
    ------
    ValueError
        If two operands are adjacent, or an operator is missing its left or
        right operand.
    """
    result = None      # value of everything evaluated so far
    pendingOp = None   # operator still waiting for its right-hand operand

    for item in query:
        # only strings can be operators; operands are never compared to them
        if isinstance(item, str) and item in ("AND", "OR"):
            if result is None or pendingOp is not None:
                raise ValueError(f"operator {item} has no left operand")
            pendingOp = item
        elif result is None:
            # first operand: nothing to combine it with yet
            result = item
        elif pendingOp is None:
            raise ValueError("two operands without an operator between them")
        else:
            # fold the new operand into the running result, so the next
            # operator again sees exactly two rows
            result = binOp([result, item], pendingOp, n)
            pendingOp = None

    if pendingOp is not None:
        raise ValueError(f"operator {pendingOp} has no right operand")

    return [] if result is None else result

In [64]:
def _tokenPositions(query, token):
    """Return the indices of `query` that hold the string `token`; non-string items are skipped."""
    return [i for i, item in enumerate(query) if isinstance(item, str) and item == token]


def processQ(query, n):
    """Evaluate a tokenised Boolean query containing parentheses, NOT, AND and OR.

    Evaluation order:
      1. parenthesised sub-queries, innermost first (each is evaluated by a
         recursive call and replaced by its result),
      2. NOT operators, right to left, so that ``NOT NOT x`` is handled,
      3. the remaining AND / OR sequence, handed to ``andor``.

    Parameters
    ----------
    query : list
        Tokens of the query: the strings "(", ")", "NOT", "AND", "OR" and
        operand rows (lists of 0/1 values, one per document). The list passed
        in is not modified.
    n : int
        Number of documents.

    Returns
    -------
    list
        Whatever ``andor`` returns for the fully reduced query.

    Raises
    ------
    ValueError
        If parentheses are unbalanced or a NOT is not followed by an operand.
    """
    # work on a shallow copy so the caller's list is left untouched
    query = list(query)

    # --- step 1: parentheses, innermost first --------------------------------
    while True:
        closing = _tokenPositions(query, ')')
        if not closing:
            break

        # the first ')' always closes an innermost group ...
        eIdx = closing[0]

        # ... and its partner is the nearest '(' to its left
        opening = [i for i in _tokenPositions(query, '(') if i < eIdx]
        if not opening:
            raise ValueError("unbalanced parentheses: ')' without a matching '('")
        sIdx = opening[-1]

        # evaluate the group's contents and put the result in place of "( ... )"
        query[sIdx:eIdx+1] = [processQ(query[sIdx+1:eIdx], n)]

    if _tokenPositions(query, '('):
        raise ValueError("unbalanced parentheses: '(' without a matching ')'")

    # --- step 2: NOT operators, right to left --------------------------------
    while True:
        nots = _tokenPositions(query, "NOT")
        if not nots:
            break

        # taking the right-most NOT guarantees its operand is already a value
        idx = nots[-1]
        if idx + 1 >= len(query) or isinstance(query[idx+1], str):
            raise ValueError("NOT must be followed by an operand")

        # replace "NOT operand" with the negated row
        query[idx:idx+2] = [binOp([query[idx+1]], "NOT", n)]

    # --- step 3: only operands, AND and OR remain ----------------------------
    return andor(query, n)

In [62]:
n = int(input("Enter the number of documents: "))
docs = []

# reading contents of each document
print(f"Enter the contents of {n} documents: ")
for i in range(n):
    docs.append(input(f"\tDocument {i+1}: "))

# reading the query; parentheses are padded with spaces before splitting, so
# "(a AND b)" and "( a AND b )" produce the same tokens
rawQuery = input("Enter the binary query: ")
query = rawQuery.replace("(", " ( ").replace(")", " ) ").split()

# calling the termDocMatrix() function for getting the terms and the term incidence matrix
terms, mat = termDocMatrix(docs, n)

# iterating through each item in the query list
for i, token in enumerate(query):

    # operators and parentheses are checked first and left as they are, so a
    # document that happens to contain the word "AND", "OR", "NOT", "(" or ")"
    # cannot turn a query operator into an operand
    if token in ("(", ")", "NOT", "AND", "OR"):
        continue

    # if the item is a term replace it with the corresponding row from the mat matrix
    if token in terms:
        query[i] = mat[terms.index(token), :].tolist()

    # otherwise the word is not present in any document: replace it with a row of zeros
    else:
        query[i] = [0] * n

# evaluating the query by sending it to the processQ() function
# (n is the number of documents entered above, not a fixed count)
ans = processQ(query, n)

# printing the documents which correspond to the query
print("The request query is processed and the documents are: ")
for i in range(n):
    if ans[i] == 1:
        print(f"Document{i+1}: ", docs[i])

Enter the number of documents:  4


Enter the contents of 4 documents: 


	Document 1:  breakthrough drug for schizophrenia
	Document 2:  new schizophrenia drug
	Document 3:  new approach for treatment of schizophrenia
	Document 4:  new hopes for schizophrenia patients
Enter the binary query:  ( ( schizophrenia AND new ) OR approach ) AND patients


Terms,Doc1,Doc2,Doc3,Doc4
approach,0,0,1,0
breakthrough,1,0,0,0
drug,1,1,0,0
for,1,0,1,1
hopes,0,0,0,1
new,0,1,1,1
of,0,0,1,0
patients,0,0,0,1
schizophrenia,1,1,1,1
treatment,0,0,1,0


The request query is processed and the documents are: 
Document4:  new hopes for schizophrenia patients
