# Examples for the Boolean Model
## Creating our example corpus and queries

In [1]:
import pandas as pd

Some example documents:

In [2]:
# The three raw example documents, one sentence each.
document_1_raw, document_2_raw, document_3_raw = (
    "This is an exercise worth looking forward to.",
    "Is this what I think it is?",
    "A showcase for the Boolean Model!",
)

Get a set of words from the raw string by:
- Setting every word to lower case
- Removing punctuation
- Splitting the string into single words (in this case separated by one space)

In [3]:
def to_word_set(raw_text):
    """Turn a raw document string into its set of distinct lower-case words.

    The text is lower-cased, every character that is neither alphanumeric nor
    whitespace is removed, and the remainder is split on whitespace. Because
    the split is whitespace-based, repeated spaces or an empty string never
    produce an empty "word".
    """
    cleaned = "".join(
        char for char in raw_text.lower() if char.isalnum() or char.isspace()
    )
    return set(cleaned.split())


# Same normalisation for every document instead of one hand-picked
# punctuation character per document.
document_1_set = to_word_set(document_1_raw)
document_2_set = to_word_set(document_2_raw)
document_3_set = to_word_set(document_3_raw)
corpus = [document_1_set, document_2_set, document_3_set]

# Overview table: one row per document, padded to the largest document.
# Sets are unordered, so the word_N column a word lands in carries no meaning.
words_max = max(len(document) for document in corpus)
df_corpus = pd.DataFrame(
    corpus,
    index=['document_' + str(i + 1) for i in range(len(corpus))],
    columns=["word_" + str(i + 1) for i in range(words_max)],
)
df_corpus

Unnamed: 0,word_1,word_2,word_3,word_4,word_5,word_6,word_7,word_8
document_1,this,exercise,an,to,worth,forward,looking,is
document_2,this,it,think,what,i,is,,
document_3,for,boolean,model,a,the,showcase,,


Some example queries:

In [4]:
# Operator keywords of the query language; every other token is a search term.
BOOLEAN_OPERATORS = {"AND", "OR", "NOT"}


def extract_query_terms(query):
    """Return the search terms of a Boolean query in order of appearance.

    Parentheses are treated as separators and the operator keywords AND, OR
    and NOT are dropped. Operators are matched as whole upper-case tokens, so
    a term that merely contains those letters is left untouched. An empty
    query yields an empty list.
    """
    tokens = query.replace("(", " ").replace(")", " ").split()
    return [token for token in tokens if token not in BOOLEAN_OPERATORS]


# NOTE: despite the `_set` suffix these are lists; the order of the terms
# matters because the evaluation functions address them by position.
query_1 = "boolean AND model"
query_1_set = extract_query_terms(query_1)

query_2 = "(showcase OR what) AND is"
query_2_set = extract_query_terms(query_2)

query_3 = "NOT boolean"
query_3_set = extract_query_terms(query_3)

query_4 = "this AND is AND (worth OR think) AND NOT it"
query_4_set = extract_query_terms(query_4)

queries = [query_1_set, query_2_set, query_3_set, query_4_set]

# Overview table: one row per query, padded to the longest query.
words_max = max(len(query) for query in queries)
df_queries = pd.DataFrame(
    queries,
    index=['query_' + str(i + 1) for i in range(len(queries))],
    columns=["word_" + str(i + 1) for i in range(words_max)],
)
df_queries

Unnamed: 0,word_1,word_2,word_3,word_4,word_5
query_1,boolean,model,,,
query_2,showcase,what,is,,
query_3,boolean,,,,
query_4,this,is,worth,think,it


## Evaluating queries on the documents
Evaluation functions:

In [5]:
def evaluate_query(eval_function, values):
    """Generic evaluation: apply a query-specific function to a list of values.

    eval_function -- callable that accepts the list of values
    values        -- the values handed to eval_function unchanged

    Returns whatever eval_function returns.
    """
    verdict = eval_function(values)
    return verdict

# different evaluations for the different queries
def eval_query_1(values):
    """Evaluate a pure AND query: every term must be present.

    values -- one boolean per query term, True if the document contains it

    Returns True when all values are truthy (also for an empty list),
    otherwise False.
    """
    # Rely on truthiness rather than comparing each flag with `== True`.
    return all(values)

def eval_query_2(values):
    """Evaluate "(showcase OR what) AND is" on positional term flags.

    values -- flags for the terms in query order: showcase, what, is

    Returns True or False.
    """
    # (showcase OR what) must hold, otherwise the conjunction already fails
    if not (values[0] or values[1]):
        return False
    # .. AND is
    return bool(values[2])

def eval_query_3(values):
    """Evaluate a pure NOT query: none of the terms may be present.

    values -- one boolean per query term, True if the document contains it

    Returns True when no value is truthy (also for an empty list),
    otherwise False.
    """
    # Rely on truthiness rather than comparing each flag with `== False`.
    return not any(values)

def eval_query_4(values):
    """Evaluate "this AND is AND (worth OR think) AND NOT it".

    values -- flags for the terms in query order: this, is, worth, think, it

    Returns True or False.
    """
    # this AND is
    if not (values[0] and values[1]):
        return False
    # .. AND (worth OR think)
    if not (values[2] or values[3]):
        return False
    # .. AND NOT it
    return not values[4]

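Create the results table for a query: for every document, record whether each query word occurs in it, then combine these values with the query's respective evaluation function to get the total result.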

In [6]:
def create_results(query_set, eval_function, documents=None):
    """Build the truth table of one query over a collection of documents.

    query_set     -- the query's terms, in the order the evaluation function
                     expects them
    eval_function -- callable that turns one document's list of term flags
                     into the query's overall result
    documents     -- containers supporting `in` (e.g. word sets); defaults to
                     the notebook-level `corpus`

    Returns a DataFrame with one row per document ('document_1', ...), one
    boolean column per distinct term and a final 'total' column holding the
    result of eval_function for that document.
    """
    if documents is None:
        documents = corpus

    # One list of flags per document, aligned position by position with
    # query_set. Building it from the term list (not from dict keys) keeps the
    # positions intact even when a term occurs more than once in the query.
    memberships = [[word in document for word in query_set] for document in documents]

    # evaluate if a word (of the query) is in a document: one column per term
    query_results = {
        word: [flags[position] for flags in memberships]
        for position, word in enumerate(query_set)
    }
    # add the complete result for a document by evaluating accordingly
    query_results["total"] = [evaluate_query(eval_function, flags) for flags in memberships]

    row_labels = ['document_' + str(i + 1) for i in range(len(documents))]
    return pd.DataFrame(query_results, row_labels)

Query 1: "boolean AND model"

In [7]:
# Truth table for query 1: one row per document, last column is the verdict.
df_1 = create_results(query_set=query_1_set, eval_function=eval_query_1)
df_1

Unnamed: 0,boolean,model,total
document_1,False,False,False
document_2,False,False,False
document_3,True,True,True


Query 2: "(showcase OR what) AND is"

In [8]:
# Truth table for query 2: one row per document, last column is the verdict.
df_2 = create_results(query_set=query_2_set, eval_function=eval_query_2)
df_2

Unnamed: 0,showcase,what,is,total
document_1,False,False,True,False
document_2,False,True,True,True
document_3,True,False,False,False


Query 3: "NOT boolean"

In [9]:
# Truth table for query 3: one row per document, last column is the verdict.
df_3 = create_results(query_set=query_3_set, eval_function=eval_query_3)
df_3

Unnamed: 0,boolean,total
document_1,False,True
document_2,False,True
document_3,True,False


Query 4: "this AND is AND (worth OR think) AND NOT it"

In [10]:
# Truth table for query 4: one row per document, last column is the verdict.
df_4 = create_results(query_set=query_4_set, eval_function=eval_query_4)
df_4

Unnamed: 0,this,is,worth,think,it,total
document_1,True,True,True,False,False,True
document_2,True,True,False,True,True,False
document_3,False,False,False,False,False,False
