# A demo for a very simple Search Engine

## Setup

Set `interactive` to `True` if you want to enter your own queries interactively; otherwise only the preset examples are
computed and shown.

In [None]:
# Notebook-wide switch: when False the preset example cells run; when True they
# are skipped and the interactive prompt loop is started instead.
interactive = False

Set up the inverted indexes (one for the HTML corpus, one for the TXT corpus):

In [None]:
import findspark
findspark.init()
from invertedindex.InvertedIndex import InvertedIndex

# Index built with html=True; the notebook uses it as the HTML corpus.
inv_idx_wiki = InvertedIndex(html=True)
# Emit blank lines between the two constructions (presumably to separate
# whatever each constructor prints — TODO confirm).
print('\n\n')
# Index built with html=False; the notebook uses it as the TXT corpus.
inv_idx_txt = InvertedIndex(html=False)

Define a helper that displays each result as a numbered link to its document:

In [None]:
from IPython.display import display, Markdown

def resolve_results(results):
    """Render every search hit as a numbered Markdown link.

    Args:
        results: Iterable of ``(file_name, rel_path)`` pairs. The link text is
            ``"<position>: <file_name>"`` (positions start at 1) and the link
            target is ``rel_path`` immediately followed by ``file_name``.
    """
    position = 0
    for name, location in results:
        position += 1
        caption = str(position) + ": " + name
        target = location + name
        display(Markdown(f"[{caption}]({target})"))

## Preset Examples

### HTML Corpus

Comparison of the retrieval methods on one example:

In [None]:
if not interactive:
    # Send the same single-term query through all three retrieval methods of
    # the HTML index so their results can be compared one after another.
    print("Comparing the different retrievals on one example (query=american)...\n")
    term = 'american'
    for heading, retrieve in (
        ("Vector: ", inv_idx_wiki.get_top_ten_vector),
        ("Cosine: ", inv_idx_wiki.get_top_ten_cosine),
        ("Boolean: ", inv_idx_wiki.get_top_ten_boolean),
    ):
        print(heading)
        resolve_results(retrieve(term))

Showing some more complex boolean retrievals:

In [None]:
if not interactive:
    # Each boolean expression is echoed first and then evaluated against the
    # HTML index, so the output pairs every query with its results.
    print("Some more complex boolean searches...\n")
    for boolean_query in (
        'not american',
        '(american and airport) or sugar',
        'not ((document or sugar) and american)',
    ):
        print(boolean_query)
        resolve_results(inv_idx_wiki.get_top_ten_boolean(boolean_query))

### TXT Corpus

Comparison of the retrieval methods on one example:

In [None]:
if not interactive:
    # Send the same single-term query through all three retrieval methods of
    # the TXT index so their results can be compared one after another.
    print("Comparing the different retrievals on one example (query=sugar)...\n")
    term = 'sugar'
    for heading, retrieve in (
        ("Vector: ", inv_idx_txt.get_top_ten_vector),
        ("Cosine: ", inv_idx_txt.get_top_ten_cosine),
        ("Boolean: ", inv_idx_txt.get_top_ten_boolean),
    ):
        print(heading)
        resolve_results(retrieve(term))

Showing some more complex boolean retrievals:

In [None]:
if not interactive:
    # Each boolean expression is echoed first and then evaluated against the
    # TXT index, so the output pairs every query with its results.
    print("Some more complex boolean searches...\n")
    for boolean_query in (
        'not sugar',
        '(sugar and airport) or (throughout and january)',
        'not (sugar or cargo or hello)',
    ):
        print(boolean_query)
        resolve_results(inv_idx_txt.get_top_ten_boolean(boolean_query))

## Interactive Mode

In [None]:
import time

def interactive_mode(prompt_delay=5):
    """Run a prompt loop for querying the two corpora by hand.

    The loop starts on the HTML corpus (``inv_idx_wiki``) in ``vector`` mode.
    Input beginning with ``--`` is treated as a command:

    * ``--help`` / ``--h``: print the command overview.
    * ``--corpus`` / ``--c``: toggle between the HTML and the TXT corpus.
    * ``--modus`` / ``--m``: ask for a retrieval mode (vector, cosine, boolean).
    * ``--exit`` / ``--e``: leave the loop.

    Any other non-empty input is sent as a query to the active corpus using
    the active retrieval mode, and the hits are shown via ``resolve_results``.
    Surrounding whitespace is stripped from all input; blank input is rejected
    instead of being passed to the index.

    Args:
        prompt_delay: Seconds to wait before each prompt. Defaults to 5, the
            value that used to be hard-coded; pass 0 to prompt immediately.
    """
    # Maps each user-facing mode name to the index method implementing it.
    retrieval_methods = {
        'vector': 'get_top_ten_vector',
        'cosine': 'get_top_ten_cosine',
        'boolean': 'get_top_ten_boolean',
    }
    used_corpus = inv_idx_wiki
    used_mode = 'vector'
    while True:
        # NOTE(review): the pause presumably lets earlier output render before
        # the input box appears — confirm; it is adjustable via prompt_delay.
        if prompt_delay and prompt_delay > 0:
            time.sleep(prompt_delay)
        user_input = input('> ').strip()

        if not user_input:
            # A blank line is not a meaningful query; do not bother the index.
            print('> Empty query, try again...')
            continue
        if user_input in ('--help', '--h'):
            print("""> --help   or --h to get this help text.
> --corpus or --c to switch corpora.
> --modus  or --m to switch to a different method of retrieval (\"vector\", \"cosine\", or \"boolean\").
> --exit   or --e to exit this here.""")
            continue
        if user_input in ('--corpus', '--c'):
            # Identity check: we toggle between two specific index objects and
            # must not depend on how the index class defines equality.
            used_corpus = inv_idx_txt if used_corpus is inv_idx_wiki else inv_idx_wiki
            corpus_response = 'HTML' if used_corpus.html else 'TXT'
            print('> Successfully switched corpus to {0}!'.format(corpus_response))
            continue
        if user_input in ('--exit', '--e'):
            print('> Exiting...')
            break
        if user_input in ('--modus', '--m'):
            print('> Which mode do you want to use: \"vector\", \"cosine\", or \"boolean\"?')
            user_input_mode = input('> Enter vector, cosine, or boolean for your wished mode: ')
            user_input_mode = user_input_mode.strip().lower()
            if user_input_mode not in retrieval_methods:
                print('> Invalid entry...')
            else:
                print('> Successfully switched to {0}!'.format(user_input_mode))
                used_mode = user_input_mode
            continue
        if user_input.startswith('--'):
            # Unknown command: report it and prompt again.
            print('> Error occurred, try again...')
            continue

        try:
            retrieve = getattr(used_corpus, retrieval_methods[used_mode])
            res = retrieve(user_input)
            print('> The results for the query {0} in mode {1} are as followed: '.format(user_input, used_mode))
            resolve_results(res)
        except Exception:
            # Best-effort prompt loop: a failing query must not end the session.
            print('> Error occurred during retrieval, try again...')

Interactive mode is optional and only starts when `interactive` is `True` (default: off):

In [None]:
# Hand control to the prompt loop only when the notebook-wide `interactive`
# flag is set.
if interactive:
    print("Skipped examples... Entering interactive mode!")
    # Short pause before the loop starts (presumably so the message above is
    # visible first — TODO confirm).
    time.sleep(3)
    interactive_mode()