# Libary Imports and Downloads

In [1]:
import re
import nltk
import os
import json
import warnings
import time

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

from IPython.display import clear_output
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize,RegexpTokenizer
from nltk.stem import WordNetLemmatizer

stops = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\korni\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\korni\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\korni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\korni\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Tokenization and File Reading functions

In [2]:
#reading in the file in try block, throws exception if file is not found 
def read_file(file):
    try:    
        with open(file, "r", encoding= "utf-8") as html_file:
            html_contents = html_file.read()
            html_file.close()
            return html_contents

    except FileNotFoundError as err:
        print(f"FileNotFoundError successfully handled\n" f"{err}")
        pass

In [3]:
# Creates bigrams from tokens, if bigram occurs more than 5 times, joins the two tokens by "-" and adds it to the list 
def add_bigrams(text):
    marvel_bigrams = nltk.bigrams(text)
    frequency_bigrams = nltk.FreqDist(marvel_bigrams)
    
    bigrams_list = []
    for key, value in frequency_bigrams.items():
        if value > 5:
            bigrams_list.append('-'.join(key))
    
    return(bigrams_list)


def tokenize(text, file_type = 'html'):
    cleaned_tokens = []
    if file_type == 'html':
        soup = BeautifulSoup(text, 'html.parser')
        tags = soup.find_all(['p','span','h1','h2','h3','h4','h5'])
        for t in tags:
            #convert everything to lowercase
            low_tags = t.get_text().lower()
            #remove everything between square brackets
            square_regex_tags = re.sub(r'\[.*?\]', '', str(low_tags))
            #remove everything that isnt a lower case letter or number or dash
            regex_tags = re.sub(r'[^a-z\d\s-]', u"", square_regex_tags)
            #lemmatize
            lemmatize_tags = lemmatizer.lemmatize(regex_tags)
            #tokenize
            text_tokens = word_tokenize(lemmatize_tags)
            #remove stopwords
            cleaned_tokens += [t for t in text_tokens if t not in stops]
        #get bigrams
        cleaned_tokens += add_bigrams(cleaned_tokens)
    else:
        cleaned_tokens = word_tokenize(text)
          
    return cleaned_tokens

In [4]:
def iterate_over_files(directory):
    it_list = []
    i = -1
    docIDs = {}
    with os.scandir(directory) as element:
        for entry in element:
            if entry.name.endswith(".html") and entry.is_file():
                i += 1
                docIDs[i] = entry.name
                it_list.append(tokenize(read_file(entry.path), 'html'))
                
    with open ("docids.json", "w") as docids_file:
        json.dump(docIDs,docids_file, indent = 2)
    docids_file.close()
        
    return it_list, docIDs

# Index functions

In [5]:
def index_builder(text):
    postings = {}
    vocab = []
    vocabdict = {}
    i = -1
    # outter loop is used to iterate through the files in the directory
    for z, noLists in enumerate(text):
    # inner loop is used to iterate through the words in each file    
        for count, item in enumerate(text[z]):
            if item not in vocab:
                i+=1
                vocab.append(item)
                
            vocabID = vocab.index(item)
            
            if vocabID not in postings:
                vocabdict[item] = i
                postings[vocabID] = [z]
            else:
                if z not in postings[vocabID]:
                    postings[vocabID].append(z)
            
    with open("postings.json", "w") as postings_file:
        json.dump(postings,postings_file, indent = 2)
    postings_file.close()
    
    with open("vocab.json", "w") as vocab_file:
        json.dump(vocabdict,vocab_file, indent = 2)
    vocab_file.close()
    
    return vocabdict, postings

In [6]:
def build(directory = ''):
    if directory == '':
        while True:
            directory = input("please enter a directory name ")
            if os.path.exists(directory):
                break
            print(directory + " is not a valid directory")
    
    startTokens = time.time()
    print("Iterating over all files in folder " + directory + " and extracting tokens")
    vocabTokens, docIDs = iterate_over_files(directory)
    print("token extraction complete in " + str(time.time() - startTokens) + " seconds")
    startIndex = time.time()
    print("Building index")
    vocab, postings = index_builder(vocabTokens)
    print("index built in " + str(time.time() - startIndex) + " seconds")
    
    return vocab, postings, docIDs

# Query functions

In [7]:
#Try and create a word if current word isnt in vocab - example: ironman > iron-man
def error_checker(word, vocab):
    if word not in vocab:
        warnings.warn("Warning " + word + " not found, attempting to find a similar word")
        try:
            for letter in range(len(word)):
                new_word = word
                new_word = word[:letter] + '-' + word[letter:]
                if new_word in vocab:
                    print('\n' + word + ' not found, instead searching for "' + new_word.replace('-', ' ') + '"')
                    return new_word
            raise KeyError
        except KeyError as err1:
            print(word + f" not found. Closing program:\n" f"{err1}")
            pass
    else:
        return(word)

In [8]:
#BOOLEAN RETRIEVAL operations

def and_operation(posting1,posting2):
    answer = []
    for posting in posting1:
        if posting in posting2:
            answer.append(posting)     
    return answer

#set used here as set cannot contain duplicate values
def or_operation(posting1,posting2):
    answer = posting1 + posting2  
    return list(set(answer))

def not_operation(posting1, posting2):
    answer = []
    for posting in posting1:
        if posting not in posting2:
            answer.append(posting)
    return answer

In [9]:
#splits query into sections with AND, OR and NOT as the delimiter

def query_splitter(query):
    clean_query = []
    current_word = ''
    for count, word in enumerate(query):
        if word == 'not' or word == 'and' or word == 'or':
            clean_query.append(current_word.rstrip(current_word[-1]))
            current_word = ''
            clean_query.append(word)
        else:
            current_word += word + '-'
    
    clean_query.append(current_word.rstrip(current_word[-1]))
    
    return clean_query

In [10]:
## Boolean AND, OR, NOT query

def query_wizard(query, vocab, postings, docIDs):
    query_result_docs = []
    
    query_list = query_splitter(tokenize(query.lower(), ''))
    query_list_length = len(query_list)
    
    vocabID = vocab[error_checker(query_list[0], vocab)]
    query_result = postings[vocabID]

    for index, word in enumerate(query_list):
        if word == "and" or word == "or" or word =="not" and query_list_length > index + 1:
            query_result_update = []
            term_2 = query_list[index + 1]
            if term_2 not in vocab: term_2 = error_checker(term_2, vocab)
            if term_2 in vocab: posting1 = postings[vocab[term_2]]
            else: raise Exception(term_2 + " does not exist, program quitting")
            posting = query_result
            if word == "and":
                 results = and_operation(posting1,posting)
            elif word == "or":
                results = or_operation(posting1,posting)
            elif word == "not": 
                results = not_operation(posting, posting1)
                    
            for result in results:
                query_result_update.append(result)

            query_result = query_result_update

    for result in query_result:
        query_result_docs.append(docIDs[result])
    
    return query_result_docs

In [11]:
def main(vocab, postings, docIDs):
    while True:
        query = input("""What would you like to query? (type quit to exit, type forceList to print as a List)""")
        start = time.time()
        clear_output(wait=True)
        print(query)
        if query == "quit": break
        if query == "forceList":
            query = input("What would you like to query with a printed list? ")
            print(query_wizard(query, vocab, postings, docIDs))
            total_time = time.time() - start
        else:
            URLS = query_wizard(query, vocab, postings, docIDs)
            total_time = time.time() - start
            for url in URLS:
                print(url)

        print("time elapsed: " + str(total_time) + " seconds")

# Main Program - Run in order

In [12]:
# Global variables, saves rebuilding the tables every time the program runs
# If you would rather not type out the path in the input() field you can enter it directly into build('/path/goes here')
vocab, postings, docIDs = build("marvelWiki")

Iterating over all files in folder marvelWiki and extracting tokens
token extraction complete in 93.35526299476624 seconds
Building index
index built in 135.78088688850403 seconds


In [16]:
# The complete search engine, enter anything and as long as the words exist in the vocab table it will run
# if it doesnt exist, it will try and find a similar word (see error_checker) and if that doesn't exist it will
# throw an exception
try:
    main(vocab, postings, docIDs)
except KeyError:
    print("Exiting due to keyError")



Ironman AND Thor OR Shang Chi NOT Hawkeye

ironman not found, instead searching for "iron man"
Aleta_Ogord_(Earth-691).html
Anthony_Druid_(Earth-616).html
Anthony_Stark_(Earth-96020).html
Brian_Braddock_(Earth-616).html
Captain_Universe_(Earth-616).html
Charlie-27_(Earth-691).html
Crystalia_Amaquelin_(Earth-616).html
Dane_Whitman_(Earth-616).html
Delroy_Garrett_Jr._(Earth-616).html
Dennis_Dunphy_(Earth-616).html
Dennis_Sykes_(Earth-616).html
Eden_Fesi_(Earth-616).html
Elvin_Haliday_(Earth-616).html
Eric_Brooks_(Earth-616).html
Eric_Masterson_(Earth-616).html
Eros_(Earth-616).html
Eugene_Thompson_(Earth-616).html
Genis-Vell_(Earth-98120).html
Gilgamesh_(Earth-616).html
Giuletta_Nefaria_(3rd_Bio-Duplicate)_(Earth-616).html
Heather_Douglas_(Earth-616).html
Imperial_Guard_(Earth-616).html
Isabel_Kane_(Earth-616).html
James_Howlett_(Earth-616).html
Jane_Foster_(Earth-616).html
Jocasta_Pym_(Earth-616).html
Jonathan_Hart_(Earth-616).html
Julia_Carpenter_(Earth-616).html
Kelsey_Leigh_(Earth-61

KeyboardInterrupt: Interrupted by user

# Tests - test to see if everything is working correctly

In [18]:
#find the average length of time to run a query over a large number of times (defined by length)    
def test_iteration_counter(query, vocab, postings, docIDs):
    length = 5000
    total_time1 = 0
    for i in range(length):
        start = time.time()
        query_wizard(query, vocab, postings, docIDs)
        total_time = time.time() - start
        total_time1 += total_time
    print(total_time1/length)
    
test_iteration_counter("ironman", vocab, postings, docIDs)




ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iro

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron


ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iro

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron


ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iro


ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iro

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron man"

ironman not found, instead searching for "iron

## WARNING do not run after main program, as it will overwrite the JSON files containing vocab, postings and docIDs
To run this cell you must type the letter y in the text box

In [17]:
if input("run tests? (y/n)") == 'y':
    #________________________________ read_file test
    file_test = read_file("marvelWiki/Aleta_Ogord_(Earth-691).html")
    print("read_file functioning: " + str(bool(BeautifulSoup(file_test, "html.parser").find())))

    #________________________________ tokenize test
    tokenize_test = tokenize(file_test)
    print("tokenize functioning: " + str(bool("aleta" in tokenize_test)))

    #________________________________ bigrams test
    bigrams_test = ['j','l','j','j','l','j','j','l','j','j','l','j','j','l','j','j','l','j','j','l','j',]
    print("add_bigrams functioning: " + str(bool("j-j" in add_bigrams(bigrams_test))))

    #________________________________ iterate_over_files test
    try:
        os.mkdir('marvelWiki2')
    except OSError as error:
        pass

    for i in range(3):
        with open('marvelWiki2/htmlTest' + str(i) + '.html', 'w') as f:
            f.write('<p>hel-lo</p>')
    list_test, docids_test = iterate_over_files('marvelWiki2')
    print("iterate_over_files functioning: " + str(bool(len(list_test) == 3)))

    #________________________________ index_builder test

    vocab_test, index_test = index_builder(list_test)
    print("index_builder functioning: " + str(bool(vocab_test['hel-lo'] == 0 and len(index_test[0]) == 3))) 

    #________________________________ build test

    vocab_test, postings_test, docIDs_test = build('marvelWiki2') #input marvelwiki2
    print("build functioning: " + str(bool(vocab_test['hel-lo'] == 0 
                                           and len(postings_test[0]) == 3 
                                           and docIDs_test[1] == 'htmlTest1.html'))) 

    #_________________________________ error_checker test

    print("error_checker functioning: " + str(bool(error_checker('hello', vocab_test) == 'hel-lo')))

    #__________________________________ and_operator test

    posting1_test = [0, 2, 3, 4, 5]
    posting2_test = [0, 1 , 2, 6]

    print("and_operator functioning: " + str(bool(and_operation(posting1_test, posting2_test) == [0, 2])))

    #__________________________________ or_operator test

    print("or_operator functioning: " + str(bool(or_operation(posting1_test, posting2_test) == [0, 1, 2, 3, 4 , 5, 6])))

    #__________________________________ not_operator test

    print("not_operator functioning: " + str(bool(not_operation(posting1_test,posting2_test) == [1, 6])))

    #__________________________________ query_splitter test

    query_split_test = ['manic', 'monkey', 'and', 'smelly', 'green']
    print("query_splitter functioning: " + str(bool(query_splitter(query_split_test) == ['manic-monkey', 'and', 'smelly-green'])))

    #__________________________________ query_wizard test

    print("query_wizard functioning: " + str(bool(query_wizard('hel-lo not hello', vocab_test, postings_test, docIDs_test) == [])))

run tests? (y/n)y
read_file functioning: True
tokenize functioning: True
add_bigrams functioning: True
iterate_over_files functioning: True
index_builder functioning: True
Iterating over all files in folder marvelWiki2 and extracting tokens
token extraction complete in 0.005003452301025391 seconds
Building index
index built in 0.0029952526092529297 seconds
build functioning: True

hello not found, instead searching for "hel lo"
error_checker functioning: True
and_operator functioning: True
or_operator functioning: True
not_operator functioning: False
query_splitter functioning: True

hello not found, instead searching for "hel lo"
query_wizard functioning: True


