In [1]:
from words import get_text, words, filenames
import os

# Search Engine Comparison: Linear Search vs. Hash Table Search
Let us imagine a scenario where we want to search a collection of files and return every file that contains all of the terms in a given list. We will carry out this objective with both a traditional linear search and a method that uses a hash table.

In [2]:

def filelist(root):
    """Return the path of every file found under ``root``, searched recursively.

    Args:
        root: Directory to walk. A leading ``~`` is expanded to the user's
            home directory.

    Returns:
        A list of file paths, each formed by joining the containing directory
        with the file name, in the order ``os.walk`` yields them.
    """
    top = os.path.expanduser(root)
    return [
        os.path.join(dirpath, name)
        for dirpath, _dirnames, names in os.walk(top)
        for name in names
    ]
# Collect every file path under the local Slate data directory; `files` is the
# list of documents that the search cells below operate on.
files = filelist("~/data/slate/")

In [3]:
files

['/Users/tingpan/data/slate/.DS_Store',
 '/Users/tingpan/data/slate/50/ArticleIP_28629.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_25319.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_27730.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_27917.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_21209.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_21547.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_22728.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_21553.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_29696.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_29331.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_27452.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_27446.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_23392.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_21234.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_28210.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_21208.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_22067.txt',
 '/Users/tingpan/data/slate/50/ArticleIP_28238.txt',
 '/Use

In [43]:
def linear_search(files, terms):
    """Return the files that contain every term in ``terms`` by scanning each file.

    Args:
        files: Iterable of file paths to scan.
        terms: Iterable of hashable search terms. A file matches only when all
            of them occur in its word list.

    Returns:
        ``filenames(matches)``, where ``matches`` is the list of matching
        paths in the order they appear in ``files``.

    Note:
        ``get_text``, ``words`` and ``filenames`` come from the project's
        ``words`` module. Presumably they read a file's text, split it into
        words, and reduce paths to bare file names -- confirm against that
        module.
    """
    # Build the query set once instead of once per file.
    wanted = set(terms)
    # Test each document as it is read so that only one document's words are
    # held in memory at a time, rather than the whole corpus.
    matches = [path for path in files if wanted <= set(words(get_text(path)))]
    return filenames(matches)


# Example query: linear_search keeps a file only if its words include every one of these terms.
terms = ['picture','perfect','for','and','because','since']
# Last expression in the cell, so the list of matching files is displayed.
linear_search(files,terms)

['Article247_3653.txt',
 'Article247_4014.txt',
 'ArticleIP_1268.txt',
 'ArticleIP_1553.txt',
 'ArticleIP_2725.txt',
 'ArticleIP_2911.txt',
 'ArticleIP_2922.txt',
 'ArticleIP_2941.txt',
 'ArticleIP_4062.txt',
 'ArticleIP_20511.txt',
 'ArticleIP_25134.txt',
 'ArticleIP_25161.txt',
 'ArticleIP_32882.txt',
 'ArticleIP_38823.txt',
 'ArticleIP_56271.txt']

In [48]:
%time linear_search(files,terms)

CPU times: user 1.18 s, sys: 93.4 ms, total: 1.28 s
Wall time: 1.28 s


['Article247_3653.txt',
 'Article247_4014.txt',
 'ArticleIP_1268.txt',
 'ArticleIP_1553.txt',
 'ArticleIP_2725.txt',
 'ArticleIP_2911.txt',
 'ArticleIP_2922.txt',
 'ArticleIP_2941.txt',
 'ArticleIP_4062.txt',
 'ArticleIP_20511.txt',
 'ArticleIP_25134.txt',
 'ArticleIP_25161.txt',
 'ArticleIP_32882.txt',
 'ArticleIP_38823.txt',
 'ArticleIP_56271.txt']

In [44]:
# Hash table implemented as a list of buckets, each holding (key, value) pairs

def htable(nbuckets):
    """Create an empty hash table with ``nbuckets`` buckets.

    Args:
        nbuckets: Number of buckets to allocate.

    Returns:
        A list of ``nbuckets`` distinct empty lists, one per bucket.
    """
    # Build each bucket separately: ``[[]] * nbuckets`` would repeat a reference
    # to one single list, so an in-place change to any bucket would appear in
    # every bucket.
    return [[] for _ in range(nbuckets)]

def hashcode(o):
    """Compute an integer hash code for an int or a str.

    Args:
        o: The object to hash.

    Returns:
        ``o`` itself when it is an int; for a str, the polynomial hash
        obtained by folding the characters' code points with multiplier 31
        (``0`` for the empty string); ``None`` for any other type.
    """
    if isinstance(o, int):
        return o
    if not isinstance(o, str):
        # Unsupported key type.
        return None
    acc = 0
    for ch in o:
        acc = 31 * acc + ord(ch)
    return acc

def bucket_indexof(table, key):
    """Return the index of the bucket in ``table`` that ``key`` maps to.

    The index is the key's ``hashcode`` reduced modulo the number of buckets.
    """
    code = hashcode(key)
    return code % len(table)
    
def htable_put(table, key, value):
    """Store ``value`` under ``key`` in ``table``, replacing any previous entry.

    Args:
        table: Hash table represented as a list of buckets.
        key: Key to insert or overwrite.
        value: Value to associate with ``key``.

    Returns:
        The same ``table`` object, with the key's bucket replaced.
    """
    slot = bucket_indexof(table, key)

    # Copy every pair stored under a different key into a fresh list; the old
    # bucket list itself is never modified in place.
    kept = []
    for existing_key, existing_value in table[slot]:
        if existing_key != key:
            kept.append((existing_key, existing_value))

    # The new pair goes last, then the fresh list becomes the bucket.
    table[slot] = kept + [(key, value)]
    return table

def htable_get(table, key):
    """Look up ``key`` in ``table``.

    Returns:
        The value of the first pair in the key's bucket whose key equals
        ``key``, or ``None`` when the bucket holds no such pair.
    """
    bucket = table[bucket_indexof(table, key)]
    # First matching entry wins; fall back to None when nothing matches.
    return next((entry[1] for entry in bucket if key == entry[0]), None)


In [45]:
def myhtable_create_index(files, nbuckets=4011):
    """Build an index mapping each word to the list of files that contain it.

    Args:
        files: Iterable of file paths to index.
        nbuckets: Number of buckets in the hash table. Defaults to 4011, the
            size that was previously hard-coded.

    Returns:
        The hash table created by ``htable`` whose keys are words and whose
        values are lists of the paths from ``files`` whose text contains that
        word.

    Note:
        ``get_text`` and ``words`` come from the project's ``words`` module;
        presumably they read a file's text and split it into words -- confirm
        against that module.
    """
    wordtab = htable(nbuckets)
    for doc in files:
        # De-duplicate so a document is recorded once per distinct word.
        for word in set(words(get_text(doc))):
            postings = htable_get(wordtab, word)
            if postings:
                # This list is already the value stored in the table, so
                # extending it in place is enough. Calling htable_put again
                # would rebuild the whole bucket for every (word, document)
                # pair.
                postings.append(doc)
            else:
                wordtab = htable_put(wordtab, word, [doc])
    return wordtab


def myhtable_index_search(files, index, terms):
    """Return the files in ``files`` that contain every term in ``terms``.

    Args:
        files: Iterable of file paths the result is restricted to.
        index: Hash table mapping each word to the list of files that contain
            it, as built by ``myhtable_create_index``.
        terms: Iterable of search terms.

    Returns:
        A set of file paths. The set is empty when ``terms`` is empty or when
        any term has no entry in the index.
    """
    matches = None
    for term in terms:
        postings = htable_get(index, term)
        if not postings:
            # No indexed file contains this term, so no file can contain all
            # of the terms. Skipping the term instead would report files that
            # match only the remaining terms.
            return set()
        docs = set(postings)
        # Narrow the candidates to files that also contain this term.
        matches = docs if matches is None else matches & docs

    if matches is None:
        # No terms were given.
        return set()

    # Keep only the files the caller asked about.
    return matches.intersection(files)

In [49]:
%time index = myhtable_create_index(files)

CPU times: user 14.7 s, sys: 57.5 ms, total: 14.8 s
Wall time: 14.8 s


In [53]:
%time myhtable_index_search(files, index, terms)

CPU times: user 609 µs, sys: 0 ns, total: 609 µs
Wall time: 613 µs


{'/home/karthik/data/slate/13/Article247_3653.txt',
 '/home/karthik/data/slate/17/Article247_4014.txt',
 '/home/karthik/data/slate/24/ArticleIP_1268.txt',
 '/home/karthik/data/slate/27/ArticleIP_1553.txt',
 '/home/karthik/data/slate/39/ArticleIP_2725.txt',
 '/home/karthik/data/slate/40/ArticleIP_2911.txt',
 '/home/karthik/data/slate/40/ArticleIP_2922.txt',
 '/home/karthik/data/slate/40/ArticleIP_2941.txt',
 '/home/karthik/data/slate/48/ArticleIP_4062.txt',
 '/home/karthik/data/slate/50/ArticleIP_20511.txt',
 '/home/karthik/data/slate/50/ArticleIP_25134.txt',
 '/home/karthik/data/slate/50/ArticleIP_25161.txt',
 '/home/karthik/data/slate/51/ArticleIP_32882.txt',
 '/home/karthik/data/slate/51/ArticleIP_38823.txt',
 '/home/karthik/data/slate/53/ArticleIP_56271.txt'}