# Test Notebook

## Code

In [1]:
import itertools
import math
import os
import random
import string
import subprocess

In [2]:
# Move up to the directory that holds data/, bin/, tmp/ and output_files/,
# which every later cell addresses with relative paths. The guard makes the
# cell idempotent: re-running it no longer climbs one more directory each time.
if not os.path.isfile("data/american_english_sorted.txt"):
    os.chdir("..")

Read the dictionary

In [3]:
# Load the dictionary: one entry per line, trailing newline dropped.
with open("data/american_english_sorted.txt", 'r') as f:
    words = [line.rstrip('\n') for line in f]

Generate random prefixes

In [4]:
# How many random prefixes to generate for the autocomplete / binary-search checks.
num_samples = 10_000

In [5]:
# Build the list of test prefixes: sample distinct dictionary words and cut
# each one at a random position, giving a non-empty prefix that is strictly
# shorter than the word.
#
# Only words with at least two characters can be cut that way, so they are
# filtered up front (the previous retry loop could spin forever when no such
# word existed). The sample size is capped so a small dictionary does not make
# random.sample raise.
eligible = [w for w in words if len(w) >= 2]
prefixes = []
for word in random.sample(eligible, min(num_samples, len(eligible))):
    # Cut point in [1, len(word) - 1]: never empty, never the whole word.
    rand_int = random.randint(1, len(word) - 1)
    prefix = word[:rand_int]
    prefixes.append(prefix)

Save prefix examples to file

In [6]:
# Persist the prefixes, one per line, to tmp/examples.txt.
os.makedirs("tmp", exist_ok=True)
with open("tmp/examples.txt", "w") as f:
    f.writelines(f"{x}\n" for x in prefixes)

In [7]:
# Display the first five generated prefixes.
prefixes[:5]

['no', 'h', 'Mohamme', 'lac', 'dioxi']

To generate random prefixes with a fixed length:

In [8]:
def get_random(prefix_len: int):
    """Return the first ``prefix_len`` characters of a random dictionary word.

    The word is drawn uniformly from the entries of ``words`` that are at
    least ``prefix_len`` characters long.

    Raises:
        ValueError: if no word in ``words`` is long enough (the previous
            retry loop never terminated in that case).
    """
    candidates = [w for w in words if len(w) >= prefix_len]
    if not candidates:
        raise ValueError(
            f"no dictionary word has at least {prefix_len} characters"
        )
    return random.choice(candidates)[:prefix_len]

def get_result(prefix: str):
    """Return every entry of ``words`` that starts with ``prefix``, in list order."""
    matches = []
    for candidate in words:
        if candidate.startswith(prefix):
            matches.append(candidate)
    return matches

In [9]:
# Draw a 4-character prefix and list the dictionary words that start with it.
prefix = get_random(prefix_len=4)
results = get_result(prefix)
print(prefix, results, sep="\n")

Jidd
['Jidda', "Jidda's"]


In [16]:
def gen_results():
    """Run the C++ ``autocomplete`` binary over ``tmp/examples.txt``.

    Any existing ``output_files/output_autocomplete.txt`` is removed first so
    that output from an earlier run is not read back afterwards; a missing
    file is not an error. The binary is started with ``bin`` as its working
    directory via ``cwd=`` instead of changing the notebook's own working
    directory, so a failure to launch it can no longer leave the kernel
    stranded inside ``bin``.

    A non-zero exit status is reported together with the captured stderr.
    """
    try:
        os.remove("output_files/output_autocomplete.txt")
    except FileNotFoundError:
        # First run: there is nothing to clean up yet.
        pass

    completed = subprocess.run(
        ["./autocomplete", "../tmp/examples.txt"],
        cwd="bin",
        text=True,
        capture_output=True,
    )
    if completed.returncode != 0:
        print(f"autocomplete exited with code {completed.returncode}")
        print(completed.stderr)

def get_cpp_results():
    """Yield the result lists stored in ``output_files/output_autocomplete.txt``.

    The file is read as a sequence of groups: a line containing only ``[``
    opens a group, a line containing only ``]`` closes it, and every other
    non-blank line (stripped) is one entry. A list is yielded each time a
    closing ``]`` is seen.
    """
    current = []
    with open("output_files/output_autocomplete.txt", 'r') as f:
        for raw in f:
            token = raw.strip()
            if token == ']':
                yield current
                current = []
            elif token and token != '[':
                current.append(token)
            

def gen_expected_results():
    """Yield, for each line of ``tmp/examples.txt``, the matching dictionary words.

    Each line is stripped and used as a prefix; the yielded list holds the
    entries of ``words`` that start with it, in ``words`` order.
    """
    with open("tmp/examples.txt", 'r') as f:
        for raw in f:
            key = raw.strip()
            yield list(filter(lambda entry: entry.startswith(key), words))

def check_autocomplete():
    """Compare the C++ autocomplete output with the Python reference.

    Reads the prefixes from ``tmp/examples.txt`` and walks them in lockstep
    with ``get_cpp_results()`` and ``gen_expected_results()``. Progress is
    printed per prefix; on the first mismatch the differing entries are
    printed and the check stops.

    Returns:
        True when every prefix produced identical lists and the C++ output
        contains exactly one result list per prefix, False otherwise.
    """
    with open("tmp/examples.txt", 'r') as f:
        keys = [line.strip() for line in f]

    # A plain zip() would stop silently at the shortest input, so a truncated
    # C++ output file would look like a pass. Pad with a sentinel instead.
    missing = object()
    rows = itertools.zip_longest(
        keys, get_cpp_results(), gen_expected_results(), fillvalue=missing
    )
    for key, l1, l2 in rows:
        if key is missing or l1 is missing or l2 is missing:
            print("Number of C++ result lists does not match number of prefixes")
            return False
        if l1 != l2:
            print(f"Failed for {key}")
            expected_set = set(l2)
            actual_set = set(l1)
            # Entries only in the C++ output, then entries only in the reference.
            print([x for x in l1 if x not in expected_set])
            print([x for x in l2 if x not in actual_set])
            return False
        print(f"Checked {key}")
    return True

In [17]:
# Run the C++ autocomplete binary on the saved prefix examples.
gen_results()

In [18]:
# Compare the C++ autocomplete output with the Python reference results.
check_autocomplete()

Checked di
Checked Powe
Checked ony
Checked Sveng
Checked tonsu
Checked leav
Checked c
Checked dr
Checked smu
Checked bobwhite
Checked hermita
Checked ground
Checked inu
Checked K
Checked zo
Checked onslaug
Checked comprehe
Checked am
Checked cru
Checked clearinghouse'
Checked Chang
Checked Myst'
Checked todd
Checked cro
Checked F
Checked e
Checked intrica
Checked phonolo
Checked c
Checked uncommon
Checked umb
Checked Hellm
Checked mirthf
Checked mispronunci
Checked Robes
Checked emolum
Checked Massachuse
Checked unan
Checked pl
Checked wayfarer'
Checked ser
Checked lo
Checked sette
Checked ventri
Checked nexu
Checked in
Checked Miss
Checked deafnes
Checked Evangel
Checked com
Checked bere
Checked c
Checked wi
Checked sympathiz
Checked blows
Checked Sophoclean
Checked wadi
Checked hypoglyce
Checked randomize
Checked swishe
Checked ju
Checked Cl
Checked commanda
Checked airpl
Checked cou
Checked doctrinair
Checked disambigua
Checked Loe
Checked motorca
Checked si
Checked wrongdoe
Checke

# Binary Search

In [19]:
def get_bin_search_results(text: str):
    """Run the C++ ``binary_search`` binary for ``text`` and return its output lines.

    The binary is started with ``bin`` as its working directory through
    ``cwd=`` rather than ``os.chdir``, so an exception while launching it or
    while reading the output cannot leave the notebook's working directory
    pointing at ``bin``.

    Returns:
        The lines of ``output_files/output_bin_search.txt`` with newlines removed.
    """
    subprocess.run(["./binary_search", text], cwd="bin")
    with open("output_files/output_bin_search.txt", 'r') as f:
        return [line.replace('\n', '') for line in f.readlines()]

def check_bin_search(text):
    """Return True when the C++ binary search output for ``text`` equals the
    list of entries in ``words`` that start with ``text`` (same order)."""
    actual = get_bin_search_results(text)
    expected = list(filter(lambda entry: entry.startswith(text), words))
    return actual == expected

In [20]:
# Check every generated prefix against the binary-search binary and stop at
# the first one whose output differs from the Python reference.
for prefix in prefixes:
    print(f"Checking {prefix}")
    if check_bin_search(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking di
Checking Powe
Checking ony
Checking Sveng
Checking tonsu
Checking leav
Checking c
Checking dr
Checking smu
Checking bobwhite
Checking hermita
Checking ground
Checking inu
Checking K
Checking zo
Checking onslaug
Checking comprehe
Checking am
Checking cru
Checking clearinghouse'
Checking Chang
Checking Myst'
Checking todd
Checking cro
Checking F
Checking e
Checking intrica
Checking phonolo
Checking c
Checking uncommon
Checking umb
Checking Hellm
Checking mirthf
Checking mispronunci
Checking Robes
Checking emolum
Checking Massachuse
Checking unan
Checking pl
Checking wayfarer'
Checking ser
Checking lo
Checking sette
Checking ventri
Checking nexu
Checking in
Checking Miss
Checking deafnes
Checking Evangel
Checking com
Checking bere
Checking c
Checking wi
Checking sympathiz
Checking blows
Checking Sophoclean
Checking wadi
Checking hypoglyce
Checking randomize
Checking swishe
Checking ju
Checking Cl
Checking commanda
Checking airpl
Checking cou
Checking doctrinair
Checking disamb

# Levenshtein

In [21]:
import textdistance

In [22]:
# Reference edit-distance function, taken from the `textdistance` package.
retrieve_levestein_dist = textdistance.levenshtein

In [23]:
def levestein(text, max_dist):
    """Return the entries of ``words`` whose distance to ``text``, as computed
    by ``retrieve_levestein_dist``, is at most ``max_dist`` (in ``words`` order)."""
    return [
        candidate
        for candidate in words
        if retrieve_levestein_dist(candidate, text) <= max_dist
    ]

def get_cpp_levestein_results(text: str, dist: int):
    """Run the C++ ``levestein`` binary and return its output lines, sorted.

    The binary receives ``text`` and ``dist`` (as a string) as arguments and is
    started with ``bin`` as its working directory through ``cwd=`` rather than
    ``os.chdir``, so an exception while launching it or while reading the
    output cannot leave the notebook's working directory pointing at ``bin``.

    Returns:
        The lines of ``output_files/output_levestein.txt`` with newlines
        removed, sorted with ``list.sort()``.
    """
    subprocess.run(
        ["./levestein", text, str(dist)],
        cwd="bin",
        text=True,
        capture_output=True,
    )
    with open("output_files/output_levestein.txt", 'r') as f:
        result = [line.replace('\n', '') for line in f.readlines()]
    result.sort()
    return result

def check_levestein(text, dist):
    """Return True when the C++ and Python edit-distance searches agree.

    ``get_cpp_levestein_results`` returns a sorted list while ``levestein``
    returns matches in dictionary-file order, so both sides are sorted before
    comparing; the result then does not depend on how the dictionary file
    happens to be ordered.
    """
    actual = sorted(get_cpp_levestein_results(text, dist))
    expected = sorted(levestein(text, dist))
    return actual == expected

In [24]:
# Matches reported by the C++ binary for "AAA" with maximum distance 1.
get_cpp_levestein_results("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [25]:
# Matches from the Python reference for the same query.
levestein("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [26]:
# The two implementations should agree on this query.
check_levestein("AAA", 1)

True

In [27]:
# Probability of each edit action, indexed as
# [delete, add, replace, maintain]
maintain = 0.9
others = (1 - maintain)/3
porcentages = [others, others, others, maintain]
# Floating-point sums are rarely exactly 1, so compare with a tolerance
# instead of ==; otherwise changing `maintain` can trip the assertion.
assert math.isclose(sum(porcentages), 1)

# Cumulative upper bounds: action i is chosen for a draw that falls at or
# below intervals[i] and above intervals[i - 1].
accumulated = 0
intervals = []
for porcentage in porcentages:
    accumulated += porcentage
    intervals.append(accumulated)
# Pin the last bound to exactly 1 so rounding error cannot leave a gap at the
# top of [0, 1] that no action covers.
intervals[-1] = 1.0

In [28]:
def get_action(number: float):
    """Map a draw ``number`` to an action index using the cumulative ``intervals``.

    Returns the index of the first bound in ``intervals`` that ``number`` does
    not exceed. If ``number`` is above every bound (possible when rounding
    leaves the last cumulative bound slightly below the largest draw), the
    last action is returned instead of running past the end of the list.
    """
    for action, upper in enumerate(intervals):
        if number <= upper:
            return action
    return len(intervals) - 1

In [29]:
# Number of dictionary words to perturb below. NOTE: this rebinds the
# `num_samples` set earlier in the notebook, so re-running the earlier
# prefix-generation cell afterwards would use this smaller value.
num_samples = 10

In [30]:
# Produce a randomly perturbed copy of each sampled word. The word is consumed
# left to right; at every step one action is drawn:
#   0 = delete  (skip the current character)
#   1 = add     (emit a random letter, stay on the current character)
#   2 = replace (emit a random letter, skip the current character)
#   3 = maintain (copy the current character)
words_levestein = []
originals = []
for word in random.sample(words, num_samples):
    curr_word = ""
    curr_char_idx = 0
    while curr_char_idx < len(word):
        action = get_action(random.uniform(0, 1))
        if action in (1, 2):
            random_char = random.choice(string.ascii_letters)
            curr_word += random_char
        elif action == 3:
            curr_word += word[curr_char_idx]
        # Every action except "add" consumes one character of the original.
        if action in (0, 2, 3):
            curr_char_idx += 1
    words_levestein.append(curr_word)
    originals.append(word)

In [31]:
# Show one perturbed word next to the original it was derived from.
index = 1
words_levestein[index], originals[index]

("flLmflam's", "flimflam's")

In [32]:
# Compare the C++ and Python results for each perturbed word at maximum
# distances 0 through 3, recording every (word, distance) pair that disagrees.
errors = []
for word in words_levestein[:100]:
    print(f"Checking {word}")
    for dist in range(4):
        if check_levestein(word, dist):
            continue
        print(f"Failed for {word} and {dist}")
        errors.append((word, dist))

Checking tcrunchbest
Checking flLmflam's
Checking Brvt's
Checking readYness's
Checking deZlphinium's
Checking us's
Checking glove
Checking connctor
Checking muttering
Checking expdites
