# Test Notebook

## Code

In [1]:
import random
import string
import subprocess
import os

Read the dictionary

In [2]:
# Load the dictionary: one entry per line, with newline characters removed.
with open("data/american-english-sorted", 'r') as f:
    words = [line.replace('\n', '') for line in f]

Generate random prefixes

In [3]:
# Number of dictionary words sampled when generating random prefixes.
num_samples = 10_000

In [4]:
# Build one random proper prefix (1 .. len-1 characters) per sampled word.
prefixes = []
for word in random.sample(words, num_samples):
    # A one-character word has no proper prefix, so redraw until the word is longer.
    while len(word) < 2:
        word = random.choice(words)
    cut = random.randint(1, len(word) - 1)
    prefixes.append(word[:cut])

Save prefix examples to file

In [37]:
# Persist the generated prefixes, one per line, under tmp/.
os.makedirs("tmp", exist_ok=True)
with open("tmp/examples.txt", "w") as f:
    f.writelines(f"{x}\n" for x in prefixes)

In [6]:
# Preview the first five generated prefixes.
prefixes[:5]

['ha', 'pos', 'Bou', 'whitewall', 'w']

To generate random prefixes with a fixed length:

In [7]:
def get_random(prefix_len: int):
    """Return the first ``prefix_len`` characters of a randomly chosen word.

    The word is drawn uniformly from the entries of the global ``words`` list
    that have at least ``prefix_len`` characters.

    Args:
        prefix_len: Number of leading characters to keep.

    Returns:
        The prefix of the selected word.

    Raises:
        ValueError: If no word in ``words`` has ``prefix_len`` characters.
            Retrying random draws in that situation would never terminate.
    """
    candidates = [word for word in words if len(word) >= prefix_len]
    if not candidates:
        raise ValueError(f"No word has at least {prefix_len} characters")
    return random.choice(candidates)[:prefix_len]

def get_result(prefix: str):
    """Return, in dictionary order, every entry of ``words`` starting with ``prefix``."""
    matches = []
    for candidate in words:
        if candidate.startswith(prefix):
            matches.append(candidate)
    return matches

In [8]:
# Draw a random 4-character prefix and show the dictionary words that match it.
prefix = get_random(prefix_len=4)
results = get_result(prefix)
print(prefix, results, sep="\n")

Plut
['Plutarch', "Plutarch's", 'Pluto', "Pluto's"]


In [9]:
def get_cpp_results(text: str):
    """Run the ``autocomplete`` binary for ``text`` and return the lines it produced.

    The binary is executed from inside the ``bin`` directory with ``text`` as its
    only argument; the result is then read from
    ``../output_files/output_autocomplete.txt`` (relative to ``bin``).
    Presumably the binary writes its matches to that file -- confirm against the
    C++ source.

    Args:
        text: Prefix passed to the binary.

    Returns:
        The lines of the output file with newline characters removed.
    """
    original_dir = os.getcwd()
    os.chdir("bin")
    try:
        subprocess.run(['./autocomplete', text], text=True, capture_output=True)
        with open("../output_files/output_autocomplete.txt", 'r') as f:
            return [line.replace('\n', '') for line in f.readlines()]
    finally:
        # Always restore the working directory; otherwise a failure here would
        # leave the kernel inside bin/ and break every later relative path.
        os.chdir(original_dir)

def get_expected_results(text):
    """Reference answer: all entries of ``words`` that start with ``text``."""
    expected = []
    for entry in words:
        if entry.startswith(text):
            expected.append(entry)
    return expected

def check_autocomplete(text):
    """Return True when the C++ autocomplete output equals the Python reference for ``text``."""
    actual = get_cpp_results(text)
    expected = get_expected_results(text)
    return actual == expected

In [10]:
# Spot check with a single hand-picked prefix.
check_autocomplete('lamasae')

True

In [11]:
# Validate the autocomplete binary on the first 20 prefixes; stop at the first mismatch.
for prefix in prefixes[:20]:
    print(f"Checking {prefix}")
    if check_autocomplete(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking ha
Checking pos
Checking Bou
Checking whitewall
Checking w
Checking K
Checking B
Checking fragme
Checking fixi
Checking conclu
Checking Ta
Checking tourn
Checking str
Checking misconstruc
Checking c
Checking rile
Checking sab
Checking c
Checking impious
Checking stilln


# Binary Search

In [14]:
def get_bin_search_results(text: str):
    """Run the ``binary_search`` binary for ``text`` and return the lines it produced.

    The binary is executed from inside the ``bin`` directory with ``text`` as its
    only argument; the result is then read from
    ``../output_files/output_bin_search.txt`` (relative to ``bin``).
    Presumably the binary writes its matches to that file -- confirm against the
    C++ source.

    Args:
        text: Prefix passed to the binary.

    Returns:
        The lines of the output file with newline characters removed.
    """
    original_dir = os.getcwd()
    os.chdir("bin")
    try:
        # Capture the child's stdout/stderr, as the other binary runners in this
        # notebook do, instead of letting it leak into the kernel's console.
        subprocess.run(['./binary_search', text], text=True, capture_output=True)
        with open("../output_files/output_bin_search.txt", 'r') as f:
            return [line.replace('\n', '') for line in f.readlines()]
    finally:
        # Always restore the working directory; otherwise a failure here would
        # leave the kernel inside bin/ and break every later relative path.
        os.chdir(original_dir)

def check_bin_search(text):
    """Return True when the C++ binary-search output equals the Python reference for ``text``."""
    actual = get_bin_search_results(text)
    expected = get_expected_results(text)
    return actual == expected

In [17]:
# Validate the binary-search binary on the first 200 prefixes; stop at the first mismatch.
for prefix in prefixes[:200]:
    print(f"Checking {prefix}")
    if check_bin_search(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking ha
Checking pos
Checking Bou
Checking whitewall
Checking w
Checking K
Checking B
Checking fragme
Checking fixi
Checking conclu
Checking Ta
Checking tourn
Checking str
Checking misconstruc
Checking c
Checking rile
Checking sab
Checking c
Checking impious
Checking stilln
Checking putati
Checking bagatelle'
Checking b
Checking uncertain
Checking stipulati
Checking t
Checking mastermindi
Checking bullf
Checking biol
Checking Sp
Checking feveris
Checking an
Checking jus
Checking mis
Checking Ashley'
Checking burrito
Checking Knop
Checking v
Checking fulfill
Checking Erla
Checking atyp
Checking fe
Checking antide
Checking jaywalke
Checking at
Checking scan
Checking nymphoma
Checking tig
Checking footst
Checking s
Checking misd
Checking diabetic
Checking jo
Checking crabbines
Checking allocat
Checking counci
Checking dupli
Checking tru
Checking confi
Checking M
Checking A
Checking derai
Checking wood
Checking bass
Checking servo
Checking alle
Checking bl
Checking Per
Checking investm

# Levenshtein

In [18]:
import textdistance

In [19]:
# Short alias for textdistance's Levenshtein callable; invoked below as f(word, text).
retrieve_levestein_dist = textdistance.levenshtein

In [25]:
def levestein(text, max_dist):
    """Return the entries of ``words`` whose distance to ``text`` is at most ``max_dist``.

    The distance is computed with ``retrieve_levestein_dist``; the result keeps
    the order of ``words``.
    """
    return [
        candidate
        for candidate in words
        if retrieve_levestein_dist(candidate, text) <= max_dist
    ]

def get_cpp_levestein_results(text: str, dist: int):
    """Run the ``levestein`` binary for ``text``/``dist`` and return its sorted output lines.

    The binary is executed from inside the ``bin`` directory with ``text`` and
    ``str(dist)`` as arguments; the result is then read from
    ``../output_files/output_levestein.txt`` (relative to ``bin``).
    Presumably the binary writes its matches to that file -- confirm against the
    C++ source.

    Args:
        text: Query word passed to the binary.
        dist: Maximum distance, passed to the binary as a string.

    Returns:
        The lines of the output file with newline characters removed, sorted
        with Python's default string ordering.
    """
    original_dir = os.getcwd()
    os.chdir("bin")
    try:
        subprocess.run(['./levestein', text, str(dist)], text=True, capture_output=True)
        with open("../output_files/output_levestein.txt", 'r') as f:
            result = [line.replace('\n', '') for line in f.readlines()]
    finally:
        # Always restore the working directory; otherwise a failure here would
        # leave the kernel inside bin/ and break every later relative path.
        os.chdir(original_dir)
    result.sort()
    return result

def check_levestein(text, dist):
    """Return True when the C++ and Python Levenshtein searches find the same words.

    ``get_cpp_levestein_results`` returns its list sorted with Python's string
    ordering, while ``levestein`` keeps the order of the dictionary file. The
    reference list is sorted here as well so the comparison does not depend on
    the dictionary file already being in Python's sort order.
    """
    return get_cpp_levestein_results(text, dist) == sorted(levestein(text, dist))

In [26]:
# Example: output of the C++ binary for "AAA" with maximum distance 1.
get_cpp_levestein_results("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [27]:
# Same query answered by the Python reference implementation.
levestein("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [28]:
# Both implementations should agree on this query.
check_levestein("AAA", 1)

True

In [29]:
# Probability of each edit action, in the order [delete, add, replace, maintain].
maintain = 0.9
others = (1 - maintain)/3
porcentages = [others, others, others, maintain]
# Floating-point sums are not guaranteed to be exact, so compare with a tolerance
# rather than with ==.
assert abs(sum(porcentages) - 1) < 1e-9
# Cumulative upper bounds: action i covers (intervals[i-1], intervals[i]].
accumulated = 0
intervals = []
for porcentage in porcentages:
    accumulated += porcentage
    intervals.append(accumulated)
# Pin the last bound to exactly 1.0 so every draw in [0, 1] maps to an action
# even if rounding left the accumulated total slightly below 1.
intervals[-1] = 1.0

In [30]:
def get_action(number: float):
    """Map a draw in [0, 1] to an action index using the cumulative ``intervals``.

    Returns the index of the first bound in the global ``intervals`` list that
    is greater than or equal to ``number``.

    Args:
        number: Random draw, expected to lie in [0, 1].

    Returns:
        The action index. A draw above the last bound maps to the last action
        instead of indexing past the end of ``intervals``.
    """
    last = len(intervals) - 1
    output = 0
    # Stop at the final action so a draw slightly above the last bound (possible
    # through floating-point rounding) cannot raise IndexError.
    while output < last and number > intervals[output]:
        output += 1
    return output

In [31]:
# Number of dictionary words to mutate for the Levenshtein checks.
# NOTE(review): this rebinds the `num_samples` used for prefix generation earlier
# in the notebook; re-running that earlier cell afterwards would sample only 10 words.
num_samples = 10

In [32]:
# Build noisy variants of randomly sampled dictionary words. Each step draws one
# action (see get_action) until every character of the source word is consumed.
words_levestein = []
originals = []
for word in random.sample(words, num_samples):
    noisy_chars = []
    pos = 0
    while pos != len(word):
        action = get_action(random.uniform(0, 1))
        if action in (0, 2, 3):
            if action == 2:
                # Replace: emit a random letter in place of the source character.
                noisy_chars.append(random.choice(string.ascii_letters))
            elif action == 3:
                # Maintain: copy the source character unchanged.
                noisy_chars.append(word[pos])
            # Delete (0), replace (2) and maintain (3) all consume one source character.
            pos += 1
        elif action == 1:
            # Add: insert a random letter without consuming a source character.
            noisy_chars.append(random.choice(string.ascii_letters))
    words_levestein.append("".join(noisy_chars))
    originals.append(word)

In [33]:
# Show one mutated word next to the dictionary word it was derived from.
index = 1
words_levestein[index], originals[index]

('upraised', 'upraised')

In [34]:
# Compare the C++ and Python Levenshtein searches for every mutated word and
# distance 0..3, collecting mismatches instead of stopping at the first one.
errors = []
for word in words_levestein[:100]:
    print(f"Checking {word}")
    for dist in range(4):
        if check_levestein(word, dist):
            continue
        print(f"Failed for {word} and {dist}")
        errors.append((word, dist))

Checking bassinets
Checking upraised
Checking eJcantaoionX
Checking narcisistic
Checking peacocks
Checking GilEore's
Checking Vindemiatrix's
Checking typeset
Checking refffirms
Checking drenching
