# Test Notebook

## Code

In [1]:
import random
import string
import subprocess
import os

Read the dictionary

In [2]:
# Load the dictionary: one entry per line, newline characters removed.
with open("data/american-english-sorted", 'r') as f:
    words = [line.replace('\n', '') for line in f]

Generate random prefixes

In [3]:
# Number of dictionary words to draw when building the random prefix set.
num_samples = 10_000

In [4]:
# Build one random proper prefix (at least one character, never the whole
# word) for each sampled dictionary entry.
prefixes = []
for word in random.sample(words, num_samples):
    # A word shorter than two characters has no proper prefix; redraw.
    while len(word) < 2:
        word = random.choice(words)
    cut = random.randint(1, len(word) - 1)
    prefixes.append(word[:cut])

In [5]:
# Persist the generated prefixes, one per line.
with open("examples.txt", "w") as f:
    f.writelines(f"{x}\n" for x in prefixes)

In [6]:
# Peek at the first five generated prefixes.
prefixes[:5]

['Ken', 'wi', 'in', 'six', 'Cha']

To generate random prefixes with a fixed length:

In [7]:
def get_random(prefix_len: int):
    """Return the first ``prefix_len`` characters of a randomly chosen word.

    Words are drawn uniformly from the module-level ``words`` list and
    rejected until one with at least ``prefix_len`` characters comes up, so
    for a non-negative ``prefix_len`` the result has exactly that length.

    Args:
        prefix_len: Number of leading characters to keep.

    Returns:
        The prefix as a string.

    Raises:
        ValueError: If no word in ``words`` has at least ``prefix_len``
            characters. Without this guard the rejection loop below would
            never terminate.
    """
    # any() short-circuits on the first long-enough word, so this guard is
    # cheap for typical prefix lengths.
    if not any(len(word) >= prefix_len for word in words):
        raise ValueError(
            f"no dictionary word has at least {prefix_len} characters"
        )
    while True:
        word = random.choice(words)
        if len(word) >= prefix_len:
            return word[:prefix_len]

def get_result(prefix: str):
    """Return every entry of ``words`` that begins with ``prefix``.

    Matches are returned in the order they appear in ``words``.
    """
    matches = [candidate for candidate in words if candidate.startswith(prefix)]
    return matches

In [8]:
# Draw a random 4-character prefix and show it with all of its completions.
prefix = get_random(4)
results = get_result(prefix)
print(prefix, results, sep="\n")

orth
['orthodontia', "orthodontia's", 'orthodontic', 'orthodontics', "orthodontics's", 'orthodontist', "orthodontist's", 'orthodontists', 'orthodox', 'orthodoxies', 'orthodoxy', "orthodoxy's", 'orthogonal', 'orthogonality', 'orthographic', 'orthographies', 'orthography', "orthography's", 'orthopaedic', 'orthopaedics', "orthopaedics's", 'orthopaedist', "orthopaedist's", 'orthopaedists', 'orthopedic', 'orthopedics', "orthopedics's", 'orthopedist', "orthopedist's", 'orthopedists']


In [9]:
def get_cpp_results(text: str):
    """Run the ``autocomplete`` binary on ``text`` and return what it wrote.

    The program is launched with ``bin`` as its working directory and is
    assumed to write one result per line to ``bin/output_autocomplete.txt``;
    that file is read back after the run.

    The interpreter's own working directory is never changed. A
    ``chdir("bin")`` / ``chdir("..")`` pair would leave the process stranded
    inside ``bin`` whenever the run or the read raised, breaking every
    subsequent call.

    Args:
        text: Prefix passed to the program as its only argument.

    Returns:
        The lines of the output file, newline characters removed.

    Raises:
        OSError: If the binary cannot be started or the output file cannot
            be opened.
    """
    bin_dir = "bin"
    # cwd= applies only to the child; on POSIX the relative executable path
    # is resolved against it, so "./autocomplete" still means bin/autocomplete.
    subprocess.run(
        ["./autocomplete", text],
        cwd=bin_dir,
        text=True,
        capture_output=True,
    )
    with open(os.path.join(bin_dir, "output_autocomplete.txt"), 'r') as f:
        return [line.replace('\n', '') for line in f]

def get_expected_results(text):
    """Compute the reference answer: all ``words`` entries starting with ``text``.

    The result keeps the ordering of ``words``.
    """
    expected = [entry for entry in words if entry.startswith(text)]
    return expected

def check_autocomplete(text):
    """Return True when the autocomplete binary agrees with the Python reference.

    Both result lists must be equal element by element, in the same order.
    """
    actual = get_cpp_results(text)
    expected = get_expected_results(text)
    return actual == expected

In [10]:
# Spot-check a single prefix against the autocomplete binary.
check_autocomplete('lamasae')

True

In [11]:
# Verify the first 20 random prefixes, stopping at the first mismatch.
for prefix in prefixes[:20]:
    print(f"Checking {prefix}")
    if check_autocomplete(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking Ken
Checking wi
Checking in
Checking six
Checking Cha
Checking u
Checking shirk
Checking dev
Checking spr
Checking collision
Checking all
Checking ove
Checking doo
Checking hosan
Checking licor
Checking separat
Checking se
Checking ridge
Checking gen
Checking pa


# Binary Search

In [12]:
def get_bin_search_results(text: str):
    """Run the ``binary_search`` binary on ``text`` and return what it wrote.

    The program is launched with ``bin`` as its working directory and is
    assumed to write one result per line to ``bin/output_bin_search.txt``;
    that file is read back after the run.

    The interpreter's own working directory is never changed. A
    ``chdir("bin")`` / ``chdir("..")`` pair would leave the process stranded
    inside ``bin`` whenever the run or the read raised, breaking every
    subsequent call.

    Args:
        text: Prefix passed to the program as its only argument.

    Returns:
        The lines of the output file, newline characters removed.

    Raises:
        OSError: If the binary cannot be started or the output file cannot
            be opened.
    """
    bin_dir = "bin"
    # cwd= applies only to the child; on POSIX the relative executable path
    # is resolved against it, so "./binary_search" still means
    # bin/binary_search.
    subprocess.run(["./binary_search", text], cwd=bin_dir)
    with open(os.path.join(bin_dir, "output_bin_search.txt"), 'r') as f:
        return [line.replace('\n', '') for line in f]

def check_bin_search(text):
    """Return True when the binary-search program agrees with the Python reference.

    Both result lists must be equal element by element, in the same order.
    """
    actual = get_bin_search_results(text)
    expected = get_expected_results(text)
    return actual == expected

In [13]:
# Verify the first 200 random prefixes, stopping at the first mismatch.
for prefix in prefixes[:200]:
    print(f"Checking {prefix}")
    if check_bin_search(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking Ken
Checking wi
Checking in
Checking six
Checking Cha
Checking u
Checking shirk
Checking dev
Checking spr
Checking collision
Checking all
Checking ove
Checking doo
Checking hosan
Checking licor
Checking separat
Checking se
Checking ridge
Checking gen
Checking pa
Checking rel
Checking mong
Checking ma
Checking si
Checking fuz
Checking underworl
Checking interloc
Checking freigh
Checking Marva'
Checking i
Checking In
Checking condition'
Checking Claudin
Checking J
Checking Pytha
Checking cheapl
Checking disciplin
Checking defamat
Checking tercenten
Checking teapot
Checking r
Checking ba
Checking initia
Checking engi
Checking a
Checking t
Checking im
Checking ambulato
Checking Nor
Checking rigmaro
Checking nuisance'
Checking Acru
Checking UT'
Checking Agustin'
Checking em
Checking ap
Checking dat
Checking e
Checking pander
Checking Heisenberg
Checking dunn
Checking Trum
Checking wo
Checking s
Checking dr
Checking fleecin
Checking prett
Checking spr
Checking Sa
Checking hoopl
Chec

# Levenshtein

In [None]:
import textdistance

In [None]:
# Alias for textdistance's Levenshtein distance callable; levestein() uses it
# to compute the Python-side reference results.
retrieve_levestein_dist = textdistance.levenshtein

In [None]:
def levestein(text, max_dist):
    """Return the entries of ``words`` within ``max_dist`` edits of ``text``.

    Distance is computed with ``retrieve_levestein_dist``; the bound is
    inclusive and matches keep the ordering of ``words``.
    """
    return [
        candidate
        for candidate in words
        if retrieve_levestein_dist(candidate, text) <= max_dist
    ]

def get_cpp_levestein_results(text: str, dist: int):
    """Run the ``levestein`` binary and return its results, sorted.

    The program is launched with ``bin`` as its working directory and is
    assumed to write one result per line to ``bin/output_levestein.txt``;
    that file is read back after the run and the lines are sorted.

    The interpreter's own working directory is never changed. A
    ``chdir("bin")`` / ``chdir("..")`` pair would leave the process stranded
    inside ``bin`` whenever the run or the read raised, breaking every
    subsequent call.

    Args:
        text: Query word passed as the first argument.
        dist: Maximum distance, passed as the second argument (stringified).

    Returns:
        The lines of the output file, newline characters removed, in
        ascending order.

    Raises:
        OSError: If the binary cannot be started or the output file cannot
            be opened.
    """
    bin_dir = "bin"
    # cwd= applies only to the child; on POSIX the relative executable path
    # is resolved against it, so "./levestein" still means bin/levestein.
    subprocess.run(
        ["./levestein", text, str(dist)],
        cwd=bin_dir,
        text=True,
        capture_output=True,
    )
    with open(os.path.join(bin_dir, "output_levestein.txt"), 'r') as f:
        result = [line.replace('\n', '') for line in f]
    result.sort()
    return result

def check_levestein(text, dist):
    """Return True when the binary and the Python reference find the same words.

    The binary's results come back sorted, while ``levestein`` returns
    matches in dictionary-file order. Both sides are sorted here so the
    comparison does not depend on the dictionary file happening to be in
    Python's string ordering.

    Args:
        text: Query word.
        dist: Maximum edit distance (inclusive).

    Returns:
        True if both sides contain the same words (duplicates included),
        False otherwise.
    """
    cpp_matches = sorted(get_cpp_levestein_results(text, dist))
    expected = sorted(levestein(text, dist))
    return cpp_matches == expected

In [None]:
# Results reported by the levestein binary for "AAA" with distance 1.
get_cpp_levestein_results("AAA", 1)

In [None]:
# Python-side reference result for "AAA" with distance at most 1.
levestein("AAA", 1)

In [None]:
# Compare the binary's output with the Python reference for "AAA" at distance 1.
check_levestein("AAA", 1)

In [None]:
import math

# Per-character edit probabilities, in action order:
# [delete, add, replace, maintain]
maintain = 0.9
others = (1 - maintain) / 3
porcentages = [others, others, others, maintain]
# Floating-point sums rarely hit 1 exactly, so compare with a tolerance.
assert math.isclose(sum(porcentages), 1.0)

# Cumulative upper bounds: action i is chosen when a draw from [0, 1] falls
# at or below intervals[i] and above intervals[i - 1].
accumulated = 0
intervals = []
for porcentage in porcentages:
    accumulated += porcentage
    intervals.append(accumulated)
# Rounding can leave the last bound a hair under 1; pin it so every draw from
# [0, 1] falls inside some interval.
intervals[-1] = 1.0

In [None]:
def get_action(number: float):
    """Map a random draw to an action index using the cumulative ``intervals``.

    Returns the first index ``i`` with ``number <= intervals[i]``.

    Args:
        number: The draw to classify.

    Returns:
        An index into ``intervals``. A draw above the last bound is assigned
        to the last action rather than indexing past the end of the list,
        which can otherwise happen when the cumulative sum rounds to
        slightly less than 1.
    """
    last = len(intervals) - 1
    action = 0
    while action < last and number > intervals[action]:
        action += 1
    return action

In [None]:
# Rebind num_samples (10000 for the prefix tests) to 10 for the Levenshtein samples.
num_samples = 10

In [None]:
# Produce a randomly perturbed copy of each sampled word. The original is
# walked one character at a time and get_action picks an edit per step.
words_levestein = []
originals = []
for word in random.sample(words, num_samples):
    pieces = []
    pos = 0
    while pos < len(word):
        action = get_action(random.uniform(0, 1))
        if action == 0:
            # Delete: skip the source character without emitting anything.
            pos += 1
        elif action == 1:
            # Add: emit a random letter and stay on the same source character.
            pieces.append(random.choice(string.ascii_letters))
        elif action == 2:
            # Replace: emit a random letter in place of the source character.
            pieces.append(random.choice(string.ascii_letters))
            pos += 1
        elif action == 3:
            # Maintain: copy the source character unchanged.
            pieces.append(word[pos])
            pos += 1
    words_levestein.append("".join(pieces))
    originals.append(word)

In [None]:
# Show one perturbed word next to the original it was derived from.
index = 1
words_levestein[index], originals[index]

In [None]:
# Check every perturbed word at distances 0 through 3. Mismatches are
# collected in `errors` instead of aborting the run.
errors = []
for word in words_levestein[:100]:
    print(f"Checking {word}")
    for dist in range(4):
        if check_levestein(word, dist):
            continue
        print(f"Failed for {word} and {dist}")
        errors.append((word, dist))