# Test Notebook

## Code

In [1]:
import random
import string
import subprocess
import os

Read the dictionary

In [2]:
# Load the dictionary: one entry per line, with newline characters removed.
words = []
with open("data/american-english-sorted", 'r') as f:
    words.extend(line.replace('\n', '') for line in f)

Generate random prefixes

In [3]:
# Number of dictionary words sampled when building the random prefixes.
num_samples = 1000

In [4]:
# Build one random, non-empty proper prefix per sampled dictionary word.
prefixes = []
for word in random.sample(words, num_samples):
    # A word shorter than two characters has no proper prefix; redraw it.
    while len(word) < 2:
        word = random.choice(words)
    cut = random.randint(1, len(word) - 1)
    prefixes.append(word[:cut])

In [5]:
# Preview the first five generated prefixes.
prefixes[:5]

['L', 'T', 'b', 'disl', 's']

To generate random prefixes with a fixed length:

In [6]:
def get_random(prefix_len: int):
    """Return the first ``prefix_len`` characters of a randomly chosen word.

    Only entries of ``words`` with at least ``prefix_len`` characters are
    eligible, and each eligible word is equally likely to be picked.

    Args:
        prefix_len: Length of the prefix to return.

    Returns:
        A string made of the first ``prefix_len`` characters of the chosen word.

    Raises:
        ValueError: If no word in ``words`` has at least ``prefix_len``
            characters (retrying random draws would never terminate).
    """
    # Filter first so that an unsatisfiable length fails fast instead of
    # looping forever on random draws.
    candidates = [word for word in words if len(word) >= prefix_len]
    if not candidates:
        raise ValueError(f"No word has at least {prefix_len} characters")
    return random.choice(candidates)[:prefix_len]

def get_result(prefix: str):
    """Return every entry of ``words`` that starts with ``prefix``, in list order."""
    return list(filter(lambda entry: entry.startswith(prefix), words))

In [7]:
# Draw a random 4-character prefix and list every dictionary word starting with it.
prefix = get_random(prefix_len=4)
results = get_result(prefix)
print(prefix, results, sep="\n")

Chri
['Chris', "Chris's", 'Christ', "Christ's", 'Christa', "Christa's", 'Christchurch', "Christchurch's", 'Christendom', "Christendom's", 'Christendoms', 'Christensen', "Christensen's", 'Christi', "Christi's", 'Christian', "Christian's", 'Christianities', 'Christianity', "Christianity's", 'Christians', 'Christie', "Christie's", 'Christina', "Christina's", 'Christine', "Christine's", 'Christmas', "Christmas's", 'Christmases', 'Christoper', "Christoper's", 'Christopher', "Christopher's", 'Christs', 'Christy', "Christy's"]


In [8]:
def get_cpp_results(text: str):
    """Run the ``./autocomplete`` binary from ``bin`` and return its output lines.

    The program is invoked with ``text`` as its only argument. Its results are
    then read from ``bin/output_autocomplete.txt``, one entry per line.

    The child process is started with ``cwd="bin"`` instead of calling
    ``os.chdir``, so the notebook's working directory is never modified, even
    when the binary or the output file is missing.

    Args:
        text: Prefix passed to the program.

    Returns:
        The lines of the output file with newline characters removed.

    Raises:
        OSError: If the binary cannot be started or the output file cannot be
            opened.
    """
    bin_dir = "bin"
    # On POSIX a relative executable path is resolved against ``cwd``, which
    # matches the previous "chdir into bin, then run ./autocomplete" behavior.
    subprocess.run(['./autocomplete', text], cwd=bin_dir, text=True, capture_output=True)
    # NOTE(review): the exit status is not checked; if the program fails, a
    # stale output file from an earlier run would be read here.
    with open(os.path.join(bin_dir, "output_autocomplete.txt"), 'r') as f:
        return [line.replace('\n', '') for line in f]

def get_expected_results(text):
    """Return the reference answer: all entries of ``words`` that start with ``text``."""
    matches = []
    for candidate in words:
        if candidate.startswith(text):
            matches.append(candidate)
    return matches

def check(text):
    """Return True when the binary's output for ``text`` equals the Python reference list."""
    actual = get_cpp_results(text)
    expected = get_expected_results(text)
    return actual == expected

In [9]:
# Spot-check one prefix: True means the ./autocomplete output equals the Python reference list.
check('lamasae')

True

In [10]:
# Verify the first 20 random prefixes; stop at the first mismatch.
for prefix in prefixes[:20]:
    print(f"Checking {prefix}")
    if check(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking L
Checking T
Checking b
Checking disl
Checking s
Checking uteru
Checking plum
Checking rei
Checking bu
Checking discr
Checking fee
Checking myster
Checking bloodsho
Checking Ez
Checking saintlie
Checking Ro
Checking nurtur
Checking f
Checking ma
Checking fighter


# Levenshtein

In [11]:
import textdistance

In [12]:
# Short alias for textdistance's Levenshtein distance callable.
retrieve_levestein_dist = textdistance.levenshtein

In [18]:
def levestein(text, max_dist):
    """Return the entries of ``words`` whose distance to ``text`` is at most ``max_dist``.

    The distance is computed with ``retrieve_levestein_dist(word, text)`` and
    the result keeps the order of ``words``.
    """
    return [
        candidate
        for candidate in words
        if retrieve_levestein_dist(candidate, text) <= max_dist
    ]

def get_cpp_levestein_results(text: str, dist: int):
    """Run the ``./levestein`` binary from ``bin`` and return its sorted output lines.

    The program is invoked with ``text`` and ``str(dist)`` as arguments. Its
    results are then read from ``bin/output_levestein.txt``, one entry per
    line, and sorted.

    The child process is started with ``cwd="bin"`` instead of calling
    ``os.chdir``, so the notebook's working directory is never modified, even
    when the binary or the output file is missing.

    Args:
        text: Query word passed to the program.
        dist: Maximum distance passed to the program.

    Returns:
        The sorted lines of the output file with newline characters removed.

    Raises:
        OSError: If the binary cannot be started or the output file cannot be
            opened.
    """
    bin_dir = "bin"
    # On POSIX a relative executable path is resolved against ``cwd``, which
    # matches the previous "chdir into bin, then run ./levestein" behavior.
    subprocess.run(['./levestein', text, str(dist)], cwd=bin_dir, text=True, capture_output=True)
    # NOTE(review): the exit status is not checked; if the program fails, a
    # stale output file from an earlier run would be read here.
    with open(os.path.join(bin_dir, "output_levestein.txt"), 'r') as f:
        result = [line.replace('\n', '') for line in f]
    # Sort so the list can be compared regardless of the program's output order.
    result.sort()
    return result

def check_levestein(text, dist):
    """Return True when the binary's sorted output equals the Python reference for ``text``/``dist``."""
    actual = get_cpp_levestein_results(text, dist)
    expected = levestein(text, dist)
    return actual == expected

In [19]:
# Words reported by the ./levestein binary for "AAA" with maximum distance 1 (sorted).
get_cpp_levestein_results("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [20]:
# Same query answered by the pure-Python reference implementation.
levestein("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [21]:
# True when the binary's sorted output equals the Python reference list.
check_levestein("AAA", 1)

True

In [22]:
# Probability of each edit action, in the order [delete, add, replace, maintain].
maintain = 0.9
others = (1 - maintain)/3
porcentages = [others, others, others, maintain]
# The probabilities are floats, so compare their sum to 1 with a tolerance
# rather than exact equality (rounding can make the sum differ in the last bit).
assert abs(sum(porcentages) - 1) < 1e-9
# Cumulative upper bound of each action's slice of the [0, 1] range.
accumulated = 0
intervals = []
for porcentage in porcentages:
    accumulated += porcentage
    intervals.append(accumulated)
# Pin the final bound to exactly 1.0 so every draw in [0, 1] maps to an action
# even if rounding left the running total slightly below 1.
intervals[-1] = 1.0

In [23]:
def get_action(number: float):
    """Map ``number`` to the index of the first cumulative bound in ``intervals`` that is >= it.

    Args:
        number: Value to classify against the cumulative bounds in ``intervals``.

    Returns:
        The index of the first entry of ``intervals`` that is greater than or
        equal to ``number``. If ``number`` exceeds every bound (for example
        because floating-point rounding left the last bound slightly below the
        value drawn), the index of the last entry is returned instead of
        running past the end of the list.
    """
    last = len(intervals) - 1
    output = 0
    # Stop at the last index so an out-of-range number cannot raise IndexError.
    while output < last and number > intervals[output]:
        output += 1
    return output

In [25]:
# Number of dictionary words to perturb below (rebinds the earlier num_samples value).
num_samples = 10

In [26]:
# Build noisy variants of randomly sampled dictionary words by walking each
# word once and applying a randomly drawn edit action at every step.
words_levestein = []
originals = []
for word in random.sample(words, num_samples):
    mutated_chars = []
    position = 0
    while position != len(word):
        action = get_action(random.uniform(0, 1))
        if action == 0:
            # Delete: consume the source character without emitting anything.
            position += 1
        elif action == 1:
            # Add: emit a random letter and stay on the same source character.
            mutated_chars.append(random.choice(string.ascii_letters))
        elif action == 2:
            # Replace: emit a random letter instead of the source character.
            mutated_chars.append(random.choice(string.ascii_letters))
            position += 1
        elif action == 3:
            # Maintain: copy the source character unchanged.
            mutated_chars.append(word[position])
            position += 1
    originals.append(word)
    words_levestein.append("".join(mutated_chars))

In [27]:
# Show one perturbed word next to the original word it was derived from.
index = 1
words_levestein[index], originals[index]

('fulfillmentWs', "fulfillment's")

In [31]:
# Compare the binary with the Python reference for every perturbed word and
# every maximum distance 0..3, collecting the (word, dist) pairs that differ.
errors = []
for word in words_levestein[:100]:
    print(f"Checking {word}")
    for dist in range(4):
        if check_levestein(word, dist):
            continue
        print(f"Failed for {word} and {dist}")
        errors.append((word, dist))

Checking hooIp
Checking fulfillmentWs
Checking Chagall
Checking zstrly'x
Checking avaricLous
Checking adZultery's
Checking baffle
Checking viticulture's
Checking Gillihn
Checking NM
