# Test Notebook

Before running this notebook:
- Build the project first:
```bash
mkdir build
cd build
cmake ..
make test
```
- Then move this notebook to the `bin` folder:
```bash
mv test.ipynb ../bin
```

## Code

In [127]:
import random
import string
import subprocess

Read the dictionary

In [128]:
# Load the dictionary: one entry per line, trailing newline stripped.
# NOTE(review): the file is opened with the platform's default text encoding;
# confirm that this matches the encoding of the dictionary file.
with open("../data/american-english-sorted", 'r') as f:
    words = [line.rstrip('\n') for line in f]

Generate random prefixes

In [129]:
# Number of dictionary words to sample when building the random prefixes.
num_samples = 1000

In [130]:
# Sample `num_samples` dictionary entries (without replacement) and cut each
# one at a random position to obtain a non-empty prefix that is strictly
# shorter than the word itself.
prefixes = []
for candidate in random.sample(words, num_samples):
    # A word of length < 2 has no such prefix; redraw until one is usable.
    while len(candidate) <= 1:
        candidate = random.choice(words)
    cut = random.randint(1, len(candidate) - 1)
    prefixes.append(candidate[:cut])

In [131]:
# Peek at the first few generated prefixes.
prefixes[:5]

['co', 'iceb', 'Malayala', 'ocari', 'end']

To generate random prefixes with a fixed length:

In [132]:
def get_random(prefix_len: int):
    """Return the first ``prefix_len`` characters of a randomly chosen word.

    Words are drawn uniformly from ``words``; a draw shorter than
    ``prefix_len`` is rejected and another word is drawn.

    Args:
        prefix_len: Number of leading characters to keep.

    Returns:
        A string made of the first ``prefix_len`` characters of the word
        that was accepted.

    Raises:
        ValueError: If ``words`` is non-empty but contains no word with at
            least ``prefix_len`` characters. Without this guard the
            rejection loop below would never terminate.
    """
    # Only guard a non-empty dictionary: with an empty one `random.choice`
    # already raises on its own, exactly as before.
    if words and not any(len(word) >= prefix_len for word in words):
        raise ValueError(
            f"No dictionary word has at least {prefix_len} characters"
        )
    while True:
        word = random.choice(words)
        if len(word) >= prefix_len:
            return word[:prefix_len]

def get_result(prefix: str):
    """Return every entry of ``words`` that starts with ``prefix``.

    The matches keep the order in which they appear in ``words``.
    """
    matches = []
    for candidate in words:
        if candidate.startswith(prefix):
            matches.append(candidate)
    return matches

In [133]:
# Spot check: draw one 4-character prefix and list every dictionary word
# that starts with it.
prefix = get_random(prefix_len=4)
results = get_result(prefix)
print(prefix, results, sep="\n")

depr
['deprave', 'depraved', 'depraves', 'depraving', 'depravities', 'depravity', "depravity's", 'deprecate', 'deprecated', 'deprecates', 'deprecating', 'deprecation', "deprecation's", 'deprecatory', 'depreciate', 'depreciated', 'depreciates', 'depreciating', 'depreciation', "depreciation's", 'depredation', "depredation's", 'depredations', 'depress', 'depressant', "depressant's", 'depressants', 'depressed', 'depresses', 'depressing', 'depressingly', 'depression', "depression's", 'depressions', 'depressive', "depressive's", 'depressives', 'deprivation', "deprivation's", 'deprivations', 'deprive', 'deprived', 'deprives', 'depriving', 'deprogram', 'deprogramed', 'deprograming', 'deprogrammed', 'deprogramming', 'deprograms']


In [134]:
def get_cpp_results(text: str):
    """Run the C++ autocomplete test program on ``text`` and return its output.

    ``./test_autocomplete`` is invoked with ``text`` as its only argument, and
    afterwards ``output_autocomplete.txt`` is read from the current working
    directory (presumably written by that program -- confirm against the
    C++ side).

    Args:
        text: Prefix passed to the executable.

    Returns:
        The lines of ``output_autocomplete.txt`` with newline characters
        removed, in file order.

    Raises:
        subprocess.CalledProcessError: If the executable exits with a
            non-zero status.
    """
    program_path = './test_autocomplete'
    arguments = [text]
    # check=True: a failed run must surface as an error instead of silently
    # falling through to an output file left over from an earlier call.
    subprocess.run(
        [program_path] + arguments,
        text=True,
        capture_output=True,
        check=True,
    )
    with open("output_autocomplete.txt", 'r') as f:
        result = [x.replace('\n', '') for x in f.readlines()]
    return result

def get_expected_results(text):
    """Reference answer: all entries of ``words`` beginning with ``text``.

    The entries are returned in the order they appear in ``words``.
    """
    return list(filter(lambda entry: entry.startswith(text), words))

def check(text):
    """Tell whether the C++ program and the Python reference agree on ``text``.

    Returns ``True`` when both produce the same list (same elements, same
    order), ``False`` otherwise.
    """
    actual = get_cpp_results(text)
    expected = get_expected_results(text)
    return actual == expected

In [135]:
# Sanity check with a single hand-picked prefix.
check('lamasae')

True

In [136]:
# Validate the first 20 random prefixes; stop at the first disagreement
# between the C++ program and the Python reference.
for prefix in prefixes[:20]:
    print(f"Checking {prefix}")
    if check(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking co
Checking iceb
Checking Malayala
Checking ocari
Checking end
Checking waxw
Checking terro
Checking ad
Checking stiffnes
Checking f
Checking C
Checking Mosu
Checking reject
Checking corpu
Checking operation'
Checking consu
Checking hou
Checking Norse
Checking L
Checking comm


# Levenshtein

In [137]:
import textdistance

In [138]:
# Alias for the Levenshtein distance callable provided by the `textdistance`
# package; `levestein` uses it as the reference distance.
retrieve_levestein_dist = textdistance.levenshtein

In [139]:
def levestein(text, max_dist):
    """Reference search: dictionary words within ``max_dist`` of ``text``.

    Scans ``words`` in order and keeps each entry whose distance to ``text``,
    as computed by ``retrieve_levestein_dist``, is at most ``max_dist``.
    """
    return [
        candidate
        for candidate in words
        if retrieve_levestein_dist(candidate, text) <= max_dist
    ]

def get_cpp_levestein_results(text: str, dist: int):
    """Run the C++ fuzzy-search test program and return its sorted output.

    ``./test_levestein`` is invoked with ``text`` and ``dist`` (as a string)
    as arguments, and afterwards ``output_levestein.txt`` is read from the
    current working directory (presumably written by that program --
    confirm against the C++ side).

    Args:
        text: Query word passed to the executable.
        dist: Maximum distance passed to the executable.

    Returns:
        The lines of ``output_levestein.txt`` with newline characters
        removed, sorted in ascending order.

    Raises:
        subprocess.CalledProcessError: If the executable exits with a
            non-zero status.
    """
    program_path = './test_levestein'
    arguments = [text, str(dist)]
    # check=True: a failed run must surface as an error instead of silently
    # falling through to an output file left over from an earlier call.
    subprocess.run(
        [program_path] + arguments,
        text=True,
        capture_output=True,
        check=True,
    )
    with open("output_levestein.txt", 'r') as f:
        result = [x.replace('\n', '') for x in f.readlines()]
    result.sort()
    return result

def check_levestein(text, dist):
    """Tell whether the C++ program and the Python reference agree.

    Both result lists are sorted before being compared, so the verdict does
    not depend on the order in which either side emits its matches (in
    particular, not on the dictionary file being in Python's sort order).

    Args:
        text: Query word.
        dist: Maximum distance.

    Returns:
        ``True`` when both sides return the same words, ``False`` otherwise.
    """
    actual = sorted(get_cpp_levestein_results(text, dist))
    expected = sorted(levestein(text, dist))
    return actual == expected

In [140]:
# Words the C++ program reports within distance 1 of "AAA".
get_cpp_levestein_results("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [141]:
# Words the Python reference finds within distance 1 of "AAA".
levestein("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [142]:
# Compare both implementations for "AAA" at distance 1.
check_levestein("AAA", 1)

True

In [143]:
# Probability of each edit action, in the order [delete, add, replace, maintain].
maintain = 0.9
others = (1 - maintain) / 3
porcentages = [others, others, others, maintain]
# Floating-point sums are rarely exact, so compare against 1 with a tolerance;
# an exact `== 1` only holds by luck for particular values of `maintain`.
assert abs(sum(porcentages) - 1) < 1e-9
# Cumulative upper bound of each action's slice of [0, 1].
accumulated = 0
intervals = []
for porcentage in porcentages:
    accumulated += porcentage
    intervals.append(accumulated)
# Pin the last bound to exactly 1.0 so that rounding in the running sum can
# never leave a gap at the top of the range.
intervals[-1] = 1.0
intervals

[0.033333333333333326, 0.06666666666666665, 0.09999999999999998, 1.0]

In [144]:
def get_action(number: float):
    """Map a number to the index of the action whose interval contains it.

    ``intervals`` holds the cumulative upper bound of each action; the result
    is the first index whose bound is not exceeded by ``number``.

    Args:
        number: Value to classify, normally a draw from ``[0, 1]``.

    Returns:
        Index into ``intervals``. A ``number`` larger than every bound is
        assigned to the last action instead of raising ``IndexError``, so a
        final bound that falls slightly short of the draw cannot crash the
        caller.
    """
    last = len(intervals) - 1
    output = 0
    # Stop at the last index: it acts as the catch-all bucket.
    while output < last and number > intervals[output]:
        output += 1
    return output

In [145]:
# Draw one random action index to try the mapping out.
get_action(random.uniform(0, 1))

1

In [146]:
# Number of dictionary words to perturb for the fuzzy-search test
# (same value as assigned earlier for the prefix test).
num_samples = 1000

In [147]:
# Build perturbed copies of randomly sampled dictionary words. Each loop step
# draws an action: 0 = delete the current character, 1 = insert a random
# letter, 2 = replace the current character with a random letter, 3 = keep it.
words_levestein = []
originals = []
for word in random.sample(words, num_samples):
    mutated = []  # characters of the perturbed word, joined at the end
    pos = 0       # index of the next unread character of `word`
    while pos != len(word):
        action = get_action(random.uniform(0, 1))
        if action in (1, 2):
            # Insert and replace both emit a random ASCII letter.
            mutated.append(random.choice(string.ascii_letters))
        elif action == 3:
            mutated.append(word[pos])
        # Every action except insertion consumes one source character.
        if action in (0, 2, 3):
            pos += 1
    words_levestein.append("".join(mutated))
    originals.append(word)

In [148]:
# Inspect one (perturbed, original) pair.
index = 1
words_levestein[index], originals[index]

('optionallYy', 'optionally')

In [151]:
# Check the first 20 perturbed words at distances 0..3 and record every
# (word, distance) pair on which the two implementations disagree.
errors = []
for word in words_levestein[:20]:
    for dist in range(4):
        if check_levestein(word, dist):
            continue
        print(f"Failed for {word} and {dist}")
        errors.append((word, dist))

Failed for malaise's and 3
Failed for shits and 3
Failed for walk and 3
Failed for hoe and 2
Failed for hoe and 3
Failed for He and 2
Failed for He and 3


In [152]:
# Re-run the first recorded failure on both implementations to compare them.
# NOTE: raises IndexError when `errors` is empty (i.e. every check passed).
word, dist = errors[0]
cpp_res = get_cpp_levestein_results(word, dist)
real_res = levestein(word, dist)

In [154]:
# Query word of the failure under inspection.
word

"malaise's"

In [153]:
# Words the Python reference returns that the C++ program did not.
# NOTE(review): in the recorded run the only such word contains a non-ASCII
# character ("é"); possibly the C++ side measures distance in bytes rather
# than characters -- confirm against the C++ implementation.
[x for x in real_res if x not in cpp_res]

["Héloise's"]