# Test Notebook

Before running this notebook:
-  Build the project first
```bash
mkdir build
cd build
cmake ..
make test
```
- Then move this notebook to the `bin` folder:
```bash
mv test.ipynb ../bin
```

## Code

In [1]:
import random
import string
import subprocess

Read the dictionary

In [2]:
# Load the dictionary: one word per line, with the line terminator removed.
with open("../data/american-english-sorted", 'r') as dictionary_file:
    words = [line.replace('\n', '') for line in dictionary_file]

Generate random prefixes

In [3]:
# Number of dictionary words drawn when building the random prefix list.
num_samples = 1000

In [4]:
# Build one random proper prefix (1 .. len-1 characters) per sampled word.
prefixes = []
for word in random.sample(words, num_samples):
    # A word shorter than two characters has no proper prefix; redraw until one fits.
    while len(word) < 2:
        word = random.choice(words)
    prefixes.append(word[:random.randint(1, len(word) - 1)])

In [5]:
# Preview the first five generated prefixes.
prefixes[:5]

['forbi', 'sabotage', 'symptom', 'damson', 'Flemi']

To generate random prefixes with a fixed length:

In [6]:
def get_random(prefix_len: int):
    """Return the first ``prefix_len`` characters of a randomly chosen dictionary word.

    Only words with at least ``prefix_len`` characters are eligible, and each
    eligible word is equally likely to be picked.

    Args:
        prefix_len: Length of the prefix to return.

    Returns:
        The ``prefix_len``-character prefix of the chosen word.

    Raises:
        ValueError: If no word in ``words`` has at least ``prefix_len`` characters.
    """
    # Filter up front instead of redrawing until a long-enough word shows up:
    # redrawing never terminates when no word is long enough.
    candidates = [word for word in words if len(word) >= prefix_len]
    if not candidates:
        raise ValueError(f"No word has at least {prefix_len} characters")
    return random.choice(candidates)[:prefix_len]

def get_result(prefix: str):
    """Return every entry of ``words`` that starts with ``prefix``, in ``words`` order."""
    matches = []
    for candidate in words:
        if candidate.startswith(prefix):
            matches.append(candidate)
    return matches

In [7]:
# Draw one random 4-character prefix and show every dictionary word that starts with it.
prefix = get_random(4)
results = get_result(prefix)
print(prefix, results, sep='\n')

demi
['demigod', "demigod's", 'demigods', 'demijohn', "demijohn's", 'demijohns', 'demilitarization', "demilitarization's", 'demilitarize', 'demilitarized', 'demilitarizes', 'demilitarizing', 'demise', "demise's", 'demised', 'demises', 'demising', 'demitasse', "demitasse's", 'demitasses']


In [8]:
def get_cpp_results(text: str):
    """Run the C++ autocomplete test binary on ``text`` and return its output lines.

    Executes ``./test_autocomplete`` with ``text`` as its only argument, then
    reads ``output_autocomplete.txt`` from the current working directory.
    NOTE(review): the binary is presumably what writes that file — confirm.

    Args:
        text: Prefix passed to the binary.

    Returns:
        The lines of ``output_autocomplete.txt`` without their line terminators.

    Raises:
        subprocess.CalledProcessError: If the binary exits with a non-zero status.
    """
    program_path = './test_autocomplete'
    # check=True: a failed run must surface as an error; otherwise the output
    # file left behind by an earlier run would be read as this run's result.
    subprocess.run([program_path, text], text=True, capture_output=True, check=True)
    with open("output_autocomplete.txt", 'r') as f:
        return [line.rstrip('\n') for line in f]

def get_expected_results(text):
    """Python reference answer: the entries of ``words`` that start with ``text``."""
    return list(filter(lambda candidate: candidate.startswith(text), words))

def check(text):
    """Return True when the C++ output for ``text`` equals the Python reference list."""
    actual = get_cpp_results(text)
    expected = get_expected_results(text)
    return actual == expected

In [9]:
# Smoke test: compare the C++ binary with the Python reference for a single prefix.
check('lamasae')

True

In [10]:
# Validate the first 20 random prefixes, stopping at the first mismatch.
for prefix in prefixes[:20]:
    print(f"Checking {prefix}")
    if check(prefix):
        continue
    raise Exception(f"Failed for {prefix}")

Checking forbi
Checking sabotage
Checking symptom
Checking damson
Checking Flemi
Checking Mash
Checking Franc
Checking telecon
Checking Mes
Checking port
Checking stamen
Checking upgrad
Checking Tweedl
Checking ten
Checking Galv
Checking thi
Checking ki
Checking Strindberg'
Checking puckere
Checking analo


# Levenshtein

In [11]:
import textdistance

In [12]:
# Alias for textdistance's Levenshtein distance, used by the Python reference search.
retrieve_levestein_dist = textdistance.levenshtein

In [13]:
def levestein(text, max_dist):
    """Return the entries of ``words`` whose distance to ``text`` is at most ``max_dist``.

    The distance is computed with ``retrieve_levestein_dist``; results keep the
    order in which they appear in ``words``.
    """
    return [
        candidate
        for candidate in words
        if retrieve_levestein_dist(candidate, text) <= max_dist
    ]

def get_cpp_levestein_results(text: str, dist: int):
    """Run the C++ Levenshtein test binary and return its output lines, sorted.

    Executes ``./test_levestein`` with ``text`` and ``dist`` as arguments, then
    reads ``output_levestein.txt`` from the current working directory.
    NOTE(review): the binary is presumably what writes that file — confirm.

    Args:
        text: Query word passed to the binary.
        dist: Maximum distance, passed to the binary as a string.

    Returns:
        The lines of ``output_levestein.txt`` without their line terminators,
        sorted with Python's default string ordering.

    Raises:
        subprocess.CalledProcessError: If the binary exits with a non-zero status.
    """
    program_path = './test_levestein'
    # check=True: a failed run must surface as an error; otherwise the output
    # file left behind by an earlier run would be read as this run's result.
    subprocess.run([program_path, text, str(dist)], text=True, capture_output=True, check=True)
    with open("output_levestein.txt", 'r') as f:
        return sorted(line.rstrip('\n') for line in f)

def check_levestein(text, dist):
    """Return True when the C++ and Python searches find the same words.

    Both result lists are sorted before comparing. The C++ results arrive
    sorted by Python's string ordering while the reference results follow the
    order of ``words``, so comparing them directly would report a mismatch
    whenever those two orders differ, even for identical sets of words.
    """
    actual = get_cpp_levestein_results(text, dist)
    expected = levestein(text, dist)
    return sorted(actual) == sorted(expected)

In [14]:
# Words the C++ binary reports for "AAA" with a maximum distance of 1.
get_cpp_levestein_results("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [15]:
# The same query answered by the Python reference search.
levestein("AAA", 1)

['AA', 'AAA', 'AMA', 'FAA']

In [16]:
# Compare the C++ and Python results for this query.
check_levestein("AAA", 1)

True

In [17]:
# Probability of each edit action, indexed as [delete, add, replace, maintain].
maintain = 0.9
others = (1 - maintain) / 3
porcentages = [others, others, others, maintain]
# Compare with a tolerance: these are binary floats, so the sum is not
# guaranteed to be exactly 1 for every choice of `maintain`.
assert abs(sum(porcentages) - 1) < 1e-9
# Cumulative upper bound of each action's slice of [0, 1].
accumulated = 0
intervals = []
for porcentage in porcentages:
    accumulated += porcentage
    intervals.append(accumulated)
# Pin the final bound to exactly 1 so that no draw from [0, 1] can exceed
# every bound because of accumulated rounding error.
intervals[-1] = 1.0

In [18]:
def get_action(number: float):
    """Map ``number`` to an action index using the cumulative bounds in ``intervals``.

    Returns the index of the first bound in ``intervals`` that is greater than
    or equal to ``number``. The scan stops at the last index, so a ``number``
    above every bound maps to the last action instead of raising IndexError.

    Args:
        number: Value to look up in ``intervals``.

    Returns:
        Index into ``intervals`` of the selected action.
    """
    last = len(intervals) - 1
    output = 0
    while output < last and number > intervals[output]:
        output += 1
    return output

In [19]:
# Number of dictionary words to perturb for the distance tests (rebinds the earlier value).
num_samples = 1000

In [20]:
# Build a noisy test set: walk each sampled word character by character and
# randomly delete, insert, replace, or keep characters.
words_levestein = []
originals = []
for word in random.sample(words, num_samples):
    pieces = []
    position = 0
    while position != len(word):
        action = get_action(random.uniform(0, 1))
        if action in (1, 2):
            # Add (1) or replace (2): emit a random ASCII letter.
            pieces.append(random.choice(string.ascii_letters))
        elif action == 3:
            # Maintain: copy the current character unchanged.
            pieces.append(word[position])
        # Delete (0), replace (2) and maintain (3) consume the current
        # character; add (1) leaves the position where it is.
        if action in (0, 2, 3):
            position += 1
    words_levestein.append("".join(pieces))
    originals.append(word)

In [21]:
# Show one perturbed word next to the dictionary word it was derived from.
index = 1
words_levestein[index], originals[index]

("spxout's", "spout's")

In [22]:
# Check the first 100 perturbed words at distances 0 through 3 and record
# every (word, distance) pair on which the two implementations disagree.
errors = []
for word in words_levestein[:100]:
    for dist in range(4):
        if check_levestein(word, dist):
            continue
        print(f"Failed for {word} and {dist}")
        errors.append((word, dist))

In [23]:
# Re-run one specific query so both result lists can be inspected by hand.
word = "musOrroom"
dist = 2
cpp_res = get_cpp_levestein_results(text=word, dist=dist)
real_res = levestein(text=word, max_dist=dist)

In [24]:
# Reference matches that are absent from the C++ output (an empty list means none are missing).
word, [x for x in real_res if x not in cpp_res]

('musOrroom', [])