In [1]:
import csv

from sources.tokenizer.custom_tokenizer import CustomTokenizer
from sources.tokenizer.comparison.lang_tokenizer import LangTokenizer
from sources.tokenizer.comparison.spacy_tokenizer import SpacyTokenizer
from sources.tokenizer.comparison.ntlk_tokenizer import NTLKTokenizer
from sources.tokenizer.comparison.language_tool.language_tool_tokenizer import LanguageToolTokenizer


# Benchmark inputs: each string below is expected to come back from a tokenizer
# as exactly ONE token. A tokenizer scores a point for an entry only when it
# emits that entry unchanged as a single token.
# Note that two entries ("+38 (095) 568 38 77" and "1 000 000") contain spaces,
# so a whitespace-splitting tokenizer cannot get them right.
tokens = [
    "будь-який",
    "бозна-що",
    "2022-му",
    "№11",
    "22:37",
    "18.05.2021",
    "15-21",
    "37-річна",
    "rename@city.kharkiv.ua",
    "+38 (095) 568 38 77",
    "$1,461",
    "-21°С",
    "-11",
    "м.",
    "вул.",
    "пл.",
    "ред.",
    "під'їздів",
    "№3,4",
    "№1-4",
    "м/с",
    "234,5",
    "м²",
    "~100",
    "км²/місяць",
    "1 000 000",
]

# Tokenizers taking part in the comparison. Commented-out entries are
# currently excluded from the run; uncomment one to add its column.
tokenizers = [
    CustomTokenizer(),
    # LangTokenizer(),
    SpacyTokenizer(),
    # NTLKTokenizer(),
    # LanguageToolTokenizer(),
]

# All expected tokens are concatenated into one text, separated by a
# space-padded divider, so every tokenizer is run once over the same input.
divider = "|"
input_text = (" " + divider + " ").join(tokens)


# Map each tokenizer's display name to the token sequence it produced.
results = dict(
    (tokenizer.name(), tokenizer.tokenize(input_text)) for tokenizer in tokenizers
)

# Build the comparison table: one row per expected token, one column per
# tokenizer. A cell is "-" when the tokenizer reproduced the expected token
# exactly as a single token; otherwise it holds the space-joined tokens the
# tokenizer actually emitted for that segment.
table = []
header = ["Expected"] + list(results.keys())
correct_counts = {name: 0 for name in results}

# One forward-only iterator per tokenizer. Walking an iterator instead of
# calling `list.pop(0)` leaves the token sequences stored in `results` intact
# (they can still be inspected or re-scored afterwards), avoids the quadratic
# cost of popping from the front of a list, and works for any iterable a
# tokenizer returns, not just `list`.
token_streams = {name: iter(tokenized) for name, tokenized in results.items()}

for expected_token in tokens:
    row = [expected_token]

    for name, stream in token_streams.items():
        # Collect this tokenizer's output up to the next divider. The divider
        # itself is consumed and discarded. A segment only ends on a token that
        # equals `divider` exactly, so if a tokenizer glues the divider to a
        # neighbouring token, the following rows shift for that tokenizer.
        found_tokens = []
        for token in stream:
            if token == divider:
                break
            found_tokens.append(token)

        if len(found_tokens) == 1 and found_tokens[0] == expected_token:
            row.append("-")
            correct_counts[name] += 1
        else:
            row.append(" ".join(found_tokens))

    table.append(row)

# Final summary row: share of expected tokens each tokenizer got exactly right.
total_tokens = len(tokens)
accuracy_row = ["Results"] + [
    f"{(correct_counts[name] / total_tokens) * 100:.2f}%" for name in results
]
table.append(accuracy_row)


# Write the comparison table (header first, then every row including the
# final accuracy row) to a UTF-8 CSV file in the current working directory.
csv_filename = "tokenization_results.csv"

with open(csv_filename, mode="w", newline="", encoding="utf-8") as file:
    csv.writer(file).writerows([header, *table])

print(f"Table saved to {csv_filename}")

# Echo the per-tokenizer score as "name: correct/total (percent%)".
print("Results:")
for name, correct_count in correct_counts.items():
    percentage = (correct_count / total_tokens) * 100
    print(f"{name}: {correct_count}/{total_tokens} ({percentage:.2f}%)")





Table saved to tokenization_results.csv
Results:
Custom Tokenizer: 25/26 (96.15%)
spacy: 14/26 (53.85%)
