123 changes: 0 additions & 123 deletions bin/word-tokenization-benchmark

This file was deleted.

21 changes: 21 additions & 0 deletions docs/notes/command_line.rst
@@ -73,6 +73,27 @@ You can use some thainlp functions directly from command line.

$ thainlp data --help

**Benchmark**::

thainlp benchmark word-tokenization --input-file <source> --test-file <label> [--save-details]

*Example*::

$ thainlp benchmark word-tokenization --input-file wisesight-1000-deepcut.txt --test-file wisesight-1000.label
Benchmarking wisesight-1000-deepcut.txt against wisesight-1000.label with 993 samples in total
============== Benchmark Result ==============
char_level:tp 17654.0000
char_level:fn 1153.0000
char_level:tn 50755.0000
char_level:fp 1478.0000
char_level:precision 0.9227
char_level:recall 0.9387
word_level:total_words_in_sample 19132.0000
word_level:total_words_in_ref_sample 18807.0000
word_level:correctly_tokenised_words 15637.0000
word_level:precision 0.8173
word_level:recall 0.8314
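
The summary metrics follow directly from the aggregated counts: ``char_level:precision = tp / (tp + fp) = 17654 / (17654 + 1478) ≈ 0.9227`` and ``char_level:recall = tp / (tp + fn) = 17654 / (17654 + 1153) ≈ 0.9387``; word-level precision and recall divide ``correctly_tokenised_words`` by ``total_words_in_sample`` and ``total_words_in_ref_sample`` respectively. With ``--save-details``, the summary is also written to ``eval-<input>.yml`` and the per-sample comparisons to ``eval-details-<input>.json`` in the same directory as the input file.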

**Help**::

thainlp --help
4 changes: 2 additions & 2 deletions pythainlp/cli/__init__.py
@@ -2,10 +2,10 @@
import sys
from argparse import ArgumentParser

from pythainlp.cli import data, soundex, tag, tokenize
from pythainlp.cli import data, soundex, tag, tokenize, benchmark

# a command should be a verb when possible
COMMANDS = sorted(["data", "soundex", "tag", "tokenize"])
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])

CLI_NAME = "thainlp"

169 changes: 169 additions & 0 deletions pythainlp/cli/benchmark.py
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
import os

import yaml

from pythainlp import cli
from pythainlp.benchmarks import word_tokenization


def _read_file(path):
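"""Read a UTF-8 text file and return its lines with surrounding whitespace stripped."""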
with open(path, "r", encoding="utf-8") as f:
lines = map(lambda r: r.strip(), f.readlines())
return list(lines)


class App:
def __init__(self, argv):
parser = argparse.ArgumentParser(
prog="benchmark",
description=(
"Benchmark for various tasks;"
"currently, we have only for word tokenization."
),
usage=(
"thainlp benchmark [task] [task-options]\n\n"
"tasks:\n\n"
"word-tokenization benchmark word tokenization\n\n"
"--"
),
)

parser.add_argument(
"task", type=str, help="[word-tokenization]"
)

args = parser.parse_args(argv[2:3])
cli.exit_if_empty(args.task, parser)
task = str.lower(args.task)

task_argv = argv[3:]
if task == "word-tokenization":
WordTokenizationBenchmark(task, task_argv)


class WordTokenizationBenchmark:
def __init__(self, name, argv):
parser = argparse.ArgumentParser(**cli.make_usage("benchmark " + name))

parser.add_argument(
"--input-file",
action="store",
help="Path to input file to compare against the test file",
)

parser.add_argument(
"--test-file",
action="store",
help="Path to test file i.e. ground truth",
)

parser.add_argument(
"--save-details",
default=False,
action="store_true",
help=(
"Save comparison details to files (eval-XXX.json"
" and eval-details-XXX.json)"
)
)

args = parser.parse_args(argv)

actual = _read_file(args.input_file)
expected = _read_file(args.test_file)

assert len(actual) == len(expected), \
"Input and test files do not have the same number of samples"

print(
"Benchmarking %s against %s with %d samples in total"
% (args.input_file, args.test_file, len(actual))
)

df_raw = word_tokenization.benchmark(expected, actual)

columns = [
"char_level:tp",
"char_level:fp",
"char_level:tn",
"char_level:fn",
"word_level:correctly_tokenised_words",
"word_level:total_words_in_sample",
"word_level:total_words_in_ref_sample",
]

statistics = dict()
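# Sum each per-sample count over the whole corpus; precision and recall are derived from these totals below.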

for c in columns:
statistics[c] = float(df_raw[c].sum())

statistics["char_level:precision"] = statistics["char_level:tp"] / (
statistics["char_level:tp"] + statistics["char_level:fp"]
)

statistics["char_level:recall"] = statistics["char_level:tp"] / (
statistics["char_level:tp"] + statistics["char_level:fn"]
)

statistics["word_level:precision"] = \
statistics["word_level:correctly_tokenised_words"] \
/ statistics["word_level:total_words_in_sample"]

statistics["word_level:recall"] = \
statistics["word_level:correctly_tokenised_words"] \
/ statistics["word_level:total_words_in_ref_sample"]

print("============== Benchmark Result ==============")

for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
c = f"char_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")

for c in [
"total_words_in_sample",
"total_words_in_ref_sample",
"correctly_tokenised_words",
"precision",
"recall"
]:
c = f"word_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")

if args.save_details:
dir_name = os.path.dirname(args.input_file)
file_name = args.input_file.split("/")[-1].split(".")[0]

res_path = "%s/eval-%s.yml" % (dir_name, file_name)
print("Evaluation result is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as outfile:
yaml.dump(statistics, outfile, default_flow_style=False)

res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
print("Details of comparisons is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as f:
samples = []
for i, r in enumerate(df_raw.to_dict("records")):
expected, actual = r["expected"], r["actual"]
del r["expected"]
del r["actual"]

samples.append(
dict(
metrics=r,
expected=expected,
actual=actual, id=i
)
)

details = dict(metrics=statistics, samples=samples)

json.dump(details, f, ensure_ascii=False)
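
For a quick check outside the CLI, the same benchmark can be driven programmatically. The following is a minimal sketch, assuming tokens in each sample are separated by ``|`` (as in the CLI's input and test files); the two sample strings are made up for illustration, while the function name and the ``char_level:*`` columns come from the code above::

    # Minimal sketch: programmatic use of the word-tokenization benchmark.
    # The sample sentences are illustrative; tokens are separated by "|".
    from pythainlp.benchmarks import word_tokenization

    expected = ["ฉัน|รัก|ภาษา|ไทย"]   # ground truth
    actual = ["ฉัน|รัก|ภาษาไทย"]      # tokenizer output to evaluate

    df = word_tokenization.benchmark(expected, actual)

    tp = float(df["char_level:tp"].sum())
    fp = float(df["char_level:fp"].sum())
    fn = float(df["char_level:fn"].sum())

    print("char-level precision:", tp / (tp + fp))
    print("char-level recall:", tp / (tp + fn))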
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,3 +1,5 @@
python-crfsuite==0.9.*
requests==2.23.*
tinydb==4.1.*
PyYAML==5.3.1
numpy==1.18.5
6 changes: 2 additions & 4 deletions setup.py
@@ -46,7 +46,7 @@

extras = {
"attacut": ["attacut>=1.0.6"],
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24"],
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24", "PyYAML>=5.3.1"],
"icu": ["pyicu>=2.3"],
"ipa": ["epitran>=1.1"],
"ml": ["numpy>=1.16", "torch>=1.0.0"],
@@ -55,6 +55,7 @@
"thai2rom": ["torch>=1.0.0", "numpy>=1.16.1"],
"wordnet": ["nltk>=3.3.*"],
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
"emoji>=0.5.1",
"epitran>=1.1",
@@ -130,9 +131,6 @@
"Topic :: Text Processing :: General",
"Topic :: Text Processing :: Linguistic",
],
scripts=[
"bin/word-tokenization-benchmark",
],
entry_points={
"console_scripts": [
"thainlp = pythainlp.__main__:main",