123 changes: 0 additions & 123 deletions bin/word-tokenization-benchmark

This file was deleted.

21 changes: 21 additions & 0 deletions docs/notes/command_line.rst
@@ -73,6 +73,27 @@ You can use some thainlp functions directly from command line.

$ thainlp data --help

**Benchmark**::

thainlp benchmark word-tokenization --input-file <source> --test-file <label> [--save-details]

*Example*::

$ thainlp benchmark word-tokenization --input-file wisesight-1000-deepcut.txt --test-file wisesight-1000.label
Benchmarking wisesight-1000-deepcut.txt against wisesight-1000.label with 993 samples in total
============== Benchmark Result ==============
char_level:tp 17654.0000
char_level:fn 1153.0000
char_level:tn 50755.0000
char_level:fp 1478.0000
char_level:precision 0.9227
char_level:recall 0.9387
word_level:total_words_in_sample 19132.0000
word_level:total_words_in_ref_sample 18807.0000
word_level:correctly_tokenised_words 15637.0000
word_level:precision 0.8173
word_level:recall 0.8314
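
The summary metrics follow directly from the aggregated counts: ``char_level:precision = tp / (tp + fp) = 17654 / (17654 + 1478) ≈ 0.9227`` and ``char_level:recall = tp / (tp + fn) = 17654 / (17654 + 1153) ≈ 0.9387``; word-level precision and recall divide ``correctly_tokenised_words`` by ``total_words_in_sample`` and ``total_words_in_ref_sample`` respectively. With ``--save-details``, the summary is also written to ``eval-<input>.yml`` and the per-sample comparisons to ``eval-details-<input>.json`` in the same directory as the input file.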

**Help**::

thainlp --help
4 changes: 2 additions & 2 deletions pythainlp/cli/__init__.py
@@ -2,10 +2,10 @@
import sys
from argparse import ArgumentParser

from pythainlp.cli import data, soundex, tag, tokenize
from pythainlp.cli import data, soundex, tag, tokenize, benchmark

# a command should be a verb when possible
COMMANDS = sorted(["data", "soundex", "tag", "tokenize"])
COMMANDS = sorted(["data", "soundex", "tag", "tokenize", "benchmark"])

CLI_NAME = "thainlp"

169 changes: 169 additions & 0 deletions pythainlp/cli/benchmark.py
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import argparse
import json
import os

import yaml

from pythainlp import cli
from pythainlp.benchmarks import word_tokenization


def _read_file(path):
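"""Read a UTF-8 text file and return its lines with surrounding whitespace stripped."""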
with open(path, "r", encoding="utf-8") as f:
lines = map(lambda r: r.strip(), f.readlines())
return list(lines)


class App:
def __init__(self, argv):
parser = argparse.ArgumentParser(
prog="benchmark",
description=(
"Benchmark for various tasks;"
"currently, we have only for word tokenization."
),
usage=(
"thainlp benchmark [task] [task-options]\n\n"
"tasks:\n\n"
"word-tokenization benchmark word tokenization\n\n"
"--"
),
)

parser.add_argument(
"task", type=str, help="[word-tokenization]"
)

args = parser.parse_args(argv[2:3])
cli.exit_if_empty(args.task, parser)
task = str.lower(args.task)

task_argv = argv[3:]
if task == "word-tokenization":
WordTokenizationBenchmark(task, task_argv)


class WordTokenizationBenchmark:
def __init__(self, name, argv):
parser = argparse.ArgumentParser(**cli.make_usage("benchmark " + name))

parser.add_argument(
"--input-file",
action="store",
help="Path to input file to compare against the test file",
)

parser.add_argument(
"--test-file",
action="store",
help="Path to test file i.e. ground truth",
)

parser.add_argument(
"--save-details",
default=False,
action="store_true",
help=(
"Save comparison details to files (eval-XXX.json"
" and eval-details-XXX.json)"
)
)

args = parser.parse_args(argv)

actual = _read_file(args.input_file)
expected = _read_file(args.test_file)

assert len(actual) == len(expected), \
"Input and test files do not have the same number of samples"

print(
"Benchmarking %s against %s with %d samples in total"
% (args.input_file, args.test_file, len(actual))
)

df_raw = word_tokenization.benchmark(expected, actual)

columns = [
"char_level:tp",
"char_level:fp",
"char_level:tn",
"char_level:fn",
"word_level:correctly_tokenised_words",
"word_level:total_words_in_sample",
"word_level:total_words_in_ref_sample",
]

statistics = dict()
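# Sum each per-sample count over the whole corpus; precision and recall are derived from these totals below.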

for c in columns:
statistics[c] = float(df_raw[c].sum())

statistics["char_level:precision"] = statistics["char_level:tp"] / (
statistics["char_level:tp"] + statistics["char_level:fp"]
)

statistics["char_level:recall"] = statistics["char_level:tp"] / (
statistics["char_level:tp"] + statistics["char_level:fn"]
)

statistics["word_level:precision"] = \
statistics["word_level:correctly_tokenised_words"] \
/ statistics["word_level:total_words_in_sample"]

statistics["word_level:recall"] = \
statistics["word_level:correctly_tokenised_words"] \
/ statistics["word_level:total_words_in_ref_sample"]

print("============== Benchmark Result ==============")

for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
c = f"char_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")

for c in [
"total_words_in_sample",
"total_words_in_ref_sample",
"correctly_tokenised_words",
"precision",
"recall"
]:
c = f"word_level:{c}"
v = statistics[c]
print(f"{c:>40s} {v:.4f}")

if args.save_details:
dir_name = os.path.dirname(args.input_file)
file_name = args.input_file.split("/")[-1].split(".")[0]

res_path = "%s/eval-%s.yml" % (dir_name, file_name)
print("Evaluation result is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as outfile:
yaml.dump(statistics, outfile, default_flow_style=False)

res_path = "%s/eval-details-%s.json" % (dir_name, file_name)
print("Details of comparisons is saved to %s" % res_path)

with open(res_path, "w", encoding="utf-8") as f:
samples = []
for i, r in enumerate(df_raw.to_dict("records")):
expected, actual = r["expected"], r["actual"]
del r["expected"]
del r["actual"]

samples.append(
dict(
metrics=r,
expected=expected,
actual=actual, id=i
)
)

details = dict(metrics=statistics, samples=samples)

json.dump(details, f, ensure_ascii=False)
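
For a quick check outside the CLI, the same benchmark can be driven programmatically. The following is a minimal sketch, assuming tokens in each sample are separated by ``|`` (as in the CLI's input and test files); the two sample strings are made up for illustration, while the function name and the ``char_level:*`` columns come from the code above::

    # Minimal sketch: programmatic use of the word-tokenization benchmark.
    # The sample sentences are illustrative; tokens are separated by "|".
    from pythainlp.benchmarks import word_tokenization

    expected = ["ฉัน|รัก|ภาษา|ไทย"]   # ground truth
    actual = ["ฉัน|รัก|ภาษาไทย"]      # tokenizer output to evaluate

    df = word_tokenization.benchmark(expected, actual)

    tp = float(df["char_level:tp"].sum())
    fp = float(df["char_level:fp"].sum())
    fn = float(df["char_level:fn"].sum())

    print("char-level precision:", tp / (tp + fp))
    print("char-level recall:", tp / (tp + fn))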
2 changes: 2 additions & 0 deletions requirements.txt
@@ -1,3 +1,5 @@
python-crfsuite==0.9.*
requests==2.23.*
tinydb==4.1.*
PyYAML==5.3.1
numpy==1.18.5
6 changes: 2 additions & 4 deletions setup.py
@@ -46,7 +46,7 @@

extras = {
"attacut": ["attacut>=1.0.6"],
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24"],
"benchmarks": ["numpy>=1.16.1", "pandas>=0.24", "PyYAML>=5.3.1"],
"icu": ["pyicu>=2.3"],
"ipa": ["epitran>=1.1"],
"ml": ["numpy>=1.16", "torch>=1.0.0"],
@@ -55,6 +55,7 @@
"thai2rom": ["torch>=1.0.0", "numpy>=1.16.1"],
"wordnet": ["nltk>=3.3.*"],
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
"emoji>=0.5.1",
"epitran>=1.1",
@@ -130,9 +131,6 @@
"Topic :: Text Processing :: General",
"Topic :: Text Processing :: Linguistic",
],
scripts=[
"bin/word-tokenization-benchmark",
],
entry_points={
"console_scripts": [
"thainlp = pythainlp.__main__:main",