Add postag of Thai Discourse Treebank

The Thai Discourse Treebank (TDTB) GitHub: https://github.com/nlp-chula/thai-discourse-treebank/tree/main Ponrawee Prasertsom, Apiwat Jaroonpol, Attapol T. Rutherford; The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives. Transactions of the Association for Computational Linguistics 2024; 12 613–629. doi: https://doi.org/10.1162/tacl_a_00650
PyThaiNLP · May 6, 2024 · 634a8fb · 634a8fb
1 parent a38fd5e
commit 634a8fb
Show file tree

Hide file tree

Showing 8 changed files with 48 additions and 0 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -157,3 +157,4 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr
 - **[Thai Character Cluster]** -- T. Teeramunkong, V. Sornlertlamvanich, T. Tanhermhong and W. Chinnan, “Character cluster based Thai information retrieval,” in IRAL '00 Proceedings of the fifth international workshop on on Information retrieval with Asian languages, 2000.
 - **[Enhanced Thai Character Cluster]** -- Jeeragone Inrut, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. “Thai word segmentation using combination of forward and backward longest matching techniques.” In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001.
 - เพ็ญศิริ ลี้ตระกูล. การเลือกประโยคสำคัญในการสรุปความภาษาไทย โดยใช้แบบจำลองแบบลำดับชั้น (Selection of Important Sentences in Thai Text Summarization Using a Hierarchical Model). Retrieved from http://digi.library.tu.ac.th/thesis/st/0192/
+- **[Thai Discourse Treebank]** -- Ponrawee Prasertsom, Apiwat Jaroonpol, Attapol T. Rutherford; The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives. Transactions of the Association for Computational Linguistics 2024; 12 613–629. doi: https://doi.org/10.1162/tacl_a_00650
diff --git a/pythainlp/corpus/corpus_license.md b/pythainlp/corpus/corpus_license.md
@@ -51,6 +51,8 @@ https://creativecommons.org/licenses/by/4.0/
 | pos_ud_perceptron.pkl     | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
 | pos_ud_unigram.json       | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram    |
 | sentenceseg_crfcut.model  | Sentence segmentation model, trained from TED subtitles, using CRF                                    |
+| tdtb-pt_tagger.json       | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using perceptron              |
+| tdtb-unigram_tagger.json  | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using unigram                 |
 
 
 ## Thai Dictionary for ICU BreakIterator

diff --git a/pythainlp/corpus/tdtb-pt_tagger.json b/pythainlp/corpus/tdtb-pt_tagger.json
diff --git a/pythainlp/corpus/tdtb-unigram_tagger.json b/pythainlp/corpus/tdtb-unigram_tagger.json
diff --git a/pythainlp/tag/perceptron.py b/pythainlp/tag/perceptron.py
@@ -16,11 +16,15 @@
 _PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
 _PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)
 
+_TDTB_FILENAME = "tdtb-pt_tagger.json"
+_TDTB_PATH = os.path.join(corpus_path(), _TDTB_FILENAME)
+
 _BLACKBOARD_NAME = "blackboard_pt_tagger"
 
 _ORCHID_TAGGER = None
 _PUD_TAGGER = None
 _BLACKBOARD_TAGGER = None
+_TDTB_TAGGER = None
 
 
 def _orchid_tagger():
@@ -44,6 +48,13 @@ def _blackboard_tagger():
     return _LST20_TAGGER
 
 
+def _tdtb():
+    """
+    Return the perceptron POS tagger for the Thai Discourse Treebank (TDTB).
+
+    The tagger is built from the model file at ``_TDTB_PATH`` on the first
+    call and cached in the module-level ``_TDTB_TAGGER`` for later calls.
+
+    :return: the cached TDTB perceptron tagger
+    :rtype: PerceptronTagger
+    """
+    global _TDTB_TAGGER
+    # Compare against the None sentinel rather than truthiness, so an
+    # already-loaded tagger is never rebuilt just because it is falsy.
+    if _TDTB_TAGGER is None:
+        _TDTB_TAGGER = PerceptronTagger(path=_TDTB_PATH)
+    return _TDTB_TAGGER
+
+
 def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
     """
     :param list words: a list of tokenized words
@@ -67,6 +78,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
         words = blackboard.pre_process(words)
         word_tags = _blackboard_tagger().tag(words)
         word_tags = blackboard.post_process(word_tags, to_ud)
+    elif corpus == "tdtb":
+        # Exact match only: a membership test against the bare string
+        # "tdtb" is a substring test and would also accept "", "t", "td", ...
+        word_tags = _tdtb().tag(words)
     else:  # by default, use "pud" for corpus
         tagger = _pud_tagger()
         word_tags = tagger.tag(words)

diff --git a/pythainlp/tag/pos_tag.py b/pythainlp/tag/pos_tag.py
@@ -28,6 +28,9 @@ def pos_tag(
         * *pud* - `Parallel Universal Dependencies (PUD)\
             <https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
             treebanks, natively use Universal POS tags
+        * *tdtb* - `Thai Discourse Treebank \
+            <https://github.com/nlp-chula/thai-discourse-treebank/tree/main>`_, \
+            natively uses Universal POS tags
         * *tnc* - Thai National Corpus (support tltk engine only)
     :return: a list of tuples (word, POS tag)
     :rtype: list[tuple[str, str]]
@@ -96,6 +99,7 @@ def pos_tag(
         "orchid",
         "orchid_ud",
         "pud",
+        "tdtb",
     ]
 
     if engine == "perceptron" and corpus in _support_corpus:

diff --git a/pythainlp/tag/unigram.py b/pythainlp/tag/unigram.py
@@ -17,11 +17,15 @@
 _PUD_FILENAME = "pos_ud_unigram-v0.2.json"
 _PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)
 
+_TDTB_FILENAME = "tdtb-unigram_tagger.json"
+_TDTB_PATH = os.path.join(corpus_path(), _TDTB_FILENAME)
+
 _BLACKBOARD_NAME = "blackboard_unigram_tagger"
 
 _ORCHID_TAGGER = None
 _PUD_TAGGER = None
 _BLACKBOARD_TAGGER = None
+_TDTB_TAGGER = None
 
 
 def _orchid_tagger():
@@ -49,6 +53,14 @@ def _blackboard_tagger():
     return _BLACKBOARD_TAGGER
 
 
+def _thai_tdtb():
+    """
+    Return the unigram POS model for the Thai Discourse Treebank (TDTB).
+
+    The JSON file at ``_TDTB_PATH`` is read (as UTF-8, tolerating a BOM) on
+    the first call and the parsed result is cached in the module-level
+    ``_TDTB_TAGGER`` for later calls.
+
+    :return: the parsed content of the TDTB unigram model file
+    """
+    global _TDTB_TAGGER
+    # Compare against the None sentinel: a loaded-but-empty JSON object is
+    # falsy, and a truthiness check would re-read the file on every call.
+    if _TDTB_TAGGER is None:
+        with open(_TDTB_PATH, encoding="utf-8-sig") as fh:
+            _TDTB_TAGGER = json.load(fh)
+    return _TDTB_TAGGER
+
+
 def _find_tag(
     words: List[str], dictdata: dict, default_tag: str = ""
 ) -> List[Tuple[str, str]]:
@@ -82,6 +94,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
         words = blackboard.pre_process(words)
         word_tags = _find_tag(words, _blackboard_tagger())
         word_tags = blackboard.post_process(word_tags, to_ud)
+    elif corpus == "tdtb":
+        # Exact match only: a membership test against the bare string
+        # "tdtb" is a substring test and would also accept "", "t", "td", ...
+        word_tags = _find_tag(words, _thai_tdtb())
     else:  # by default, use "pud" for corpus
         word_tags = _find_tag(words, _pud_tagger())
 

diff --git a/tests/test_tag.py b/tests/test_tag.py
@@ -68,6 +68,12 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag([""], engine="unigram", corpus="blackboard_ud")
         )
+        self.assertIsNotNone(
+            pos_tag(tokens, engine="unigram", corpus="tdtb")
+        )
+        self.assertIsNotNone(
+            pos_tag([""], engine="unigram", corpus="tdtb")
+        )
         self.assertEqual(
             pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
             [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
@@ -103,6 +109,12 @@ def test_pos_tag(self):
         self.assertIsNotNone(
             pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
         )
+        self.assertIsNotNone(
+            pos_tag(tokens, engine="perceptron", corpus="tdtb")
+        )
+        self.assertIsNotNone(
+            pos_tag(tokens, engine="perceptron", corpus="tdtb")
+        )
         self.assertIsNotNone(pos_tag(tokens, engine="tltk"))
 
         self.assertEqual(pos_tag_sents(None), [])