Skip to content

Commit

Permalink
Add postag of Thai Discourse Treebank
Browse files Browse the repository at this point in the history
The Thai Discourse Treebank (TDTB)

GitHub: https://github.com/nlp-chula/thai-discourse-treebank/tree/main

Ponrawee Prasertsom, Apiwat Jaroonpol, Attapol T. Rutherford; The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives. Transactions of the Association for Computational Linguistics 2024; 12 613–629. doi: https://doi.org/10.1162/tacl_a_00650
  • Loading branch information
wannaphong committed May 6, 2024
1 parent a38fd5e commit 634a8fb
Show file tree
Hide file tree
Showing 8 changed files with 48 additions and 0 deletions.
1 change: 1 addition & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,3 +157,4 @@ Thanks to all [contributors](https://github.com/PyThaiNLP/pythainlp/graphs/contr
- **[Thai Character Cluster]** -- T. Teeramunkong, V. Sornlertlamvanich, T. Tanhermhong and W. Chinnan, “Character cluster based Thai information retrieval,” in IRAL '00 Proceedings of the fifth international workshop on Information retrieval with Asian languages, 2000.
- **[Enhanced Thai Character Cluster]** -- Jeeragone Inrut, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth. “Thai word segmentation using combination of forward and backward longest matching techniques.” In International Symposium on Communications and Information Technology (ISCIT), pp. 37-40. 2001.
- เพ็ญศิริ ลี้ตระกูล. การเลือกประโยคสำคัญในการสรุปความภาษาไทย โดยใช้แบบจำลองแบบลำดับชั้น (Selection of Important Sentences in Thai Text Summarization Using a Hierarchical Model). Retrieved from http://digi.library.tu.ac.th/thesis/st/0192/
- **[Thai Discourse Treebank]** -- Ponrawee Prasertsom, Apiwat Jaroonpol, Attapol T. Rutherford; The Thai Discourse Treebank: Annotating and Classifying Thai Discourse Connectives. Transactions of the Association for Computational Linguistics 2024; 12 613–629. doi: https://doi.org/10.1162/tacl_a_00650
2 changes: 2 additions & 0 deletions pythainlp/corpus/corpus_license.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ https://creativecommons.org/licenses/by/4.0/
| pos_ud_perceptron.pkl | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using perceptron |
| pos_ud_unigram.json | Part-of-speech tagging model, trained from Parallel Universal Dependencies treebank, using unigram |
| sentenceseg_crfcut.model | Sentence segmentation model, trained from TED subtitles, using CRF |
| tdtb-pt_tagger.json | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using perceptron |
| tdtb-unigram_tagger.json | Part-of-speech tagging model, trained from The Thai Discourse Treebank, using unigram |


## Thai Dictionary for ICU BreakIterator
Expand Down
1 change: 1 addition & 0 deletions pythainlp/corpus/tdtb-pt_tagger.json

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pythainlp/corpus/tdtb-unigram_tagger.json

Large diffs are not rendered by default.

13 changes: 13 additions & 0 deletions pythainlp/tag/perceptron.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,15 @@
_PUD_FILENAME = "pos_ud_perceptron-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

_TDTB_FILENAME = "tdtb-pt_tagger.json"
_TDTB_PATH = os.path.join(corpus_path(), _TDTB_FILENAME)

_BLACKBOARD_NAME = "blackboard_pt_tagger"

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None
_TDTB_TAGGER = None


def _orchid_tagger():
Expand All @@ -44,6 +48,13 @@ def _blackboard_tagger():
return _LST20_TAGGER


def _tdtb():
    """Return the shared perceptron tagger for the Thai Discourse Treebank.

    The tagger is built from ``_TDTB_PATH`` the first time it is needed and
    stored in the module-level ``_TDTB_TAGGER`` so later calls reuse it.

    :return: the cached ``PerceptronTagger`` instance
    """
    global _TDTB_TAGGER
    if _TDTB_TAGGER:
        # Built on an earlier call; hand back the same instance.
        return _TDTB_TAGGER
    _TDTB_TAGGER = PerceptronTagger(path=_TDTB_PATH)
    return _TDTB_TAGGER


def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
"""
:param list words: a list of tokenized words
Expand All @@ -67,6 +78,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
words = blackboard.pre_process(words)
word_tags = _blackboard_tagger().tag(words)
word_tags = blackboard.post_process(word_tags, to_ud)
elif corpus in ("tdtb"):
word_tags = _tdtb().tag(words)
else: # by default, use "pud" for corpus
tagger = _pud_tagger()
word_tags = tagger.tag(words)
Expand Down
4 changes: 4 additions & 0 deletions pythainlp/tag/pos_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ def pos_tag(
* *pud* - `Parallel Universal Dependencies (PUD)\
<https://github.com/UniversalDependencies/UD_Thai-PUD>`_ \
treebanks, natively use Universal POS tags
* *tdtb* - `Thai Discourse Treebank \
<https://github.com/nlp-chula/thai-discourse-treebank/tree/main>`_ \
, natively use Universal POS tags
* *tnc* - Thai National Corpus (support tltk engine only)
:return: a list of tuples (word, POS tag)
:rtype: list[tuple[str, str]]
Expand Down Expand Up @@ -96,6 +99,7 @@ def pos_tag(
"orchid",
"orchid_ud",
"pud",
"tdtb",
]

if engine == "perceptron" and corpus in _support_corpus:
Expand Down
14 changes: 14 additions & 0 deletions pythainlp/tag/unigram.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,15 @@
_PUD_FILENAME = "pos_ud_unigram-v0.2.json"
_PUD_PATH = os.path.join(corpus_path(), _PUD_FILENAME)

_TDTB_FILENAME = "tdtb-unigram_tagger.json"
_TDTB_PATH = os.path.join(corpus_path(), _TDTB_FILENAME)

_BLACKBOARD_NAME = "blackboard_unigram_tagger"

_ORCHID_TAGGER = None
_PUD_TAGGER = None
_BLACKBOARD_TAGGER = None
_TDTB_TAGGER = None


def _orchid_tagger():
Expand Down Expand Up @@ -49,6 +53,14 @@ def _blackboard_tagger():
return _BLACKBOARD_TAGGER


def _thai_tdtb():
    """Return the unigram tag data for the Thai Discourse Treebank.

    The JSON file at ``_TDTB_PATH`` is read on the first call and the parsed
    object is cached in the module-level ``_TDTB_TAGGER``; later calls return
    the cached object without opening the file again.

    :return: the object parsed from the TDTB unigram JSON file
    """
    global _TDTB_TAGGER
    # Test against None rather than truthiness: an empty dictionary is falsy
    # and would otherwise be treated as "not loaded" and re-read every call.
    if _TDTB_TAGGER is None:
        # "utf-8-sig" also accepts a file that begins with a byte-order mark.
        with open(_TDTB_PATH, encoding="utf-8-sig") as fh:
            _TDTB_TAGGER = json.load(fh)
    return _TDTB_TAGGER


def _find_tag(
words: List[str], dictdata: dict, default_tag: str = ""
) -> List[Tuple[str, str]]:
Expand Down Expand Up @@ -82,6 +94,8 @@ def tag(words: List[str], corpus: str = "pud") -> List[Tuple[str, str]]:
words = blackboard.pre_process(words)
word_tags = _find_tag(words, _blackboard_tagger())
word_tags = blackboard.post_process(word_tags, to_ud)
elif corpus in ("tdtb"):
word_tags = _find_tag(words, _thai_tdtb())
else: # by default, use "pud" for corpus
word_tags = _find_tag(words, _pud_tagger())

Expand Down
12 changes: 12 additions & 0 deletions tests/test_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ def test_pos_tag(self):
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="blackboard_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="unigram", corpus="tdtb")
)
self.assertIsNotNone(
pos_tag([""], engine="unigram", corpus="tdtb")
)
self.assertEqual(
pos_tag(["คุณ", "กำลัง", "ประชุม"], engine="unigram"),
[("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
Expand Down Expand Up @@ -103,6 +109,12 @@ def test_pos_tag(self):
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="blackboard_ud")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tdtb")
)
self.assertIsNotNone(
pos_tag(tokens, engine="perceptron", corpus="tdtb")
)
self.assertIsNotNone(pos_tag(tokens, engine="tltk"))

self.assertEqual(pos_tag_sents(None), [])
Expand Down

0 comments on commit 634a8fb

Please sign in to comment.