Add paragraph_tokenize
Tokenizes text into paragraphs.
wannaphong committed Jun 5, 2023
1 parent a1957ad commit f71a099
Showing 4 changed files with 80 additions and 12 deletions.
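For reference, a minimal usage sketch of the new API (assuming the optional wtpsplit dependency is installed; its model weights are fetched on first use):

    from pythainlp.tokenize import paragraph_tokenize

    text = (
        "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
        " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
    )

    # The default engine is "wtp-mini" (the wtp-bert-mini model);
    # "wtp-tiny", "wtp-base", and "wtp-large" select other model sizes.
    paragraphs = paragraph_tokenize(text)

    # The result is a list of paragraphs, each a list of sentence strings.
    for i, paragraph in enumerate(paragraphs):
        print(i, paragraph)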
1 change: 1 addition & 0 deletions docs/api/tokenize.rst
@@ -10,6 +10,7 @@ Modules

.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: paragraph_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: word_tokenize
.. autofunction:: word_detokenize
2 changes: 2 additions & 0 deletions pythainlp/tokenize/__init__.py
@@ -25,6 +25,7 @@
    "subword_tokenize",
    "word_tokenize",
    "word_detokenize",
    "paragraph_tokenize",
]

from pythainlp.corpus import thai_syllables, thai_words
@@ -46,6 +47,7 @@
    subword_tokenize,
    word_tokenize,
    word_detokenize,
    paragraph_tokenize,
)

from pythainlp.corpus import get_corpus as _get_corpus
55 changes: 55 additions & 0 deletions pythainlp/tokenize/core.py
@@ -439,6 +439,61 @@ def sent_tokenize(
    return segments


def paragraph_tokenize(text: str, engine: str = "wtp-mini") -> List[List[str]]:
    """
    Paragraph tokenizer.

    Tokenizes text into paragraphs.

    :param str text: text to be tokenized
    :param str engine: the name of the paragraph tokenizer
    :return: list of paragraphs, each a list of sentences
    :rtype: List[List[str]]

    **Options for engine**
        * *wtp* - split by `wtpsplit <https://github.com/bminixhofer/wtpsplit>`_. \
          It supports several model sizes: use ``wtp`` for the mini model, \
          ``wtp-tiny`` for the ``wtp-bert-tiny`` model, \
          ``wtp-mini`` for the ``wtp-bert-mini`` model (default), \
          ``wtp-base`` for the ``wtp-canine-s-1l`` model, \
          and ``wtp-large`` for the ``wtp-canine-s-12l`` model.

    :Example:

    Split the text based on *wtp*::

        from pythainlp.tokenize import paragraph_tokenize

        sent = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต"
            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        )

        paragraph_tokenize(sent)
        # output: [
        #     ['(1) '],
        #     [
        #         'บทความนี้ผู้เขียนสังเคราะห์ขึ้นมาจากผลงานวิจัยที่เคยทำมาในอดีต ',
        #         'มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด ',
        #         'จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ',
        #         'ณ ที่นี้'
        #     ]
        # ]
    """
    if engine.startswith("wtp"):
        # "wtp" alone selects the mini model; "wtp-<size>" selects a size.
        if "-" not in engine:
            _size = "mini"
        else:
            _size = engine.split("-")[-1]
        from pythainlp.tokenize.wtsplit import tokenize as segment

        segments = segment(text, size=_size, tokenize="paragraph")
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our documentation."""
        )

    return segments


def subword_tokenize(
    text: str,
    engine: str = DEFAULT_SUBWORD_TOKENIZE_ENGINE,
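The pythainlp.tokenize.wtsplit helper that paragraph_tokenize calls is not part of this diff. Below is a hypothetical sketch of its interface, inferred from the call above and the public wtpsplit API; the _SIZE_TO_MODEL name, the fallback behavior, and the lack of model caching are assumptions:

    # Sketch only: the size-to-model mapping mirrors the docstring above;
    # WtP and its split(..., do_paragraph_segmentation=...) flag are the
    # public wtpsplit API. The real module may cache the loaded model.
    from typing import List, Union

    from wtpsplit import WtP

    _SIZE_TO_MODEL = {
        "tiny": "wtp-bert-tiny",
        "mini": "wtp-bert-mini",
        "base": "wtp-canine-s-1l",
        "large": "wtp-canine-s-12l",
    }

    def tokenize(
        text: str, size: str = "mini", tokenize: str = "sentence"
    ) -> Union[List[str], List[List[str]]]:
        # Unknown sizes fall back to the mini model (assumption).
        model = WtP(_SIZE_TO_MODEL.get(size, "wtp-bert-mini"))
        if tokenize == "paragraph":
            # A list of paragraphs, each a list of sentences.
            return model.split(text, lang_code="th", do_paragraph_segmentation=True)
        # A flat list of sentences.
        return model.split(text, lang_code="th")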
34 changes: 22 additions & 12 deletions tests/test_tokenize.py
@@ -23,6 +23,7 @@
    tltk,
    oskut,
    word_detokenize,
    paragraph_tokenize,
)
from pythainlp.tokenize import clause_tokenize as sent_clause_tokenize
from pythainlp.util import dict_trie
@@ -318,18 +319,18 @@ def test_sent_tokenize(self):
                engine="wtp-tiny",
            ),
        )
        self.assertIsNotNone(
            sent_tokenize(
                sent_3,
                engine="wtp-base",
            ),
        )
        self.assertIsNotNone(
            sent_tokenize(
                sent_3,
                engine="wtp-large",
            ),
        )
        # self.assertIsNotNone(
        #     sent_tokenize(
        #         sent_3,
        #         engine="wtp-base",
        #     ),
        # )
        # self.assertIsNotNone(
        #     sent_tokenize(
        #         sent_3,
        #         engine="wtp-large",
        #     ),
        # )
        self.assertFalse(
            " "
            in sent_tokenize(
@@ -341,6 +342,15 @@
        with self.assertRaises(ValueError):
            sent_tokenize("ฉันไป กิน", engine="XX")  # engine does not exist

    def test_paragraph_tokenize(self):
        sent = (
            "(1) บทความนี้ผู้เขียนสังเคราะห์ขึ้นมา"
            + "จากผลงานวิจัยที่เคยทำมาในอดีต"
            + " มิได้ทำการศึกษาค้นคว้าใหม่อย่างกว้างขวางแต่อย่างใด"
            + " จึงใคร่ขออภัยในความบกพร่องทั้งปวงมา ณ ที่นี้"
        )
        self.assertIsNotNone(paragraph_tokenize(sent))

    def test_subword_tokenize(self):
        self.assertEqual(subword_tokenize(None), [])
        self.assertEqual(subword_tokenize(""), [])
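A stricter assertion one could append to test_paragraph_tokenize (hypothetical, not part of this commit) would also pin down the documented return structure:

        # Hypothetical extension: the result should be a list of
        # paragraphs, each of which is a list of sentence strings.
        result = paragraph_tokenize(sent)
        self.assertIsInstance(result, list)
        for paragraph in result:
            self.assertIsInstance(paragraph, list)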
