Merge pull request #687 from PyThaiNLP/remove-deprecated-function
Remove deprecated function
wannaphong authored Aug 8, 2022
2 parents c6c8824 + 7ac06ff commit 6a88f6f
Showing 13 changed files with 5 additions and 473 deletions.
1 change: 0 additions & 1 deletion docs/api/tokenize.rst
@@ -11,7 +11,6 @@ Modules
.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: syllable_tokenize
.. autofunction:: word_tokenize
.. autoclass:: Tokenizer
:members:
1 change: 0 additions & 1 deletion pythainlp/__init__.py
@@ -48,7 +48,6 @@
Tokenizer,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)
from pythainlp.transliterate import romanize, transliterate
14 changes: 1 addition & 13 deletions pythainlp/cli/tokenize.py
@@ -12,7 +12,6 @@
DEFAULT_WORD_TOKENIZE_ENGINE,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)

@@ -79,15 +78,6 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)


class SyllableTokenizationApp(SubAppBase):
def __init__(self, *args, **kwargs):
self.keep_whitespace = True
self.algorithm = DEFAULT_SYLLABLE_TOKENIZE_ENGINE
self.separator = DEFAULT_SYLLABLE_TOKEN_SEPARATOR
self.run = syllable_tokenize
super().__init__(*args, **kwargs)


class SentenceTokenizationApp(SubAppBase):
def __init__(self, *args, **kwargs):
self.keep_whitespace = True
@@ -132,7 +122,7 @@ def __init__(self, argv):
),
)
parser.add_argument(
"token_type", type=str, help="[subword|syllable|word|sent]",
"token_type", type=str, help="[subword|word|sent]",
)

args = parser.parse_args(argv[2:3])
@@ -142,8 +132,6 @@
argv = argv[3:]
if token_type.startswith("w"):
WordTokenizationApp("word", argv)
elif token_type.startswith("sy"):
SyllableTokenizationApp("syllable", argv)
elif token_type.startswith("su"):
SubwordTokenizationApp("subword", argv)
elif token_type.startswith("se"):
2 changes: 0 additions & 2 deletions pythainlp/tokenize/__init__.py
@@ -10,7 +10,6 @@
"clause_tokenize",
"sent_tokenize",
"subword_tokenize",
"syllable_tokenize",
"word_tokenize",
]

@@ -31,7 +30,6 @@
clause_tokenize,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)

74 changes: 0 additions & 74 deletions pythainlp/tokenize/core.py
@@ -429,80 +429,6 @@ def subword_tokenize(
return segments


def syllable_tokenize(
text: str,
engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
keep_whitespace: bool = True,
) -> List[str]:
"""
Syllable tokenizer.
**syllable_tokenize is deprecated, use subword_tokenize instead**
Tokenizes text into syllable (Thai: พยางค์), a unit of
pronunciation having one vowel sound. For example, the word 'รถไฟ'
contains two syallbles including 'รถ', and 'ไฟ'.
Under the hood, this function uses :func:`pythainlp.tokenize.word_tokenize`
with *newmm* as a tokenizer. The function tokenize the text with
the dictionary of Thai words from
:func:`pythainlp.corpus.common.thai_words`
and then dictionary of Thai syllable from
:func:`pythainlp.corpus.common.thai_syllables`.
As a result, only syllables are obtained.
:param str text: input string to be tokenized
:param str engine: name of the syllable tokenizer
:return: list of syllables where whitespaces in the text **are included**
:rtype: list[str]
**Options for engine**
* *dict* (default) - newmm word tokenizer with a syllable dictionary
* *ssg* - CRF syllable segmenter for Thai
:Example::
::
from pythainlp.tokenize import syllable_tokenize
text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
syllable_tokenize(text)
['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
"""
warnings.warn(
"""syllable_tokenize will be deprecated in PyThaiNLP version 3.1,
use subword_tokenize instead""",
PendingDeprecationWarning
)

if not text or not isinstance(text, str):
return []

segments = []

if engine == "dict" or engine == "default": # use syllable dictionary
words = word_tokenize(text)
for word in words:
segments.extend(
word_tokenize(
text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
)
)
elif engine == "ssg":
from pythainlp.tokenize.ssg import segment

segments = segment(text)
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
It might be a typo; if not, please consult our document."""
)

if not keep_whitespace:
segments = [token.strip(" ") for token in segments if token.strip(" ")]

return segments


class Tokenizer:
"""
Tokenizer class, for a custom tokenizer.
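
Migration note (not part of the original diff): the removed docstring itself points callers to subword_tokenize. A minimal sketch of the replacement call, assuming the "dict" and "ssg" syllable engines are reachable through subword_tokenize as the removed deprecation notice suggests:

    from pythainlp.tokenize import subword_tokenize

    text = "รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า"

    # Dictionary-based syllable segmentation (formerly syllable_tokenize(text, engine="dict"))
    print(subword_tokenize(text, engine="dict"))

    # CRF-based syllable segmentation (formerly syllable_tokenize(text, engine="ssg"))
    print(subword_tokenize(text, engine="ssg"))
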
5 changes: 1 addition & 4 deletions pythainlp/util/__init__.py
@@ -9,7 +9,6 @@
"bahttext",
"collate",
"countthai",
"delete_tone",
"dict_trie",
"digit_to_text",
"display_thai_char",
@@ -35,7 +34,6 @@
"thai_digit_to_arabic_digit",
"thai_keyboard_dist",
"thai_strftime",
"thai_time",
"thai_to_eng",
"thai_word_tone_detector",
"thaiword_to_date",
@@ -72,7 +70,6 @@
from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
from pythainlp.util.keywords import find_keyword, rank
from pythainlp.util.normalize import (
delete_tone,
normalize,
maiyamok,
remove_dangling,
@@ -92,7 +89,7 @@
thai_word_tone_detector,
)
from pythainlp.util.thaiwordcheck import is_native_thai
from pythainlp.util.time import thai_time, thaiword_to_time, time_to_thaiword
from pythainlp.util.time import thaiword_to_time, time_to_thaiword
from pythainlp.util.trie import Trie, dict_trie
from pythainlp.util.wordtonum import thaiword_to_num, text_to_num, words_to_num
from pythainlp.util.syllable import (
15 changes: 2 additions & 13 deletions pythainlp/util/normalize.py
@@ -129,9 +129,9 @@ def remove_tonemark(text: str) -> str:
:Example:
::
from pythainlp.util import delete_tone
from pythainlp.util import remove_tonemark
delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
remove_tonemark('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
# output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
"""
for ch in tonemarks:
@@ -248,17 +248,6 @@ def normalize(text: str) -> str:
return text


def delete_tone(text: str) -> str:
"""
DEPRECATED: Please use remove_tonemark().
"""
warnings.warn(
"delete_tone is deprecated, use remove_tonemark instead",
DeprecationWarning,
)
return remove_tonemark(text)


def maiyamok(sent: Union[str, List[str]]) -> List[str]:
"""
Thai MaiYaMok
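
Migration note (not part of the original diff): delete_tone was a thin wrapper, so callers only need to switch to remove_tonemark. A minimal sketch reusing the example text from the docstring above:

    from pythainlp.util import remove_tonemark

    text = "สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด"
    print(remove_tonemark(text))
    # สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
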
15 changes: 0 additions & 15 deletions pythainlp/util/time.py
@@ -230,21 +230,6 @@ def time_to_thaiword(
return text


def thai_time(
time_data: Union[time, datetime, str],
fmt: str = "24h",
precision: Union[str, None] = None,
) -> str:
"""
DEPRECATED: Please use time_to_thaiword().
"""
warnings.warn(
"thai_time is deprecated, use time_to_thaiword instead",
DeprecationWarning,
)
return time_to_thaiword(time_data, fmt, precision)


def thaiword_to_time(text: str, padding: bool = True) -> str:
"""
Convert Thai time in words into time (H:M).
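
Migration note (not part of the original diff): the removed thai_time wrapper forwarded its arguments to time_to_thaiword unchanged, so existing calls only need the new name. A minimal sketch; the "6h" format value is an assumption about the available fmt options, not shown in this diff:

    import datetime

    from pythainlp.util import time_to_thaiword

    print(time_to_thaiword("10:30"))                          # from an "H:M" string
    print(time_to_thaiword(datetime.time(10, 30), fmt="6h"))  # from a datetime.time, six-hour clock
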
10 changes: 0 additions & 10 deletions pythainlp/word_vector/__init__.py
@@ -5,19 +5,9 @@
Initial code from https://github.com/cstorm125/thai2fit
"""
__all__ = [
"doesnt_match",
"get_model",
"most_similar_cosmul",
"sentence_vectorizer",
"similarity",
"WordVector",
]

from pythainlp.word_vector.core import (
doesnt_match,
get_model,
most_similar_cosmul,
sentence_vectorizer,
similarity,
WordVector,
)
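
Migration note (not part of the original diff): the module now exports only the WordVector class. A minimal sketch assuming the class exposes similarity, doesnt_match, and sentence_vectorizer methods mirroring the removed module-level functions, and that the default embedding model can be downloaded:

    from pythainlp.word_vector import WordVector

    wv = WordVector()  # loads the default word-embedding model

    print(wv.similarity("รถไฟ", "รถยนต์"))                 # cosine similarity of two words
    print(wv.doesnt_match(["รถไฟ", "รถยนต์", "ส้ม"]))       # pick the word that does not belong
    vector = wv.sentence_vectorizer("รถไฟฟ้าสายสีเขียว")   # sentence embedding
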
(remaining 4 changed files not shown)
