Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove deprecated function #687

Merged
merged 10 commits into from
Aug 8, 2022
1 change: 0 additions & 1 deletion docs/api/tokenize.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ Modules
.. autofunction:: clause_tokenize
.. autofunction:: sent_tokenize
.. autofunction:: subword_tokenize
.. autofunction:: syllable_tokenize
.. autofunction:: word_tokenize
.. autoclass:: Tokenizer
:members:
Expand Down
1 change: 0 additions & 1 deletion pythainlp/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@
Tokenizer,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)
from pythainlp.transliterate import romanize, transliterate
Expand Down
14 changes: 1 addition & 13 deletions pythainlp/cli/tokenize.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
DEFAULT_WORD_TOKENIZE_ENGINE,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)

Expand Down Expand Up @@ -79,15 +78,6 @@ def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)


class SyllableTokenizationApp(SubAppBase):
    """CLI sub-app that splits input text into syllables."""

    def __init__(self, *args, **kwargs):
        # Set the syllable-specific defaults, then hand control to the
        # shared SubAppBase machinery (assignment order is immaterial).
        self.run = syllable_tokenize
        self.separator = DEFAULT_SYLLABLE_TOKEN_SEPARATOR
        self.algorithm = DEFAULT_SYLLABLE_TOKENIZE_ENGINE
        self.keep_whitespace = True
        super().__init__(*args, **kwargs)


class SentenceTokenizationApp(SubAppBase):
def __init__(self, *args, **kwargs):
self.keep_whitespace = True
Expand Down Expand Up @@ -132,7 +122,7 @@ def __init__(self, argv):
),
)
parser.add_argument(
"token_type", type=str, help="[subword|syllable|word|sent]",
"token_type", type=str, help="[subword|word|sent]",
)

args = parser.parse_args(argv[2:3])
Expand All @@ -142,8 +132,6 @@ def __init__(self, argv):
argv = argv[3:]
if token_type.startswith("w"):
WordTokenizationApp("word", argv)
elif token_type.startswith("sy"):
SyllableTokenizationApp("syllable", argv)
elif token_type.startswith("su"):
SubwordTokenizationApp("subword", argv)
elif token_type.startswith("se"):
Expand Down
2 changes: 0 additions & 2 deletions pythainlp/tokenize/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
"clause_tokenize",
"sent_tokenize",
"subword_tokenize",
"syllable_tokenize",
"word_tokenize",
]

Expand All @@ -31,7 +30,6 @@
clause_tokenize,
sent_tokenize,
subword_tokenize,
syllable_tokenize,
word_tokenize,
)

Expand Down
74 changes: 0 additions & 74 deletions pythainlp/tokenize/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,80 +422,6 @@ def subword_tokenize(
return segments


def syllable_tokenize(
    text: str,
    engine: str = DEFAULT_SYLLABLE_TOKENIZE_ENGINE,
    keep_whitespace: bool = True,
) -> List[str]:
    """
    Syllable tokenizer.

    **syllable_tokenize is deprecated, use subword_tokenize instead**

    Tokenizes text into syllables (Thai: พยางค์), a unit of
    pronunciation having one vowel sound. For example, the word 'รถไฟ'
    contains two syllables: 'รถ' and 'ไฟ'.

    Under the hood, this function uses
    :func:`pythainlp.tokenize.word_tokenize` with *newmm*:
    it first tokenizes with the dictionary of Thai words from
    :func:`pythainlp.corpus.common.thai_words`, then re-tokenizes each
    word with the syllable dictionary from
    :func:`pythainlp.corpus.common.thai_syllables`,
    so that only syllables are obtained.

    :param str text: input string to be tokenized
    :param str engine: name of the syllable tokenizer
    :param bool keep_whitespace: keep whitespace tokens in the output
    :return: list of syllables where whitespaces in the text **are included**
    :rtype: list[str]
    **Options for engine**
        * *dict* (default) - newmm word tokenizer with a syllable dictionary
        * *ssg* - CRF syllable segmenter for Thai
    :Example::
    ::

        from pythainlp.tokenize import syllable_tokenize

        text = 'รถไฟสมัยใหม่จะใช้กำลังจากหัวรถจักรดีเซล หรือจากไฟฟ้า'
        syllable_tokenize(text)
        ['รถ', 'ไฟ', 'สมัย', 'ใหม่', 'ใช้', 'กำ', 'ลัง', 'จาก', 'หัว',
         'รถ', 'จักร', 'ดี', 'เซล', ' ', 'หรือ', 'จาก', 'ไฟ', 'ฟ้า']
    """
    warnings.warn(
        """syllable_tokenize will be deprecated in PyThaiNLP version 3.1,
        use subword_tokenize instead""",
        PendingDeprecationWarning
    )

    # Guard: non-string or empty input produces an empty token list.
    if not isinstance(text, str) or not text:
        return []

    tokens: List[str] = []

    if engine in ("dict", "default"):  # newmm with a syllable dictionary
        # Two-pass split: words first, then each word into syllables.
        for word in word_tokenize(text):
            tokens.extend(
                word_tokenize(
                    text=word, custom_dict=DEFAULT_SYLLABLE_DICT_TRIE
                )
            )
    elif engine == "ssg":
        from pythainlp.tokenize.ssg import segment

        tokens = segment(text)
    else:
        raise ValueError(
            f"""Tokenizer \"{engine}\" not found.
            It might be a typo; if not, please consult our document."""
        )

    if not keep_whitespace:
        tokens = [tok.strip(" ") for tok in tokens if tok.strip(" ")]

    return tokens


class Tokenizer:
"""
Tokenizer class, for a custom tokenizer.
Expand Down
5 changes: 1 addition & 4 deletions pythainlp/util/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
"bahttext",
"collate",
"countthai",
"delete_tone",
"dict_trie",
"digit_to_text",
"display_thai_char",
Expand All @@ -35,7 +34,6 @@
"thai_digit_to_arabic_digit",
"thai_keyboard_dist",
"thai_strftime",
"thai_time",
"thai_to_eng",
"thaiword_to_date",
"thaiword_to_num",
Expand Down Expand Up @@ -68,7 +66,6 @@
from pythainlp.util.keyboard import eng_to_thai, thai_to_eng
from pythainlp.util.keywords import find_keyword, rank
from pythainlp.util.normalize import (
delete_tone,
normalize,
maiyamok,
remove_dangling,
Expand All @@ -87,7 +84,7 @@
isthaichar,
)
from pythainlp.util.thaiwordcheck import is_native_thai
from pythainlp.util.time import thai_time, thaiword_to_time, time_to_thaiword
from pythainlp.util.time import thaiword_to_time, time_to_thaiword
from pythainlp.util.trie import Trie, dict_trie
from pythainlp.util.wordtonum import thaiword_to_num, text_to_num, words_to_num
from pythainlp.util.syllable import sound_syllable
15 changes: 2 additions & 13 deletions pythainlp/util/normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ def remove_tonemark(text: str) -> str:
:Example:
::

from pythainlp.util import delete_tone
from pythainlp.util import remove_tonemark

delete_tone('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
remove_tonemark('สองพันหนึ่งร้อยสี่สิบเจ็ดล้านสี่แสนแปดหมื่นสามพันหกร้อยสี่สิบเจ็ด')
# output: สองพันหนึงรอยสีสิบเจ็ดลานสีแสนแปดหมืนสามพันหกรอยสีสิบเจ็ด
"""
for ch in tonemarks:
Expand Down Expand Up @@ -248,17 +248,6 @@ def normalize(text: str) -> str:
return text


def delete_tone(text: str) -> str:
    """
    Remove Thai tone marks from *text*.

    DEPRECATED: Please use remove_tonemark().
    This backward-compatibility alias emits a DeprecationWarning and
    delegates directly to :func:`remove_tonemark`.
    """
    warnings.warn(
        "delete_tone is deprecated, use remove_tonemark instead",
        DeprecationWarning,
    )
    result = remove_tonemark(text)
    return result


def maiyamok(sent: Union[str, List[str]]) -> List[str]:
"""
Thai MaiYaMok
Expand Down
15 changes: 0 additions & 15 deletions pythainlp/util/time.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,21 +230,6 @@ def time_to_thaiword(
return text


def thai_time(
    time_data: Union[time, datetime, str],
    fmt: str = "24h",
    precision: Union[str, None] = None,
) -> str:
    """
    Spell out a time value in Thai words.

    DEPRECATED: Please use time_to_thaiword().
    This backward-compatibility alias emits a DeprecationWarning and
    forwards all arguments, unchanged, to :func:`time_to_thaiword`.
    """
    warnings.warn(
        "thai_time is deprecated, use time_to_thaiword instead",
        DeprecationWarning,
    )
    spelled = time_to_thaiword(time_data, fmt, precision)
    return spelled


def thaiword_to_time(text: str, padding: bool = True) -> str:
"""
Convert Thai time in words into time (H:M).
Expand Down
10 changes: 0 additions & 10 deletions pythainlp/word_vector/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,9 @@
Initial code from https://github.com/cstorm125/thai2fit
"""
__all__ = [
"doesnt_match",
"get_model",
"most_similar_cosmul",
"sentence_vectorizer",
"similarity",
"WordVector",
]

from pythainlp.word_vector.core import (
doesnt_match,
get_model,
most_similar_cosmul,
sentence_vectorizer,
similarity,
WordVector,
)
Loading