Skip to content

Commit

Permalink
Merge pull request #890 from PyThaiNLP/add-find_synonym
Browse files Browse the repository at this point in the history
Add pythainlp.corpus.find_synonyms
  • Loading branch information
bact committed Dec 12, 2023
2 parents 3b6daf0 + d9aa851 commit fcef21c
Show file tree
Hide file tree
Showing 4 changed files with 52 additions and 0 deletions.
5 changes: 5 additions & 0 deletions docs/api/corpus.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@ countries
.. autofunction:: countries
:noindex:

find_synonyms
~~~~~~~~~~~~~
.. autofunction:: find_synonyms
:noindex:

get_corpus
~~~~~~~~~~
.. autofunction:: get_corpus
Expand Down
2 changes: 2 additions & 0 deletions pythainlp/corpus/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
"corpus_path",
"countries",
"download",
"find_synonyms",
"get_corpus",
"get_corpus_as_is",
"get_corpus_db",
Expand Down Expand Up @@ -101,6 +102,7 @@ def corpus_db_path() -> str:
) # these imports must come before other pythainlp.corpus.* imports
from pythainlp.corpus.common import (
countries,
find_synonyms,
provinces,
thai_dict,
thai_family_names,
Expand Down
37 changes: 37 additions & 0 deletions pythainlp/corpus/common.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
# -*- coding: utf-8 -*-
# SPDX-FileCopyrightText: Copyright 2016-2023 PyThaiNLP Project
# SPDX-License-Identifier: Apache-2.0

"""
Common lists of words.
"""

__all__ = [
"countries",
"find_synonyms",
"provinces",
"thai_family_names",
"thai_female_names",
Expand Down Expand Up @@ -336,3 +338,38 @@ def thai_synonyms() -> dict:
def thai_synonym() -> dict:
    """
    Deprecated alias of :func:`thai_synonyms`.

    Emits a :class:`DeprecationWarning` and then returns the result of
    ``thai_synonyms()`` unchanged.

    :return: whatever ``thai_synonyms()`` returns
    :rtype: dict
    """
    # stacklevel=2 attributes the warning to the code that called this
    # deprecated alias, not to this wrapper, so users can locate the call.
    warnings.warn(
        "Deprecated: Use thai_synonyms() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    return thai_synonyms()


def find_synonyms(word: str) -> List[str]:
    """
    Find synonyms of a word in the Thai synonym corpus.

    Every corpus entry in which the input word appears, either as the
    headword or as one of the listed synonyms, contributes its headword
    and all of its synonyms to the result. The input word itself is
    never part of the result.

    :param str word: Thai word
    :return: Sorted list of unique synonyms of the input word,
             or an empty list if the word is not found.
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.corpus import find_synonyms

        print(find_synonyms("หมู"))
        # output: ['จรุก', 'วราหะ', 'วราห์', 'ศูกร', 'สุกร']
    """
    # The "word" and "synonym" lists are read as parallel sequences:
    # synonyms["synonym"][i] holds the synonyms of synonyms["word"][i].
    synonyms = thai_synonyms()
    found = set()

    # A single pass covers both lookups (by headword and by synonym) and,
    # unlike list.index(), does not stop at the first matching headword.
    for headword, entry in zip(synonyms["word"], synonyms["synonym"]):
        if word == headword or word in entry:
            found.add(headword)
            found.update(entry)

    found.discard(word)  # the word is not a synonym of itself

    return sorted(found)
8 changes: 8 additions & 0 deletions tests/test_corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
conceptnet,
countries,
download,
find_synonyms,
get_corpus_db,
get_corpus_db_detail,
get_corpus_default_db,
Expand Down Expand Up @@ -204,3 +205,10 @@ def test_zip(self):
p = get_corpus_path("test_zip")
self.assertEqual(os.path.isdir(p), True)
self.assertEqual(remove("test_zip"), True)

def test_find_synonyms(self):
    """Check find_synonyms() for a known word and for an unknown token."""
    # A known word returns its synonyms as a sorted list.
    expected_for_pig = ["จรุก", "วราหะ", "วราห์", "ศูกร", "สุกร"]
    actual_for_pig = find_synonyms("หมู")
    self.assertEqual(actual_for_pig, expected_for_pig)

    # A token with no corpus entry returns an empty list.
    actual_for_unknown = find_synonyms("1")
    self.assertEqual(actual_for_unknown, [])

0 comments on commit fcef21c

Please sign in to comment.