From 3a34d1f6c727ec03e4fdfe3ca5939711fb69c072 Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Wed, 21 Jun 2023 15:17:37 +0700 Subject: [PATCH] Add pythainlp.soundex.sound --- docker_requirements.txt | 1 + docs/api/soundex.rst | 3 ++ pythainlp/soundex/sound.py | 66 ++++++++++++++++++++++++++++++++++++++ setup.py | 4 +++ tests/test_soundex.py | 7 ++++ 5 files changed, 81 insertions(+) create mode 100644 pythainlp/soundex/sound.py diff --git a/docker_requirements.txt b/docker_requirements.txt index 72fe9e02e..4cd8b63f9 100644 --- a/docker_requirements.txt +++ b/docker_requirements.txt @@ -36,3 +36,4 @@ esupar==1.3.8 ufal.chu-liu-edmonds==1.0.2 wtpsplit==1.0.1 fastcoref==2.1.6 +panphon==0.20.0 diff --git a/docs/api/soundex.rst b/docs/api/soundex.rst index 3c8915f24..139fadd02 100644 --- a/docs/api/soundex.rst +++ b/docs/api/soundex.rst @@ -12,6 +12,9 @@ Modules .. autofunction:: udom83 .. autofunction:: metasound .. autofunction:: prayut_and_somchaip +.. autofunction:: pythainlp.soundex.sound.word_approximation +.. autofunction:: pythainlp.soundex.sound.audio_vector +.. autofunction:: pythainlp.soundex.sound.word2audio References ---------- diff --git a/pythainlp/soundex/sound.py b/pythainlp/soundex/sound.py new file mode 100644 index 000000000..f28dceeb6 --- /dev/null +++ b/pythainlp/soundex/sound.py @@ -0,0 +1,66 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2016-2023 PyThaiNLP Project +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from functools import lru_cache
from typing import List

# Tone letter sequences stripped by _clean_ipa, in removal order.
# Multi-letter sequences come first so that they are removed as a unit.
_TONE_MARKS = ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧")


@lru_cache(maxsize=None)
def _feature_table():
    """
    Return the shared panphon feature table, building it on first use.

    panphon is an optional dependency, so it is imported here instead of at
    module import time.

    :raises ImportError: if panphon is not installed
    """
    try:
        import panphon
    except ImportError as err:
        raise ImportError(
            "panphon is required for pythainlp.soundex.sound; "
            "install it with: pip install panphon"
        ) from err
    return panphon.FeatureTable()


@lru_cache(maxsize=None)
def _distance_calculator():
    """
    Return the shared panphon distance calculator, building it on first use.

    :raises ImportError: if panphon is not installed
    """
    try:
        import panphon.distance
    except ImportError as err:
        raise ImportError(
            "panphon is required for pythainlp.soundex.sound; "
            "install it with: pip install panphon"
        ) from err
    return panphon.distance.Distance()


def _clean_ipa(ipa: str) -> str:
    """
    Clean IPA text by removing tone letters and the spaces around dots.

    :param str ipa: IPA text
    :return: IPA text without tone letters, with no space before or after
             a ".", and with leading/trailing whitespace stripped
    :rtype: str
    """
    # Sequential replacement (not a single regex pass) so that the result
    # matches removing each tone sequence one after another.
    for mark in _TONE_MARKS:
        ipa = ipa.replace(mark, "")
    return ipa.replace(" .", ".").replace(". ", ".").strip()


def word2audio(word: str) -> str:
    """
    Convert a Thai word to IPA without tone letters.

    The text is tokenized into words, each word is respelled with the
    "w2p" pronunciation engine, transliterated to IPA with the "thaig2p"
    engine, and cleaned. The per-word results are joined with ".".

    :param str word: Thai word
    :return: IPA of the word with tone letters removed
    :rtype: str
    """
    # Imported lazily to keep importing this module light.
    from pythainlp.tokenize import word_tokenize
    from pythainlp.transliterate import pronunciate, transliterate

    tokens = word_tokenize(word)
    readings = [pronunciate(token, engine="w2p") for token in tokens]
    ipa_parts = [
        _clean_ipa(transliterate(reading, engine="thaig2p"))
        for reading in readings
    ]
    return ".".join(ipa_parts)


def audio_vector(word: str) -> List[List[int]]:
    """
    Convert a Thai word to a list of panphon feature vectors.

    The word is converted to IPA with :func:`word2audio` first, because
    the panphon feature table is looked up by IPA segments, not by Thai
    script.

    :param str word: Thai word
    :return: one numeric feature vector per IPA segment of the word
    :rtype: List[List[int]]
    :raises ImportError: if panphon is not installed
    """
    table = _feature_table()
    return table.word_to_vector_list(word2audio(word), numeric=True)


def word_approximation(word: str, list_word: List[str]) -> List[float]:
    """
    Thai word approximation.

    Measure how close the sound of each candidate word is to ``word`` using
    panphon's weighted feature edit distance on their IPA forms.

    :param str word: Thai word
    :param List[str] list_word: Thai words to compare against ``word``
    :return: distances in the same order as ``list_word``
             (the smaller the value, the closer)
    :rtype: List[float]
    :raises ImportError: if panphon is not installed
    """
    # Resolve panphon first so a missing dependency fails before any
    # transliteration work is done.
    distance = _distance_calculator()
    target = word2audio(word)
    return [
        distance.weighted_feature_edit_distance(target, word2audio(candidate))
        for candidate in list_word
    ]


# ---------------------------------------------------------------------------
# Remainder of the patch text that shared this source line. It belongs to
# other files (setup.py and tests/test_soundex.py), so it is kept here as
# comments, unchanged apart from restored line breaks.
#
# diff --git a/setup.py b/setup.py
# index 10ca6b107..e9e540732 100644
# --- a/setup.py
# +++ b/setup.py
# @@ -114,6 +114,9 @@
#  "spacy>=3.0",
#  "fastcoref>=2.1.5",
#  },
# + "word_approximation":{
# + "panphon>=0.20.0"
# + },
#  "full": [
#  "PyYAML>=5.3.1",
#  "attacut>=1.0.4",
# @@ -146,6 +149,7 @@
#  "spacy>=3.0",
#  "fastcoref>=2.1.5",
#  "ufal.chu-liu-edmonds>=1.0.2",
# + "panphon>=0.20.0",
#  ],
#  }
# diff --git a/tests/test_soundex.py b/tests/test_soundex.py
# index b012afc74..7bf00af7d 100644
# ---
a/tests/test_soundex.py +++ b/tests/test_soundex.py @@ -3,6 +3,7 @@ import unittest from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip +from pythainlp.soundex.sound import word_approximation, audio_vector class TestSoundexPackage(unittest.TestCase): @@ -73,3 +74,9 @@ def test_soundex(self): self.assertIsNotNone(prayut_and_somchaip("ณาญ")) self.assertIsNotNone(prayut_and_somchaip("กาง")) self.assertIsNotNone(prayut_and_somchaip("ว้าว")) + + def test_word_approximation(self): + self.assertIsNotNone(word_approximation("รถ", ["รส","รด","คน"])) + + def test_audio_vector(self): + self.assertIsNotNone(audio_vector("คน"))