Skip to content

Commit

Permalink
Add pythainlp.soundex.sound
Browse files Browse the repository at this point in the history
  • Loading branch information
wannaphong committed Jun 21, 2023
1 parent 61c6b8e commit 3a34d1f
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 0 deletions.
1 change: 1 addition & 0 deletions docker_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ esupar==1.3.8
ufal.chu-liu-edmonds==1.0.2
wtpsplit==1.0.1
fastcoref==2.1.6
panphon==0.20.0
3 changes: 3 additions & 0 deletions docs/api/soundex.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ Modules
.. autofunction:: udom83
.. autofunction:: metasound
.. autofunction:: prayut_and_somchaip
.. autofunction:: pythainlp.soundex.sound.word_approximation
.. autofunction:: pythainlp.soundex.sound.audio_vector
.. autofunction:: pythainlp.soundex.sound.word2audio

References
----------
Expand Down
66 changes: 66 additions & 0 deletions pythainlp/soundex/sound.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from pythainlp.transliterate import pronunciate, transliterate
from pythainlp.tokenize import word_tokenize

import panphon
import panphon.distance

# Module-level panphon feature table, created once at import time and
# used by audio_vector() to turn text into feature vectors.
_ft = panphon.FeatureTable()
# Module-level panphon distance calculator, created once at import time and
# used by word_approximation() to compare two IPA strings.
_dst = panphon.distance.Distance()

def _clean_ipa(ipa: str) -> str:
"""
Clean IPA by remove tone and remove space between phone
:param str ipa: IPA text
:return: IPA that remove tone from the text
:rtype: str
"""
return ipa.replace("˩˩˦","").replace("˥˩","").replace("˨˩","").replace("˦˥","").replace("˧","").replace("˧","").replace(" .",".").replace(". ",".").strip()

def word2audio(word: str) -> str:
    """
    Convert a Thai word to IPA with the tones removed.

    The input is split with ``word_tokenize``; each token is passed through
    ``pronunciate`` (engine "w2p") and then ``transliterate`` (engine
    "thaig2p"), cleaned with ``_clean_ipa``, and the pieces are joined
    with ".".

    :param str word: Thai word
    :return: IPA of the word with the tones removed
    :rtype: str
    """
    # Phase 1: pronunciation of every token.
    _phones = []
    for _token in word_tokenize(word):
        _phones.append(pronunciate(_token, engine="w2p"))
    # Phase 2: transliterate each pronunciation and strip its tone marks.
    _ipa_parts = [
        _clean_ipa(transliterate(_phone, engine="thaig2p"))
        for _phone in _phones
    ]
    return ".".join(_ipa_parts)

def audio_vector(word: str) -> List[List[int]]:
    """
    Convert audio text to a list of feature vectors.

    This is a thin wrapper around the module-level panphon feature table's
    ``word_to_vector_list`` called with ``numeric=True``.

    :param str word: text to convert (presumably IPA, such as the output
        of ``word2audio`` -- confirm against callers)
    :return: whatever ``word_to_vector_list(word, numeric=True)`` returns
    :rtype: List[List[int]]
    """
    _vectors = _ft.word_to_vector_list(word, numeric=True)
    return _vectors

def word_approximation(word: str, list_word: List[str]) -> List[float]:
    """
    Thai Word Approximation

    Convert ``word`` and every entry of ``list_word`` to toneless IPA with
    ``word2audio`` and measure the weighted feature edit distance between
    ``word`` and each entry.

    :param str word: Thai word
    :param List[str] list_word: Thai words to compare against ``word``
    :return: one distance per entry of ``list_word``, in the same order
        (the smaller the value, the closer the pronunciation)
    :rtype: List[float]
    """
    _word = word2audio(word)
    _list_word = [word2audio(w) for w in list_word]
    _distance = [
        _dst.weighted_feature_edit_distance(_word, w) for w in _list_word
    ]
    return _distance
4 changes: 4 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,9 @@
"spacy>=3.0",
"fastcoref>=2.1.5",
},
"word_approximation":{
"panphon>=0.20.0"
},
"full": [
"PyYAML>=5.3.1",
"attacut>=1.0.4",
Expand Down Expand Up @@ -146,6 +149,7 @@
"spacy>=3.0",
"fastcoref>=2.1.5",
"ufal.chu-liu-edmonds>=1.0.2",
"panphon>=0.20.0",
],
}

Expand Down
7 changes: 7 additions & 0 deletions tests/test_soundex.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest

from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip
from pythainlp.soundex.sound import word_approximation, audio_vector


class TestSoundexPackage(unittest.TestCase):
Expand Down Expand Up @@ -73,3 +74,9 @@ def test_soundex(self):
self.assertIsNotNone(prayut_and_somchaip("ณาญ"))
self.assertIsNotNone(prayut_and_somchaip("กาง"))
self.assertIsNotNone(prayut_and_somchaip("ว้าว"))

def test_word_approximation(self):
    # word_approximation returns one distance per candidate word, so check
    # the length as well as the mere presence of a result.
    candidates = ["รส", "รด", "คน"]
    distances = word_approximation("รถ", candidates)
    self.assertIsNotNone(distances)
    self.assertEqual(len(distances), len(candidates))

def test_audio_vector(self):
    # Smoke test: audio_vector must return something for a Thai word.
    vector = audio_vector("คน")
    self.assertIsNotNone(vector)

0 comments on commit 3a34d1f

Please sign in to comment.