From 3a34d1f6c727ec03e4fdfe3ca5939711fb69c072 Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun <wannaphong@yahoo.com>
Date: Wed, 21 Jun 2023 15:17:37 +0700
Subject: [PATCH] Add pythainlp.soundex.sound

---
 docker_requirements.txt    |  1 +
 docs/api/soundex.rst       |  3 ++
 pythainlp/soundex/sound.py | 66 ++++++++++++++++++++++++++++++++++++++
 setup.py                   |  4 +++
 tests/test_soundex.py      |  7 ++++
 5 files changed, 81 insertions(+)
 create mode 100644 pythainlp/soundex/sound.py

diff --git a/docker_requirements.txt b/docker_requirements.txt
index 72fe9e02e..4cd8b63f9 100644
--- a/docker_requirements.txt
+++ b/docker_requirements.txt
@@ -36,3 +36,4 @@ esupar==1.3.8
 ufal.chu-liu-edmonds==1.0.2
 wtpsplit==1.0.1
 fastcoref==2.1.6
+panphon==0.20.0
diff --git a/docs/api/soundex.rst b/docs/api/soundex.rst
index 3c8915f24..139fadd02 100644
--- a/docs/api/soundex.rst
+++ b/docs/api/soundex.rst
@@ -12,6 +12,9 @@ Modules
 .. autofunction:: udom83
 .. autofunction:: metasound
 .. autofunction:: prayut_and_somchaip
+.. autofunction:: pythainlp.soundex.sound.word_approximation
+.. autofunction:: pythainlp.soundex.sound.audio_vector
+.. autofunction:: pythainlp.soundex.sound.word2audio
 
 References
 ----------
diff --git a/pythainlp/soundex/sound.py b/pythainlp/soundex/sound.py
new file mode 100644
index 000000000..f28dceeb6
--- /dev/null
+++ b/pythainlp/soundex/sound.py
@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# Copyright (C) 2016-2023 PyThaiNLP Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+from pythainlp.transliterate import pronunciate, transliterate
+from pythainlp.tokenize import word_tokenize
+
+import panphon
+import panphon.distance
+
+_ft = panphon.FeatureTable()  # module-level table, used by audio_vector
+_dst = panphon.distance.Distance()  # module-level, used by word_approximation
+
+def _clean_ipa(ipa: str) -> str:
+    """Remove tone marks and the spaces next to "." separators from IPA text.
+
+    :param str ipa: IPA text
+    :return: IPA text with the tone marks removed
+    """
+    for tone in ("˩˩˦", "˥˩", "˨˩", "˦˥", "˧"):
+        ipa = ipa.replace(tone, "")
+    return ipa.replace(" .", ".").replace(". ", ".").strip()
+
+def word2audio(word: str) -> str:
+    """
+    Convert a Thai word to IPA with the tone marks removed
+
+    :param str word: Thai word
+    :return: cleaned IPA of each token of the word, joined with "."
+    :rtype: str
+    """
+    phones = [pronunciate(tok, engine="w2p") for tok in word_tokenize(word)]
+    return ".".join(
+        _clean_ipa(transliterate(ph, engine="thaig2p")) for ph in phones
+    )
+
+def audio_vector(word: str) -> List[List[int]]:
+    """Convert a word to panphon's list of numeric feature vectors."""
+    # NOTE(review): panphon presumably expects IPA text here, e.g. the
+    # output of word2audio -- confirm before passing raw Thai script.
+    return _ft.word_to_vector_list(word, numeric=True)
+
+def word_approximation(word: str, list_word: List[str]) -> List[float]:
+    """
+    Thai Word Approximation
+
+    :param str word: Thai word
+    :param List[str] list_word: Thai words to compare with ``word``
+    :return: Distance from ``word`` to each item of ``list_word``, in the
+        same order (The smaller the value, the closer)
+    :rtype: List[float]
+    """
+    ipa = word2audio(word)
+    candidates = [word2audio(w) for w in list_word]
+    return [_dst.weighted_feature_edit_distance(ipa, c) for c in candidates]
diff --git a/setup.py b/setup.py
index 10ca6b107..e9e540732 100644
--- a/setup.py
+++ b/setup.py
@@ -114,6 +114,9 @@
         "spacy>=3.0",
         "fastcoref>=2.1.5",
     },
+    "word_approximation":{
+        "panphon>=0.20.0"
+    },
     "full": [
         "PyYAML>=5.3.1",
         "attacut>=1.0.4",
@@ -146,6 +149,7 @@
         "spacy>=3.0",
         "fastcoref>=2.1.5",
         "ufal.chu-liu-edmonds>=1.0.2",
+        "panphon>=0.20.0",
     ],
 }
 
diff --git a/tests/test_soundex.py b/tests/test_soundex.py
index b012afc74..7bf00af7d 100644
--- a/tests/test_soundex.py
+++ b/tests/test_soundex.py
@@ -3,6 +3,7 @@
 import unittest
 
 from pythainlp.soundex import lk82, metasound, soundex, udom83, prayut_and_somchaip
+from pythainlp.soundex.sound import word_approximation, audio_vector
 
 
 class TestSoundexPackage(unittest.TestCase):
@@ -73,3 +74,9 @@ def test_soundex(self):
         self.assertIsNotNone(prayut_and_somchaip("ณาญ"))
         self.assertIsNotNone(prayut_and_somchaip("กาง"))
         self.assertIsNotNone(prayut_and_somchaip("ว้าว"))
+
+    def test_word_approximation(self):  # expects one distance per candidate
+        self.assertEqual(len(word_approximation("รถ", ["รส", "รด", "คน"])), 3)
+
+    def test_audio_vector(self):  # smoke test: any non-None result passes
+        self.assertIsNot(audio_vector("คน"), None)