Add 🪿 Han-solo
🪿 Han-solo: Thai syllable segmenter

This work adds a Thai syllable segmenter that can handle text from the Thai social media domain.

GitHub: https://github.com/PyThaiNLP/Han-solo
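
For context, a minimal usage sketch (not part of the commit message) of the new engine through PyThaiNLP's subword_tokenize API, which this commit wires up; the sample sentence and its expected split are the ones asserted in the tests below.

from pythainlp.tokenize import subword_tokenize

# The han_solo engine needs python-crfsuite (pip install python-crfsuite).
print(subword_tokenize("แมวกินปลา", engine="han_solo"))
# expected, per tests/test_tokenize.py: ['แมว', 'กิน', 'ปลา']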
wannaphong committed Jul 30, 2023
1 parent cc9139c commit 10dcbac
Showing 4 changed files with 159 additions and 2 deletions.
Binary file added pythainlp/corpus/han_solo.crfsuite
11 changes: 9 additions & 2 deletions pythainlp/tokenize/core.py
@@ -530,10 +530,15 @@ def subword_tokenize(
**Options for engine**
* *dict* - newmm word tokenizer with a syllable dictionary
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *ssg* - CRF syllable segmenter for Thai
* *han_solo* - CRF syllable segmenter for Thai that can work in the \
Thai social media domain. See `PyThaiNLP/Han-solo \
<https://github.com/PyThaiNLP/Han-solo>`_.
* *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
<https://github.com/ponrawee/ssg>`_.
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *tcc_p* - Thai Character Cluster with the improved rules used in newmm
* *tltk* - syllable tokenizer from tltk
* *tltk* - syllable tokenizer from tltk. See `tltk \
<https://pypi.org/project/tltk/>`_.
* *wangchanberta* - SentencePiece from wangchanberta model
:Example:
@@ -600,6 +605,8 @@ def subword_tokenize(
from pythainlp.tokenize.ssg import segment
elif engine == "tltk":
from pythainlp.tokenize.tltk import syllable_tokenize as segment
elif engine == "han_solo":
from pythainlp.tokenize.han_solo import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
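
With the dispatch above in place, the syllable engines listed in the docstring can be swapped through the engine argument. A small sketch, not part of the commit; ssg needs the optional ssg package and han_solo needs python-crfsuite.

from pythainlp.tokenize import subword_tokenize

text = "สวัสดีชาวโลก"
# Compare syllable-level engines; han_solo is the engine added by this commit.
for engine in ("tcc", "ssg", "han_solo"):
    print(engine, subword_tokenize(text, engine=engine))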
140 changes: 140 additions & 0 deletions pythainlp/tokenize/han_solo.py
@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

from pythainlp.corpus import path_pythainlp_corpus

try:
    import pycrfsuite
except ImportError:
    raise ImportError(
        "pycrfsuite not found; install it with: pip install python-crfsuite"
    )

# Load the pretrained Han-solo CRF model shipped with the PyThaiNLP corpus
# (pythainlp/corpus/han_solo.crfsuite, added in this commit). The tagger is
# opened once at import time and reused by segment().
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus('han_solo.crfsuite'))


class Featurizer:
    # This class is adapted from ssg at https://github.com/ponrawee/ssg.
    # Copyright 2019 Ponrawee Prasertsom

    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at

    # http://www.apache.org/licenses/LICENSE-2.0

    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.

    # Feature keys have the form "<position relative to the anchor>|<character or n-gram>",
    # e.g. "1|A" means the character one position to the right of the anchor is "A".
    # With return_type='dict' each key maps to 1; with return_type='list' the keys
    # themselves are collected.

    def __init__(self, N=2, sequence_size=1, delimiter=None):
        self.N = N
        self.delimiter = delimiter
        self.radius = N + sequence_size

    def pad(self, sentence, padder='#'):
        return padder * self.radius + sentence + padder * self.radius

    def featurize(self, sentence, padding=True, indiv_char=True, return_type='list'):
        if padding:
            sentence = self.pad(sentence)
        all_features = []
        all_labels = []
        skip_next = False
        for current_position in range(self.radius, len(sentence) - self.radius + 1):
            if skip_next:
                skip_next = False
                continue
            features = {}
            if return_type == 'list':
                features = []
            cut = 0
            char = sentence[current_position]
            if char == self.delimiter:
                cut = 1
                skip_next = True
            counter = 0
            chars_left = ''
            chars_right = ''
            chars = ''
            abs_index_left = current_position  # left starts at -1
            abs_index_right = current_position - 1  # right starts at 0
            while counter < self.radius:
                # relative to position 0 this walks -1, -2, -3, ... (out to -radius)
                abs_index_left -= 1
                char_left = sentence[abs_index_left]
                while char_left == self.delimiter:
                    abs_index_left -= 1
                    char_left = sentence[abs_index_left]
                relative_index_left = -counter - 1
                # collect the character
                chars_left = char_left + chars_left
                # add it as a feature
                if indiv_char:
                    left_key = '|'.join([str(relative_index_left), char_left])
                    if return_type == 'dict':
                        features[left_key] = 1
                    else:
                        features.append(left_key)

                # relative to position 0 this walks 0, 1, 2, ... (out to radius - 1)
                abs_index_right += 1
                char_right = sentence[abs_index_right]
                while char_right == self.delimiter:
                    abs_index_right += 1
                    char_right = sentence[abs_index_right]
                relative_index_right = counter
                chars_right += char_right
                if indiv_char:
                    right_key = '|'.join([str(relative_index_right), char_right])
                    if return_type == 'dict':
                        features[right_key] = 1
                    else:
                        features.append(right_key)

                counter += 1

            chars = chars_left + chars_right
            for i in range(0, len(chars) - self.N + 1):
                ngram = chars[i:i + self.N]
                ngram_key = '|'.join([str(i - self.radius), ngram])
                if return_type == 'dict':
                    features[ngram_key] = 1
                else:
                    features.append(ngram_key)
            all_features.append(features)
            if return_type == 'list':
                cut = str(cut)
            all_labels.append(cut)

        return {
            'X': all_features,
            'Y': all_labels
        }


_to_feature = Featurizer()


def segment(text: str) -> List[str]:
    """
    Syllable segmentation with the Han-solo CRF model.

    :param str text: Thai text to be segmented
    :return: list of syllables
    :rtype: List[str]
    """
    x = _to_feature.featurize(text)['X']
    y_pred = tagger.tag(x)
    list_cut = []
    for char, label in zip(text, y_pred):
        # label "1" marks the start of a new syllable; the first character
        # always starts one, so we never index into an empty list.
        if label == '1' or not list_cut:
            list_cut.append(char)
        else:
            list_cut[-1] += char
    return list_cut
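
The module-level segment function can also be called directly; a brief sketch, with the expected behaviour taken from the assertions added to tests/test_tokenize.py below.

from pythainlp.tokenize.han_solo import segment

print(segment("สวัสดีดาวอังคาร"))
# per the tests, "ดาว" comes out as a single syllable and a bare "า" never does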
10 changes: 10 additions & 0 deletions tests/test_tokenize.py
@@ -399,6 +399,7 @@ def test_subword_tokenize(self):
"āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļŠāļēāļ§āđ‚āļĨāļ", engine="dict")
)
self.assertEqual(subword_tokenize(None, engine="ssg"), [])
self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
self.assertEqual(
subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
)
@@ -408,6 +409,15 @@ def test_subword_tokenize(self):
self.assertFalse(
"āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="ssg")
)
self.assertEqual(
subword_tokenize("แมวกินปลา", engine="han_solo"), ["แมว", "กิน", "ปลา"]
)
self.assertTrue(
"āļ”āļēāļ§" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="han_solo")
)
self.assertFalse(
"āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="han_solo")
)
self.assertFalse(
" " in subword_tokenize("āļžāļąāļ™āļ˜āļĄāļīāļ•āļĢ āļŠāļē āļ™āļĄ", keep_whitespace=False)
)
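
Assuming the project's usual unittest layout, the new assertions can be exercised with something like python -m unittest discover -s tests -p test_tokenize.py, or with whatever test runner the repository normally uses.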
