Add 🪿 Han-solo
🪿 Han-solo: Thai syllable segmenter

This work adds a Thai syllable segmenter that can handle text from the Thai social media domain.

GitHub: https://github.com/PyThaiNLP/Han-solo
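
For context, a minimal usage sketch (not part of the commit message) of the new engine through PyThaiNLP's subword_tokenize API, which this commit wires up; the sample sentence and its expected split are the ones asserted in the tests below.

from pythainlp.tokenize import subword_tokenize

# The han_solo engine needs python-crfsuite (pip install python-crfsuite).
print(subword_tokenize("แมวกินปลา", engine="han_solo"))
# expected, per tests/test_tokenize.py: ['แมว', 'กิน', 'ปลา']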
wannaphong committed Jul 30, 2023
1 parent cc9139c commit 10dcbac
Showing 4 changed files with 159 additions and 2 deletions.
Binary file added pythainlp/corpus/han_solo.crfsuite
11 changes: 9 additions & 2 deletions pythainlp/tokenize/core.py
@@ -530,10 +530,15 @@ def subword_tokenize(
**Options for engine**
* *dict* - newmm word tokenizer with a syllable dictionary
* *etcc* - Enhanced Thai Character Cluster (Inrut et al. 2001)
* *ssg* - CRF syllable segmenter for Thai
* *han_solo* - CRF syllable segmenter for Thai that can work in the \
Thai social media domain. See `PyThaiNLP/Han-solo \
<https://github.com/PyThaiNLP/Han-solo>`_.
* *ssg* - CRF syllable segmenter for Thai. See `ponrawee/ssg \
<https://github.com/ponrawee/ssg>`_.
* *tcc* (default) - Thai Character Cluster (Theeramunkong et al. 2000)
* *tcc_p* - Thai Character Cluster with the improved rules used in newmm
* *tltk* - syllable tokenizer from tltk
* *tltk* - syllable tokenizer from tltk. See `tltk \
<https://pypi.org/project/tltk/>`_.
* *wangchanberta* - SentencePiece from wangchanberta model
:Example:
@@ -600,6 +605,8 @@ def subword_tokenize(
from pythainlp.tokenize.ssg import segment
elif engine == "tltk":
from pythainlp.tokenize.tltk import syllable_tokenize as segment
elif engine == "han_solo":
from pythainlp.tokenize.han_solo import segment
else:
raise ValueError(
f"""Tokenizer \"{engine}\" not found.
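
With the dispatch above in place, the syllable engines listed in the docstring can be swapped through the engine argument. A small sketch, not part of the commit; ssg needs the optional ssg package and han_solo needs python-crfsuite.

from pythainlp.tokenize import subword_tokenize

text = "สวัสดีชาวโลก"
# Compare syllable-level engines; han_solo is the engine added by this commit.
for engine in ("tcc", "ssg", "han_solo"):
    print(engine, subword_tokenize(text, engine=engine))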
140 changes: 140 additions & 0 deletions pythainlp/tokenize/han_solo.py
@@ -0,0 +1,140 @@
# -*- coding: utf-8 -*-
# Copyright (C) 2016-2023 PyThaiNLP Project
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List

from pythainlp.corpus import path_pythainlp_corpus

try:
    import pycrfsuite
except ImportError:
    raise ImportError(
        "pycrfsuite not found; install it with: pip install python-crfsuite"
    )

# Load the pretrained Han-solo CRF model shipped with the PyThaiNLP corpus
# (pythainlp/corpus/han_solo.crfsuite, added in this commit). The tagger is
# opened once at import time and reused by segment().
tagger = pycrfsuite.Tagger()
tagger.open(path_pythainlp_corpus('han_solo.crfsuite'))


class Featurizer:
    # This class is adapted from ssg at https://github.com/ponrawee/ssg.
    # Copyright 2019 Ponrawee Prasertsom

    # Licensed under the Apache License, Version 2.0 (the "License");
    # you may not use this file except in compliance with the License.
    # You may obtain a copy of the License at

    # http://www.apache.org/licenses/LICENSE-2.0

    # Unless required by applicable law or agreed to in writing, software
    # distributed under the License is distributed on an "AS IS" BASIS,
    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    # See the License for the specific language governing permissions and
    # limitations under the License.

    # Feature keys have the form "<position relative to the anchor>|<character or n-gram>",
    # e.g. "1|A" means the character one position to the right of the anchor is "A".
    # With return_type='dict' each key maps to 1; with return_type='list' the keys
    # themselves are collected.

    def __init__(self, N=2, sequence_size=1, delimiter=None):
        self.N = N
        self.delimiter = delimiter
        self.radius = N + sequence_size

    def pad(self, sentence, padder='#'):
        return padder * self.radius + sentence + padder * self.radius

    def featurize(self, sentence, padding=True, indiv_char=True, return_type='list'):
        if padding:
            sentence = self.pad(sentence)
        all_features = []
        all_labels = []
        skip_next = False
        for current_position in range(self.radius, len(sentence) - self.radius + 1):
            if skip_next:
                skip_next = False
                continue
            features = {}
            if return_type == 'list':
                features = []
            cut = 0
            char = sentence[current_position]
            if char == self.delimiter:
                cut = 1
                skip_next = True
            counter = 0
            chars_left = ''
            chars_right = ''
            chars = ''
            abs_index_left = current_position  # left starts at -1
            abs_index_right = current_position - 1  # right starts at 0
            while counter < self.radius:
                # relative to position 0 this walks -1, -2, -3, ... (out to -radius)
                abs_index_left -= 1
                char_left = sentence[abs_index_left]
                while char_left == self.delimiter:
                    abs_index_left -= 1
                    char_left = sentence[abs_index_left]
                relative_index_left = -counter - 1
                # collect the character
                chars_left = char_left + chars_left
                # add it as a feature
                if indiv_char:
                    left_key = '|'.join([str(relative_index_left), char_left])
                    if return_type == 'dict':
                        features[left_key] = 1
                    else:
                        features.append(left_key)

                # relative to position 0 this walks 0, 1, 2, ... (out to radius - 1)
                abs_index_right += 1
                char_right = sentence[abs_index_right]
                while char_right == self.delimiter:
                    abs_index_right += 1
                    char_right = sentence[abs_index_right]
                relative_index_right = counter
                chars_right += char_right
                if indiv_char:
                    right_key = '|'.join([str(relative_index_right), char_right])
                    if return_type == 'dict':
                        features[right_key] = 1
                    else:
                        features.append(right_key)

                counter += 1

            chars = chars_left + chars_right
            for i in range(0, len(chars) - self.N + 1):
                ngram = chars[i:i + self.N]
                ngram_key = '|'.join([str(i - self.radius), ngram])
                if return_type == 'dict':
                    features[ngram_key] = 1
                else:
                    features.append(ngram_key)
            all_features.append(features)
            if return_type == 'list':
                cut = str(cut)
            all_labels.append(cut)

        return {
            'X': all_features,
            'Y': all_labels
        }


_to_feature = Featurizer()


def segment(text: str) -> List[str]:
    """
    Syllable segmentation with the Han-solo CRF model.

    :param str text: Thai text to be segmented
    :return: list of syllables
    :rtype: List[str]
    """
    x = _to_feature.featurize(text)['X']
    y_pred = tagger.tag(x)
    list_cut = []
    for char, label in zip(text, y_pred):
        # label "1" marks the start of a new syllable; the first character
        # always starts one, so we never index into an empty list.
        if label == '1' or not list_cut:
            list_cut.append(char)
        else:
            list_cut[-1] += char
    return list_cut
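
The module-level segment function can also be called directly; a brief sketch, with the expected behaviour taken from the assertions added to tests/test_tokenize.py below.

from pythainlp.tokenize.han_solo import segment

print(segment("สวัสดีดาวอังคาร"))
# per the tests, "ดาว" comes out as a single syllable and a bare "า" never does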
10 changes: 10 additions & 0 deletions tests/test_tokenize.py
@@ -399,6 +399,7 @@ def test_subword_tokenize(self):
"āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļŠāļēāļ§āđ‚āļĨāļ", engine="dict")
)
self.assertEqual(subword_tokenize(None, engine="ssg"), [])
self.assertEqual(subword_tokenize(None, engine="han_solo"), [])
self.assertEqual(
subword_tokenize("แมวกินปลา", engine="ssg"), ["แมว", "กิน", "ปลา"]
)
@@ -408,6 +409,15 @@ def test_subword_tokenize(self):
self.assertFalse(
"āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="ssg")
)
self.assertEqual(
subword_tokenize("แมวกินปลา", engine="han_solo"), ["แมว", "กิน", "ปลา"]
)
self.assertTrue(
"āļ”āļēāļ§" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="han_solo")
)
self.assertFalse(
"āļē" in subword_tokenize("āļŠāļ§āļąāļŠāļ”āļĩāļ”āļēāļ§āļ­āļąāļ‡āļ„āļēāļĢ", engine="han_solo")
)
self.assertFalse(
" " in subword_tokenize("āļžāļąāļ™āļ˜āļĄāļīāļ•āļĢ āļŠāļē āļ™āļĄ", keep_whitespace=False)
)
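
Assuming the project's usual unittest layout, the new assertions can be exercised with something like python -m unittest discover -s tests -p test_tokenize.py, or with whatever test runner the repository normally uses.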
