### Create Trie class

In [81]:
import pickle
import json
# Trie class
class Trie:
    # init Trie class
    def __init__(self):
        self.root = self.getNode()

    def getNode(self):
        return {"isEndOfWord": False, "children": {}}

    def insertWord(self, word):
        current = self.root
        for ch in word:

          # if current["children"].has_key(ch):
            if ch in current["children"]:
                node = current["children"][ch]
            else:
                node = self.getNode()
                current["children"][ch] = node

            current = node
        current["isEndOfWord"] = True

    def searchWord(self, word):
        current = self.root
        for ch in word:
            if ch not in current["children"]:
                return False
            node = current["children"][ch]

            current = node
        return current["isEndOfWord"]

    def searchWordPrefix(self, word):
        current = self.root
        for ch in word:
            if ch not in current["children"]:
                return False
            node = current["children"][ch]

            current = node
        # return True if children contain keys and values
        return bool(current["children"])

    def deleteWord(self, word):
        self._delete(self.root, word, 0)

    def _delete(self, current, word, index):
        if(index == len(word)):
            if not current["isEndOfWord"]:
                return False
            current["isEndOfWord"] = False
            return len(current["children"].keys()) == 0

        ch = word[index]
        # if not current["children"].has_key(ch):
        if ch not in current["children"]:
            return False
        node = current["children"][ch]

        should_delete_current_node = self._delete(node, word, index + 1)

        if should_delete_current_node:
            current["children"].pop(ch)
            return len(current["children"].keys()) == 0

        return False

    def save_to_pickle(self, file_name):
        f = open(file_name + ".pkl", "wb")
        pickle.dump(self.root, f)
        f.close()

    def load_from_pickle(self, file_name):
        f = open(file_name + ".pkl", "rb")
        self.root = pickle.load(f)
        f.close()

    def save_to_json(self, file_name):
        json_data = json.dumps(self.root)
        f = open(file_name + ".json", "w")
        f.write(json_data)
        f.close()

    def load_from_json(self, file_name):
        json_file = open(file_name + ".json", "r")
        self.root = json.load(json_file)
        json_file.close()



### Let start train our model

In [82]:
def reverse_string(str):
    return str[::-1]

In [83]:
modelFW = Trie()
modelBF = Trie()
input_file_path = "data/sea.txt"
with open(input_file_path, "r") as f:
    words = f.read().split("\n")
print("Training start")

for word in words:
    if not bool(word.strip()):
        continue

    modelFW.insertWord(word)
    modelBF.insertWord(reverse_string(word))

modelFW.save_to_pickle("fw_train_data_v2")
modelBF.save_to_pickle("bw_train_data_v2")
print("Training completed")

Training start
Training completed


### Lest test our model.

In [89]:
print(modelFW.searchWord('')) # should False
print(modelFW.searchWordPrefix('កំពុ')) # should be True
print(modelFW.searchWord('គ្រុ')) # should be False
#print(model.searchWordPrefix('គ្រុ')) # should be True
#print(model.searchWordPrefix('សួ')) # should be False
print('----------------') # should False
print(modelBF.searchWord('')) # should False
print(modelBF.searchWord(reverse_string('គ្រុយ'))) # should be True
print(modelBF.searchWord(reverse_string('គ្រុ'))) # should be False

False
True
False
----------------
False
True
False










### Let build model for seperated word from sentences

In [123]:
class WordSegmentation:
    # init Trie class
    def __init__(self, text):
        self.text = text#.decode('utf-8')
        self.modelFW = Trie()
        self.modelBW = Trie()
        self.modelFW.load_from_pickle("fw_train_data_v2")
        self.modelBW.load_from_pickle("bw_train_data_v2")
        # self.result = []
        self.result_all = []
        # self.leftover = []
        self.startIndex = 0

    def isNumber(self, ch):
        # number letter
        return ch in "0123456789០១២៣៤៥៦៧៨៩"

    def parseNumber(self, index, text):
        result = ""
        while (index < len(text)):
            ch = self.text[index]
            ch = ch#.encode('utf-8')
            if self.isNumber(ch):
                result += self.text[index]
                index += 1
            else:
                return result

        return result
    def isEnglish(self, ch):
        return ch in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def parseEnglish(self, index, text):
        result = ""
        while (index < len(text)):
            ch = self.text[index]
            ch = ch#.encode('utf-8')
            if (self.isEnglish(ch) or self.isNumber(ch)):
                result += ch;
                index += 1
            else:
                return result
        return result

    def parseTrie(self, index, text, model):
        word = ''
        foundWord = ''

        while (index < len(text)):
            ch = text[index]
            ch = ch#.encode('utf-8')
            word += ch
            if model.searchWordPrefix(word):
                if model.searchWord(word):
                    foundWord = word
            elif model.searchWord(word):
                return word
            else:
                return foundWord;

            index += 1

        return foundWord
    
    def FW(self):
        result = []
        error = 0
        self.startIndex = 0
        text = self.text
        while(self.startIndex < len(text)):
            ch = self.text[self.startIndex]
            ch = ch#.encode('utf-8')
            word = ''

            if self.isNumber(ch):
                word = self.parseNumber(self.startIndex, text)#.encode('utf-8')
            elif self.isEnglish(ch):
                word = self.parseEnglish(self.startIndex, text)#.encode('utf-8')
            else:
                word = self.parseTrie(self.startIndex, text, self.modelFW)

            length = len(word)#.decode('utf-8'))
            if length == 0:
                error += 1
                result.append(ch)#.decode('utf-8'))
                self.startIndex += 1
                continue

            rw = {}
            rw["text"] = word

            result.append(rw)
            self.startIndex += length
            
        return (error, result)

    def BW(self):
        result = []
        error = 0
        errorWord = ''
        self.startIndex = 0
        text = reverse_string(self.text)
        while(self.startIndex < len(text)):
            ch = self.text[self.startIndex]
            ch = ch#.encode('utf-8')
            word = ''

            if self.isNumber(ch):
                word = self.parseNumber(self.startIndex, text)#.encode('utf-8')
            elif self.isEnglish(ch):
                word = self.parseEnglish(self.startIndex, text)#.encode('utf-8')
            else:
                word = self.parseTrie(self.startIndex, text, self.modelBW)

            length = len(word)#.decode('utf-8'))
            if length == 0:
                error += 1
                result.append(ch)#.decode('utf-8'))
                errorWord += ch
                self.startIndex += 1
                continue

            if len(errorWord) > 0:
                result.append(errorWord)
                errorWord = ''
            rw = {}
            rw["text"] = reverse_string(word)

            result.append(rw)
            self.startIndex += length
        return (error, reverse_string(result))
        

    def check_words(self):
        fw_error_count, resultFW = self.FW()
        bw_error_count, resultBW = self.BW()
        print('Use FW error:', fw_error_count)
        print('Use BW error:', bw_error_count)
        if fw_error_count <= bw_error_count:
            print('Use FW')
            self.result_all.append(resultFW)
        else:
            print('Use BW')
            self.result_all.append(resultBW)


    def show(self):
        print('Text: ' + self.text)
        print(self.result_all)
        #print('After check : [' + ', '.join(self.result_all) + ']')

In [125]:

# kh_text = "អ្នកចេះនិយាយភាសាខ្មែរទេ?"
# kh_text = "ចំណេះ​ដឹង​វិទ្យាសាស្ត្រ​ជា​ចំណុច​គាំទ្រ​ដ៏​សំខាន់​មួយ​ក្នុង​ការ​អភិវឌ្ឍ​សេដ្ឋកិច្ច​សង្គម។ "
# kh_text = "ដឹង​វិទ្យាសាស្ត្រ​ជា​ចំណុច​គាំទ្រ​ដ៏​សំខាន់​មួយ​ក្នុង​ការ​អភិវឌ្ឍ​សេដ្ឋកិច្ច​សង្គម។"
# kh_text = "កំពុងលុបការឃោសនារបស់ពួកជ្រុលនិយមលឿនជាងបច្ចុប្បន្ន បើមិនដូច្នេះទេ"
kh_text = "មនុស្សម្នាក់"
kh_text = "សហភាពអឺរ៉ុបបានផ្ដល់ពេល៣ខែឲ្"
# kh_text = "ខាងក្រុមហ៊ុនរបស់យើងខ្ញុំត្រូវការជ្រើសរើសនិសិ្សតកម្ពុជាយើងដែលកំពុងរៀនផ្នែកពត័មានវិទ្យានិងផ្នែកទូរគមនាគមន៍"

word_segment = WordSegmentation(kh_text)
word_segment.check_words()
word_segment.show()

Use FW error: 2
Use BW error: 7
Use FW
Text: សហភាពអឺរ៉ុបបានផ្ដល់ពេល៣ខែឲ្
[[{'text': 'សហភាព'}, {'text': 'អឺរ៉ុប'}, {'text': 'បាន'}, {'text': 'ផ្ដល់'}, {'text': 'ពេល'}, {'text': '៣'}, {'text': 'ខែ'}, 'ឲ', '្']]
