### Create Trie class

In [3]:
import pickle
import json
# Trie class
class Trie:
  # init Trie class
  def __init__(self):
    self.root = self.getNode()

  def getNode(self):
    return {"isEndOfWord": False, "children": {}}

  def insertWord(self, word):
    current = self.root
    for ch in word:

      # if current["children"].has_key(ch):
      if ch in current["children"]:
        node = current["children"][ch]
      else:
        node = self.getNode()
        current["children"][ch] = node

      current = node
    current["isEndOfWord"] = True

  def searchWord(self, word):
    current = self.root
    for ch in word:
      if ch not in current["children"]:
        return False
      node = current["children"][ch]

      current = node
    return current["isEndOfWord"]

  def searchWordPrefix(self, word):
    current = self.root
    for ch in word:
      if ch not in current["children"]:
        return False
      node = current["children"][ch]

      current = node
    # return True if children contain keys and values
    return bool(current["children"])

  def deleteWord(self, word):
    self._delete(self.root, word, 0)

  def _delete(self, current, word, index):
    if(index == len(word)):
      if not current["isEndOfWord"]:
        return False
      current["isEndOfWord"] = False
      return len(current["children"].keys()) == 0

    ch = word[index]
    # if not current["children"].has_key(ch):
    if ch not in current["children"]:
      return False
    node = current["children"][ch]

    should_delete_current_node = self._delete(node, word, index + 1)

    if should_delete_current_node:
      current["children"].pop(ch)
      return len(current["children"].keys()) == 0

    return False

  def save_to_pickle(self, file_name):
    f = open(file_name + ".pkl", "wb")
    pickle.dump(self.root, f)
    f.close()

  def load_from_pickle(self, file_name):
    f = open(file_name + ".pkl", "rb")
    self.root = pickle.load(f)
    f.close()

  def save_to_json(self, file_name):
    json_data = json.dumps(self.root)
    f = open(file_name + ".json", "w")
    f.write(json_data)
    f.close()

  def load_from_json(self, file_name):
    json_file = open(file_name + ".json", "r")
    self.root = json.load(json_file)
    json_file.close()



### Let start train our model

In [10]:
model = Trie()
input_file_path = "data/sea.txt"
with open(input_file_path, "r") as f:
  words = f.read().split("\n")
print("Training start")

for word in words:
  if not bool(word.strip()):
    continue

  model.insertWord(word)

#model.save_to_pickle("train_data_v2")
print("Training completed")

Training start
Training completed


### Lest test our model.

In [17]:
print(model.searchWord('')) # should False
print(model.searchWord('គ្រុយ')) # should be True
print(model.searchWord('គ្រុ')) # should be False
#print(model.searchWordPrefix('គ្រុ')) # should be True
#print(model.searchWordPrefix('សួ')) # should be False

False
True
False


### Let build model for seperated word from sentences

In [33]:
class WordSegmentation:
  # init Trie class
  def __init__(self, text):
    self.text = text#.decode('utf-8')
    self.model = Trie()
    self.model.load_from_pickle("train_data_v2")
    # self.result = []
    self.result_all = []
    # self.leftover = []
    self.startIndex = 0

  def isNumber(self, ch):
    # number letter
    return ch in "0123456789០១២៣៤៥៦៧៨៩"

  def parseNumber(self, index):
    result = ""
    while (index < len(self.text)):
      ch = self.text[index]
      ch = ch#.encode('utf-8')
      if self.isNumber(ch):
        result += self.text[index]
        index += 1
      else:
        return result

    return result
  def isEnglish(self, ch):
    return ch in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

  def parseEnglish(self, index):
    result = ""
    while (index < len(self.text)):
      ch = self.text[index]
      ch = ch#.encode('utf-8')
      if (self.isEnglish(ch) or self.isNumber(ch)):
        result += ch;
        index += 1
      else:
        return result
    return result

  def parseTrie(self, index):
    word = ''
    foundWord = ''

    while (index < len(self.text)):
      ch = self.text[index]
      ch = ch#.encode('utf-8')
      word += ch
      if self.model.searchWordPrefix(word):
        if self.model.searchWord(word):
          foundWord = word
      elif self.model.searchWord(word):
        return word
      else:
        return foundWord;

      index += 1

    return ""

  def check_words(self):
    while(self.startIndex < len(self.text)):
      ch = self.text[self.startIndex]
      ch = ch#.encode('utf-8')
      word = ''

      if self.isNumber(ch):
        word = self.parseNumber(self.startIndex)#.encode('utf-8')
      elif self.isEnglish(ch):
        word = self.parseEnglish(self.startIndex)#.encode('utf-8')
      else:
        word = self.parseTrie(self.startIndex)

      length = len(word)#.decode('utf-8'))
      if length == 0:
        self.result_all.append(ch)#.decode('utf-8'))
        self.startIndex += 1
        continue

      result = {}
      if self.model.searchWord(word) or self.isNumber(ch) or self.isEnglish(ch):
        #self.result_all.append()
        result["text"] = word#.decode('utf-8')
      else:
        result["text"] = word#.decode('utf-8')

      self.result_all.append(result)
      self.startIndex += length

    # # write to file
    # with open('result/segment_word.txt', "w", encoding="utf-8") as f:
    #   for word in self.result:
    #     f.write(word + "\n")
    # with open('result/segment_not_word.txt', "w", encoding="utf-8") as f:
    #   for word in self.leftover:
    #     f.write(word + "\n")

  def show(self):
    print('Text: ' + self.text)
    print(self.result_all)
    #print('After check : [' + ', '.join(self.result_all) + ']')

In [37]:

# kh_text = "អ្នកចេះនិយាយភាសាខ្មែរទេ?"
# kh_text = "ចំណេះ​ដឹង​វិទ្យាសាស្ត្រ​ជា​ចំណុច​គាំទ្រ​ដ៏​សំខាន់​មួយ​ក្នុង​ការ​អភិវឌ្ឍ​សេដ្ឋកិច្ច​សង្គម។ "
# kh_text = "ដឹង​វិទ្យាសាស្ត្រ​ជា​ចំណុច​គាំទ្រ​ដ៏​សំខាន់​មួយ​ក្នុង​ការ​អភិវឌ្ឍ​សេដ្ឋកិច្ច​សង្គម។"
kh_text = "កំពុងលុបការឃោសនារបស់ពួកជ្រុលនិយមលឿនជាងបច្ចុប្បន្ន បើមិនដូច្នេះទេ"
#kh_text = "សហភាពអឺរ៉ុបបានផ្ដល់ពេល៣ខែឲ្"
#kh_text = "ខាងក្រុមហ៊ុនរបស់យើងខ្ញុំត្រូវការជ្រើសរើសនិសិ្សតកម្ពុជាយើងដែលកំពុងរៀនផ្នែកពត័មានវិទ្យានិងផ្នែកទូរគមនាគមន៍"

word_segment = WordSegmentation(kh_text)
word_segment.check_words()
word_segment.show()

Text: កំពុងលុបការឃោសនារបស់ពួកជ្រុលនិយមលឿនជាងបច្ចុប្បន្ន បើមិនដូច្នេះទេ
[{'text': 'កំពុង'}, {'text': 'លុប'}, {'text': 'ការឃោសនា'}, {'text': 'របស់'}, {'text': 'ពួក'}, {'text': 'ជ្រុល'}, {'text': 'និយម'}, {'text': 'លឿន'}, {'text': 'ជាង'}, {'text': 'បច្ចុប្បន្ន'}, ' ', {'text': 'បើមិន'}, 'ដ', 'ូ', {'text': 'ច្នេះ'}, 'ទ', 'េ']
