In [19]:
from collections import defaultdict
from json import loads
from typing import Callable

from requests import request, ConnectionError
from tqdm import tqdm

# Shared registries filled in by the loader / mapping functions of this notebook.
# NOTE: defaultdict(None) has no default factory, so these behave exactly like
# plain dicts (looking up a missing key raises KeyError).
word_dict = defaultdict(None)
label_func = defaultdict(None)
known_prefixes = set()
known_suffixes = set()

# Names of the labelling schemes used as keys of label_func / Word.label.
EVQR_AFFIX = '<evqr.affix>'
PREFIX_AND_SUFFIX = '<prefix.and.suffix>'
VOWEL = '<vowel>'

# Optional local configuration. The context manager closes the file, so no
# explicit close() is needed.
try:
    with open('.env.json') as f:
        ENV_VARIABLES = loads(f.read())
except FileNotFoundError:
    ENV_VARIABLES = {'DATA_DIR': 'C:\\'}
# Fall back to the same default when the file exists but omits DATA_DIR.
# DATA_DIR is concatenated directly with file names, so it must end with a
# path separator.
DATA_DIR = ENV_VARIABLES.get('DATA_DIR', 'C:\\')
# Despite the name this is an HTTP base URL; it is used when a data file is
# not found under DATA_DIR.
FTP_DIR = 'http://m106.nthu.edu.tw/~s106062341/morpheme_finder_data/'


class Word:
    """A vocabulary entry with its affixes and the labels computed for it.

    Attributes:
        text: the word itself.
        affix_list: affix strings associated with the word.
        synonym: mapping whose values are summed by ``count``.
        label: label-scheme name -> result of the matching labelling function.
    """

    @staticmethod
    def create_synonym_postfix(word, delete=None, append=None):
        """Return ``word`` followed by optional ``--delete--`` / ``++append++`` markers."""
        removed = f'--{delete}--' if delete is not None else ''
        added = f'++{append}++' if append is not None else ''
        return f'{word}{removed}{added}'

    @staticmethod
    def create_synonym_prefix(word, delete=None, append=None):
        """Return ``word`` preceded by optional ``--delete--`` / ``++append++`` markers."""
        removed = f'--{delete}--' if delete is not None else ''
        added = f'++{append}++' if append is not None else ''
        return f'{removed}{added}{word}'

    @staticmethod
    def letter_cmp(a, b):
        """Return the length of the common prefix of ``a`` and ``b``.

        The result is the index of the first position where the two strings
        differ, or the length of the shorter string when one is a prefix of
        the other (including when they are equal).
        """
        shortest = min(len(a), len(b))
        for i in range(shortest):
            if a[i] != b[i]:
                # Stop at the FIRST mismatch; later mismatches are irrelevant.
                return i
        # No mismatch: the shorter string is entirely shared.
        return shortest

    def __init__(self, text, affix_list):
        self.text = text
        self.affix_list = affix_list
        # defaultdict(None) has no default factory: these act as plain dicts.
        self.synonym = defaultdict(None)
        self.label = defaultdict(None)

    @property
    def count(self):
        """Sum of all values stored in ``synonym``."""
        return sum(self.synonym.values())

    def create_label(self, label_name, *args):
        """Compute and store the label ``label_name`` for this word.

        Looks up the labelling function registered under ``label_name`` in the
        module-level ``label_func`` registry, calls it with this word and
        ``args``, and stores the result in ``self.label[label_name]``.

        Returns:
            False when no function is registered under ``label_name``,
            True otherwise.
        """
        if label_name not in label_func:
            return False
        self.label[label_name] = label_func[label_name](self, *args)
        return True


def get_file(filename: str, callback: Callable[[str], None]) -> bool:
    """Read a data file and hand its text to ``callback``.

    The file is first looked up locally at ``DATA_DIR + filename``. If it does
    not exist there, it is downloaded from ``FTP_DIR + filename`` and decoded
    as Big5.

    Args:
        filename: name of the data file, appended verbatim to the base
            directory / URL.
        callback: called once with the full text content of the file.

    Returns:
        True when the content was obtained and ``callback`` ran; False when
        the remote fallback failed (a message is printed in that case).
    """
    try:
        with open(f'{DATA_DIR}{filename}', 'r') as f:
            content = f.read()
    except FileNotFoundError:
        content = None

    if content is not None:
        # Run the callback outside the try block so that an error raised by
        # the callback itself is never mistaken for a missing local file.
        callback(content)
        return True

    try:
        res = request('GET', f'{FTP_DIR}{filename}')
        # Reject 4xx/5xx responses instead of parsing an error page as data.
        res.raise_for_status()
        res.encoding = 'Big5'
        callback(res.text)
        return True
    except ConnectionError:
        print('HTTP connection failed')
        return False
    except Exception as e:
        print(f'Load failed: {e}')
        return False


def load_vocabulary():
    """Load ``EVQR.word.and.affix.txt`` into ``word_dict``.

    For every data line the hyphens are removed and the line is split on
    single spaces; the first token is the word and the remaining tokens are
    its affixes. Prints ``Load done`` when the file could be read.
    """
    def callback(content):
        # The first line and the last split element are not data lines.
        for line in content.split('\n')[1:-1]:
            # The final token of each line is discarded.
            # presumably every line ends with a trailing separator — TODO confirm
            tokens = line.replace('-', '').split(' ')[:-1]
            if not tokens:
                # Blank (or space-less) line: nothing to unpack, skip it
                # instead of raising ValueError on the star-unpacking below.
                continue
            word, *affix_list = tokens
            word_dict[word] = Word(word, affix_list)
    if get_file('EVQR.word.and.affix.txt', callback):
        print('Load done')


def load_prefix_and_suffix():
    """Fill ``known_prefixes`` and ``known_suffixes`` from their data files.

    ``prefixes.txt`` is read without its first line and last split element;
    ``suffixes.txt`` is read in full. ``suffixes.txt`` is only requested when
    ``prefixes.txt`` loaded successfully. A confirmation is printed when both
    loads succeed.
    """

    def parse_line(row):
        # Drop the last character, trim, remove hyphens, split on ', ' and
        # discard empty pieces.
        cleaned = row[:-1].strip().replace('-', '')
        return [piece for piece in cleaned.split(', ') if len(piece) > 0]

    def prefix_callback(content):
        rows = content.split('\n')
        for row in rows[1:-1]:
            known_prefixes.update(parse_line(row))

    def suffix_callback(content):
        rows = content.split('\n')
        for row in rows:
            known_suffixes.update(parse_line(row))

    prefixes_loaded = get_file('prefixes.txt', prefix_callback)
    if prefixes_loaded and get_file('suffixes.txt', suffix_callback):
        print('Load prefixes & suffixes done')


def mapping_label_func():
    """Register the labelling functions in the ``label_func`` registry.

    Every labeller takes a ``Word`` and returns a list of ``(letter, label)``
    pairs, one per character of ``word.text``.
    """

    def evqr_affix(word):
        """Label positions using the affixes listed in ``word.affix_list``.

        Affixes are consumed left to right with a scan cursor ``pos``. The
        position where an affix is found gets label 1, unless the cursor is
        still at the start of the word, in which case it stays 0.
        """
        text = word.text
        label = [0] * len(text)
        pos = 0  # scan cursor: the next affix is searched from here
        for affix in word.affix_list:
            # Search with the lower-cased affix and from the cursor, and act
            # only on a real hit. A failed find() returns -1, which must not
            # be used as a list index (it would overwrite the last label).
            needle = affix.lower()
            start = text.find(needle, pos)
            if start != -1:
                label[start] = 1 if pos != 0 else 0
                pos = start + len(needle)
            else:
                # Affix not found from the cursor onwards: fall back to
                # comparing it with the remaining text via Word.letter_cmp.
                k = Word.letter_cmp(text[pos:], affix)
                if k > 1:
                    label[pos] = 1 if pos != 0 else 0
                    pos += 1

        return list(zip(text, label))

    def vowel(word):
        """Label each letter 1 if it is one of a/e/i/o/u, else 0."""
        vowels = {"a", "e", "i", "o", "u"}
        return [(letter, int(letter in vowels)) for letter in word.text]

    def prefix_and_suffix(word):
        """Label positions adjacent to known prefixes / suffixes.

        Labels: 1 = index right after a leading substring found in
        ``known_prefixes``; 2 = index right before a trailing substring found
        in ``known_suffixes``; 3 = both conditions hold for the same index.
        """
        word_len = len(word.text)
        label = [0] * word_len

        # Leading substrings, from longest (word_len - 1 chars) to empty.
        for i in range(word_len):
            pattern = word.text[:word_len - 1 - i]
            if pattern in known_prefixes:
                label[len(pattern)] = 1

        # Trailing substrings text[i + 1:]; index i is the letter before them.
        for i in range(word_len):
            pattern = word.text[i + 1:]
            if pattern in known_suffixes:
                label[i] = 2 if label[i] == 0 else 3

        return list(zip(word.text, label))

    label_func[EVQR_AFFIX] = evqr_affix
    label_func[VOWEL] = vowel
    label_func[PREFIX_AND_SUFFIX] = prefix_and_suffix
    print('Mapping done')


def create_label_data():
    """Compute every registered labelling for every word in ``word_dict``.

    Stops at the first labelling that cannot be created, prints which one
    failed and returns False. Returns True (after printing ``Label done``)
    when all words were labelled.
    """
    # (scheme name, message printed when that scheme is not registered)
    steps = (
        (EVQR_AFFIX, 'Failed at label with EVQR.affix'),
        (VOWEL, 'Failed at label with Vowel'),
        (PREFIX_AND_SUFFIX, 'Failed at label with prefix & suffix'),
    )
    for entry in tqdm(word_dict.values()):
        for scheme, failure_message in steps:
            if entry.create_label(scheme):
                continue
            print(failure_message)
            return False
    print('Label done')
    return True

# if __name__ == '__main__':
#     load_vocabulary()
#     load_prefix_and_suffix()
#     mapping_label_func()
#     if create_label_data():
#         for w in word_dict.values():
#             print(w.label[EVQR_AFFIX])  # labels based on EVQR.word.and.affix.txt
#             # print(w.label[PREFIX_AND_SUFFIX])  # labels based on prefixes.txt & suffixes.txt
#             # print(w.label[VOWEL])  # labels based on the positions of vowels in the word

In [29]:
prepared_words = []

# Load the raw data, register the labelling functions, then label every word.
load_vocabulary()
load_prefix_and_suffix()
mapping_label_func()

# Collect the per-letter VOWEL labelling of each word.
# Other labellings available on each word:
#   .label[EVQR_AFFIX]         (from EVQR.word.and.affix.txt)
#   .label[PREFIX_AND_SUFFIX]  (from prefixes.txt & suffixes.txt)
if create_label_data():
    prepared_words.extend(entry.label[VOWEL] for entry in word_dict.values())

100%|██████████| 5237/5237 [00:00<00:00, 57683.15it/s]

Load done
Load prefixes & suffixes done
Mapping done
Label done





In [30]:
# Sanity check: number of labelled words collected in prepared_words.
len(prepared_words)

5237

In [31]:
def create_char_features(word, i):
    """Build the feature list for the character at index ``i`` of ``word``.

    ``word`` is only read through ``word[k][0]``, so it can be a sequence of
    ``(letter, label)`` pairs or a plain string. The features are the current
    character plus, when available, the one and two preceding characters and
    their combinations. ``"BOS"`` is added when there is no preceding
    character.
    """
    current = word[i][0]
    features = ['bias', 'char=' + current]

    if i < 1:
        # No left context at all.
        features.append("BOS")
        return features

    previous = word[i - 1][0]
    features.append('char-1=' + previous)
    features.append('char-1:0=' + previous + current)

    if i >= 2:
        before_previous = word[i - 2][0]
        features.append('char-2=' + before_previous)
        features.append('char-2:0=' + before_previous + previous + current)
        features.append('char-2:-1=' + before_previous + previous)

    # Disabled three-character left context:
    #     if i >= 3:
    #         features.extend([
    #             'char-3:0=' + word[i-3][0] + word[i-2][0] + word[i-1][0] + word[i][0],
    #             'char-3:-1=' + word[i-3][0] + word[i-2][0] + word[i-1][0],
    #         ])
    return features

In [32]:
def create_word_features(prepared_word):
    """Return one feature list per position of ``prepared_word``."""
    per_char = []
    for position in range(len(prepared_word)):
        per_char.append(create_char_features(prepared_word, position))
    return per_char

def create_word_labels(prepared_word):
    """Return the label (second item) of every element, converted to ``str``."""
    labels = []
    for part in prepared_word:
        labels.append(str(part[1]))
    return labels

# Number of leading words used for training; the remainder is the test set.
TRAIN_SIZE = 4000

# NOTE(review): the split is positional (no shuffling), so it follows the
# insertion order of word_dict — confirm that order does not bias the test set.
train_words = prepared_words[:TRAIN_SIZE]
test_words = prepared_words[TRAIN_SIZE:]

X = [create_word_features(pw) for pw in train_words]
y = [create_word_labels(pw) for pw in train_words]

X_test = [create_word_features(pw) for pw in test_words]
y_test = [create_word_labels(pw) for pw in test_words]

In [33]:
# A plain string works here as well: create_char_features reads word[i][0],
# and indexing a one-character string with [0] yields that same character.
create_word_features("phyllode")

[['bias', 'char=p', 'BOS'],
 ['bias', 'char=h', 'char-1=p', 'char-1:0=ph'],
 ['bias',
  'char=y',
  'char-1=h',
  'char-1:0=hy',
  'char-2=p',
  'char-2:0=phy',
  'char-2:-1=ph'],
 ['bias',
  'char=l',
  'char-1=y',
  'char-1:0=yl',
  'char-2=h',
  'char-2:0=hyl',
  'char-2:-1=hy'],
 ['bias',
  'char=l',
  'char-1=l',
  'char-1:0=ll',
  'char-2=y',
  'char-2:0=yll',
  'char-2:-1=yl'],
 ['bias',
  'char=o',
  'char-1=l',
  'char-1:0=lo',
  'char-2=l',
  'char-2:0=llo',
  'char-2:-1=ll'],
 ['bias',
  'char=d',
  'char-1=o',
  'char-1:0=od',
  'char-2=l',
  'char-2:0=lod',
  'char-2:-1=lo'],
 ['bias',
  'char=e',
  'char-1=d',
  'char-1:0=de',
  'char-2=o',
  'char-2:0=ode',
  'char-2:-1=od']]

In [34]:
import pycrfsuite

In [35]:
# File the trained model is written to and then loaded from.
MODEL_PATH = 'word-segmentation.crfsuite'

trainer = pycrfsuite.Trainer(verbose=False)

# One training item per word: its per-character features and labels.
for xseq, yseq in zip(X, y):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 1.0,  # coefficient for L1 regularization
    'c2': 1e-3,  # coefficient for L2 regularization
    'max_iterations': 50,  # cap on training iterations
    # also generate transition features that never occur in the training data
    'feature.possible_transitions': True
})

trainer.train(MODEL_PATH)

# Load the freshly trained model; the tagger stays open for segment_word().
tagger = pycrfsuite.Tagger()
tagger.open(MODEL_PATH)


def segment_word(word):
    """Tag ``word`` with the trained tagger and mark tagged letters.

    Spaces are removed from ``word`` first. Every letter whose predicted tag
    is an integer >= 1 is emitted with a single space in front of it; all
    other letters are emitted unchanged.
    """
    letters = word.replace(" ", "")
    tags = tagger.tag(create_word_features(letters))
    pieces = []
    for position, tag in enumerate(tags):
        letter = letters[position]
        pieces.append(" " + letter if int(tag) >= 1 else letter)
    return "".join(pieces)

In [36]:
# Demo of the trained tagger. The training data was built from w.label[VOWEL],
# so a space marks a letter the model tags as a vowel position, not a morpheme
# boundary.
segment_word("phyllode")

'phyll od e'