# Morpheme Finder
[TOC]


## Import & Define Env Variables

In [14]:
from collections import defaultdict
from json import loads
from math import ceil
from random import sample
from typing import Callable

import pycrfsuite
from requests import request, ConnectionError
from tqdm import tqdm

# Registries shared by the cells below. ``defaultdict(None)`` has no default
# factory, so looking up a missing key still raises KeyError like a plain dict.
word_dict = defaultdict(None)   # word text -> Word instance
label_func = defaultdict(None)  # label name -> labeling function
known_prefixes = set()          # filled by prefix_callback
known_suffixes = set()          # filled by suffix_callback

# Label names, used as keys into ``label_func`` and ``Word.label``.
EVQR_AFFIX = '<evqr.affix>'
PREFIX_AND_SUFFIX = '<prefix.and.suffix>'
VOWEL = '<vowel>'

# Number of folds the word list is split into for cross-validation.
CROSS_VALIDATION_FOLD = 5

In [15]:
# Load optional per-machine settings from ``.env.json``; when the file is
# absent, fall back to a built-in default for DATA_DIR.
try:
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open('.env.json') as f:
        ENV_VARIABLES = loads(f.read())
except FileNotFoundError:
    ENV_VARIABLES = {'DATA_DIR': 'C:\\'}
# Prefix for local data files. It is concatenated directly with the file name,
# so it has to end with a path separator.
DATA_DIR = ENV_VARIABLES['DATA_DIR']
# Base URL used when a data file is not found locally.
FTP_DIR = 'http://m106.nthu.edu.tw/~s106062341/morpheme_finder_data/'

### Class Word

In [16]:
class Word:
    """A vocabulary entry together with its affixes, synonyms and labels."""

    @staticmethod
    def create_synonym_postfix(word, delete=None, append=None):
        """Return ``word`` followed by optional ``--delete--`` / ``++append++`` markers.

        A marker is emitted only when the matching argument is not ``None``.
        """
        removed = '' if delete is None else f'--{delete}--'
        added = '' if append is None else f'++{append}++'
        return f'{word}{removed}{added}'

    @staticmethod
    def create_synonym_prefix(word, delete=None, append=None):
        """Return ``word`` preceded by optional ``--delete--`` / ``++append++`` markers.

        A marker is emitted only when the matching argument is not ``None``.
        """
        removed = '' if delete is None else f'--{delete}--'
        added = '' if append is None else f'++{append}++'
        return f'{removed}{added}{word}'

    @staticmethod
    def letter_cmp(a, b):
        """Return the index of the last position at which ``a`` and ``b`` differ.

        Only the overlapping part of the two sequences is compared. The result
        is 0 when no compared position differs, and it never exceeds the
        length of either argument.

        NOTE(review): this is the *last* mismatch, not the length of the
        common prefix -- confirm that this is what callers expect.
        """
        mismatches = [idx for idx, (x, y) in enumerate(zip(a, b)) if x != y]
        divider = mismatches[-1] if mismatches else 0
        return min(divider, len(a), len(b))

    def __init__(self, text, affix_list):
        """Store the word text and its affixes; start with no synonyms or labels."""
        self.text = text
        self.affix_list = affix_list
        # No default factory: missing keys raise KeyError like a plain dict.
        self.synonym = defaultdict(None)
        self.label = defaultdict(None)

    @property
    def count(self):
        """Sum of all values stored in ``self.synonym``."""
        return sum(self.synonym.values())

    def create_label(self, label_name, *args):
        """Compute and store the label registered under ``label_name``.

        The function found in ``label_func`` is called with this word and any
        extra ``args``; its result is stored in ``self.label[label_name]``.

        Returns:
            True when a function was registered and applied, False otherwise.
        """
        if label_name in label_func:
            self.label[label_name] = label_func[label_name](self, *args)
            return True
        return False

## Data Accessing
### First, provide a method to access files either in local storage or on the remote server (`FTP_DIR`, fetched over HTTP)

In [17]:
def get_file(filename: str, callback: Callable[[str], object]) -> bool:
    """Load a data file and hand its text to ``callback``.

    The file is first looked up locally at ``DATA_DIR + filename``. When it
    does not exist there, it is downloaded from ``FTP_DIR + filename`` and
    decoded as Big5.

    Args:
        filename: Name of the data file, appended verbatim to ``DATA_DIR``
            and, for the fallback, to ``FTP_DIR``.
        callback: Called once with the whole file content as a string; its
            return value is ignored.

    Returns:
        True when the content was delivered to ``callback``. False when the
        remote fallback failed; a message is printed in that case.

    Raises:
        OSError: Errors other than ``FileNotFoundError`` raised while reading
            the local file are not handled here and propagate to the caller.
    """
    try:
        # The ``with`` block closes the file on exit, so no explicit close()
        # is needed.
        with open(f'{DATA_DIR}{filename}', 'r') as f:
            callback(f.read())
        return True
    except FileNotFoundError:
        try:
            # A timeout keeps an unresponsive server from hanging the caller.
            res = request('GET', f'{FTP_DIR}{filename}', timeout=30)
            # Without this check an HTTP error page (e.g. a 404 body) would
            # be passed to the callback and reported as a successful load.
            res.raise_for_status()
            res.encoding = 'Big5'
            callback(res.text)
            return True
        # ``ConnectionError`` is the one imported from ``requests`` at the
        # top of the file, not the builtin of the same name.
        except ConnectionError:
            print('HTTP connection failed')
            return False
        except Exception as e:
            print(f'Load failed: {e}')
            return False

### Load Data
includes:
1. *EVQR.word.and.affix.txt*
2. *prefixes.txt*
3. *suffixes.txt*

In [18]:
def evqr_word_and_suffix_callback(content):
    """Parse word/affix text into ``word_dict``.

    The first and last lines of ``content`` are skipped. On every other line
    the hyphens are removed, the line is split on single spaces and the final
    token is dropped; the first remaining token is the word and the rest are
    its affixes.
    """
    data_lines = content.split('\n')[1:-1]
    for raw_line in data_lines:
        tokens = raw_line.replace('-', '').split(' ')
        word, *affix_list = tokens[:-1]
        word_dict[word] = Word(word, affix_list)
# Fill word_dict from the EVQR word/affix list and report success.
if get_file('EVQR.word.and.affix.txt', evqr_word_and_suffix_callback):
    print('Load done')

Load done


In [19]:
def prefix_callback(content):
    """Add the prefixes listed in ``content`` to ``known_prefixes``.

    The first and last lines are skipped. For each remaining line the last
    character is dropped, surrounding whitespace and all hyphens are removed,
    and the line is split on ``', '``; empty items are ignored.
    """
    for row in content.split('\n')[1:-1]:
        cleaned = row[:-1].strip().replace('-', '')
        known_prefixes.update(item for item in cleaned.split(', ') if item)

def suffix_callback(content):
    """Add the suffixes listed in ``content`` to ``known_suffixes``.

    Every line is processed. The last character of the line is dropped,
    surrounding whitespace and all hyphens are removed, and the line is split
    on ``', '``; empty items are ignored.
    """
    for row in content.split('\n'):
        cleaned = row[:-1].strip().replace('-', '')
        known_suffixes.update(item for item in cleaned.split(', ') if item)

# Both files must load. ``and`` short-circuits, so suffixes.txt is not
# requested when prefixes.txt fails.
if get_file('prefixes.txt', prefix_callback) and get_file('suffixes.txt', suffix_callback):
    print('Load prefixes & suffixes done')

Load prefixes & suffixes done


## Labelize Word
### Mapping Label Function
Each label type has its own labeling function, so every label name is mapped to its function.

In [20]:
def evqr_affix(word):
    """Label the letters of ``word.text`` at the positions where affixes start.

    The affixes in ``word.affix_list`` are matched left to right. Each affix
    is searched case-insensitively on the affix side (it is lowercased) from
    the end of the previous match. The letter where a match starts gets label
    1, unless nothing has been consumed yet (``pos == 0``), in which case it
    stays 0.

    An affix that cannot be found from the current position falls back to a
    ``Word.letter_cmp`` comparison against the remaining text.

    Args:
        word: Object exposing ``text`` (str) and ``affix_list`` (iterable of str).

    Returns:
        A list of ``(letter, label)`` tuples, one per letter of ``word.text``.
    """
    text = word.text
    label = [0] * len(text)
    pos = 0
    for affix in word.affix_list:
        # Search once, with the same needle and start offset that decide the
        # branch. Searching a differently-cased affix, or testing membership
        # without the offset, can yield -1 and then write to label[-1].
        needle = affix.lower()
        start = text.find(needle, pos)
        if start != -1:
            label[start] = 1 if pos != 0 else 0
            pos = start + len(needle)
        else:
            k = Word.letter_cmp(text[pos:], affix)
            if k > 1:
                label[pos] = 1 if pos != 0 else 0
                pos += 1

    return list(zip(text, label))

def vowel(word):
    """Pair every letter of ``word.text`` with 1 if it is a lowercase vowel, else 0.

    Only ``a``, ``e``, ``i``, ``o`` and ``u`` count as vowels.
    """
    vowel_letters = frozenset('aeiou')
    labeled = []
    for letter in word.text:
        labeled.append((letter, 1 if letter in vowel_letters else 0))
    return labeled

def prefix_and_suffix(word):
    """Label the letters of ``word.text`` using the known prefixes and suffixes.

    Labels:
        0 -- no mark.
        1 -- a known prefix ends right before this letter.
        2 -- the text after this letter is a known suffix.
        3 -- both of the above apply to this letter.

    NOTE(review): a prefix mark sits on the first letter *after* the prefix,
    while a suffix mark sits on the letter *before* the suffix -- confirm that
    this asymmetry is intended.

    Returns:
        A list of ``(letter, label)`` tuples, one per letter of ``word.text``.
    """
    text = word.text
    size = len(text)
    marks = [0] * size

    # Candidate prefixes: every proper leading slice, longest first.
    for cut in range(size - 1, -1, -1):
        if text[:cut] in known_prefixes:
            marks[cut] = 1

    # Candidate suffixes: everything after position ``idx``.
    for idx in range(size):
        if text[idx + 1:] not in known_suffixes:
            continue
        marks[idx] = 3 if marks[idx] else 2

    return list(zip(text, marks))

# Register each labeling function under its label name so that
# Word.create_label can look it up.
label_func.update({
    EVQR_AFFIX: evqr_affix,
    VOWEL: vowel,
    PREFIX_AND_SUFFIX: prefix_and_suffix,
})
print('Mapping done')

Mapping done


### Create Label for each Word

In [21]:
# Attach all three label sequences to every loaded word and report any label
# name for which create_label returns False.
for word in tqdm(word_dict.values()):
    for label_name, failure_message in (
        (EVQR_AFFIX, 'Failed at label with EVQR.affix'),
        (VOWEL, 'Failed at label with Vowel'),
        (PREFIX_AND_SUFFIX, 'Failed at label with prefix & suffix'),
    ):
        if not word.create_label(label_name):
            print(failure_message)
print('Label done')

100%|██████████| 5237/5237 [00:00<00:00, 64802.26it/s]

Label done





In [22]:
# Spot-check one word per labeling scheme.
print(f'labeled by EVQR.word.and.suffix: ignoble -> {word_dict["ignoble"].label[EVQR_AFFIX]}')
print(f'labeled by prefix & suffix     : demagog -> {word_dict["demagog"].label[PREFIX_AND_SUFFIX]}')
print(f'labeled by position of vowels  : amphibology -> {word_dict["amphibology"].label[VOWEL]}')

labeled by EVQR.word.and.suffix: ignoble -> [('i', 0), ('g', 0), ('n', 1), ('o', 0), ('b', 0), ('l', 0), ('e', 0)]
labeled by prefix & suffix     : demagog -> [('d', 0), ('e', 0), ('m', 1), ('a', 0), ('g', 0), ('o', 0), ('g', 0)]
labeled by position of vowels  : amphibology -> [('a', 1), ('m', 0), ('p', 0), ('h', 0), ('i', 1), ('b', 0), ('o', 1), ('l', 0), ('o', 1), ('g', 0), ('y', 0)]


In [9]:
# Collect the prefix/suffix label sequence of every word, in word_dict order.
prepared_word = [
    word.label[PREFIX_AND_SUFFIX] for word in tqdm(word_dict.values())
]

100%|██████████| 5237/5237 [00:00<00:00, 1312709.62it/s]


In [10]:
def create_char_features(word, i):
    """Build the feature strings for the character at index ``i`` of ``word``.

    Args:
        word: Sequence whose items expose their character at index 0, for
            example ``(char, label)`` pairs or a plain string.
        i: Position of the character to describe.

    Returns:
        A list of feature strings: a bias term, the character itself, and
        combinations with up to two preceding characters. ``"BOS"`` is added
        when there is no preceding character.
    """
    current = word[i][0]
    features = ['bias', 'char=' + current]

    if i < 1:
        # No left neighbour: mark the beginning of the sequence.
        features.append("BOS")
    else:
        previous = word[i - 1][0]
        features += [
            'char-1=' + previous,
            'char-1:0=' + previous + current,
        ]

    if i >= 2:
        before_previous = word[i - 2][0]
        previous = word[i - 1][0]
        features += [
            'char-2=' + before_previous,
            'char-2:0=' + before_previous + previous + current,
            'char-2:-1=' + before_previous + previous,
        ]
    return features


def create_word_features(prepared_word):
    """Return one feature list per position of ``prepared_word``."""
    per_position = []
    for position in range(len(prepared_word)):
        per_position.append(create_char_features(prepared_word, position))
    return per_position


def create_word_labels(prepared_word):
    """Return the second item of every entry in ``prepared_word`` as a string."""
    labels = []
    for entry in prepared_word:
        labels.append(str(entry[1]))
    return labels

In [28]:
# Split the loaded words into CROSS_VALIDATION_FOLD random, disjoint folds.
# The first folds hold ``sample_set_size`` words each; the final fold takes
# whatever is left.
remained_samples = list(word_dict.values())
sample_set_size = ceil(len(remained_samples) / CROSS_VALIDATION_FOLD)
sample_list = []
for _ in range(CROSS_VALIDATION_FOLD - 1):
    remained_samples_len = len(remained_samples)
    # The fold size is rounded up, so with few words the remainder can be
    # smaller than a full fold; sample() would raise ValueError without the cap.
    fold_size = min(sample_set_size, remained_samples_len)
    # Keep the drawn indices in a set so the filter below is a constant-time
    # lookup per word instead of a scan through a list.
    samples_idx = set(sample(range(remained_samples_len), fold_size))
    sample_list.append({remained_samples[idx] for idx in samples_idx})
    remained_samples = [
        entry for j, entry in enumerate(remained_samples) if j not in samples_idx
    ]
sample_list.append(set(remained_samples))

 
['horoscope', 'heteroplasty', 'bicuspid', 'amebocyte', 'aquarelle', 'argentite', 'bidirectional', 'parallax', 'heteroploid', 'aquarium', 'perigee', 'epigenesis', 'bifid', 'autunite', 'cytochrome', 'quadraphonic', 'hepatica', 'bifocal', 'bauxite', 'hepatitis', 'aquavit', 'anamnesis', 'heterosporous', 'asymptote', 'perimenopause', 'quadrifid', 'paramedic', 'bornite', 'anapest', 'cytokinin', 'centimeter', 'quadrilateral', 'taconite', 'tachylite', 'invoke', 'paramilitary', 'proclaim', 'heterotroph', 'homeomorphism', 'calcite', 'irreverent', 'tantalite', 'tektite', 'quadripartite', 'centipede', 'anaphylaxis', 'perinatal', 'retinoscope', 'procrastinate', 'cytopathic', 'anaplasia', 'irrupt', 'binary', 'cassiterite', 'vocabulary', 'atom', 'centurion', 'paraphernalia', 'homograft', 'anastomosis', 'period', 'cinquecento', 'vociferous', 'paraplegia', 'cytotaxonomy', 'isocyclic', 'quadruple', 'vocoder', 'electrohydraulic', 'cytotoxin', 'paraprofessional', 'homologate', 'bipartite', 'anatropous',

In [None]:
# Print the words of every fold, each list preceded by a line holding one space.
for fold in sample_list:
    print(' ', [member.text for member in fold], sep='\n')

In [None]:

def train(folds):
    """Train a CRF model and save it as ``word-segmentation.crfsuite``.

    Args:
        folds: Iterable of training items. Each item is passed to
            ``create_word_features`` and ``create_word_labels``, so it is
            presumably a sequence of ``(char, label)`` pairs -- TODO confirm
            against the caller.
    """
    crf_params = {
        'c1': 1.0,
        'c2': 1e-3,
        'max_iterations': 50,
        'feature.possible_transitions': True
    }

    trainer = pycrfsuite.Trainer(verbose=False)
    for labeled_word in folds:
        features = create_word_features(labeled_word)
        labels = create_word_labels(labeled_word)
        trainer.append(features, labels)

    trainer.set_params(crf_params)
    trainer.train('word-segmentation.crfsuite')


def test(fold_idx):
    """Segment every word of one cross-validation fold with the trained model.

    The model is read from ``word-segmentation.crfsuite``.

    Args:
        fold_idx: Index into ``sample_list`` selecting the fold to tag.

    Returns:
        A list with one string per word of the fold: the word's letters, with
        a space inserted before every letter whose predicted label is >= 1.
        (Previously the function returned after the first word only.)
    """
    tagger = pycrfsuite.Tagger()
    tagger.open('word-segmentation.crfsuite')
    segmented = []
    try:
        for word in sample_list[fold_idx]:
            # Folds are built from Word objects, which have no ``replace``;
            # read their text, but still accept plain strings.
            text = getattr(word, 'text', word)
            w = text.replace(" ", "")
            prediction = tagger.tag(create_word_features(w))
            segmented.append("".join(
                " " + letter if int(p) >= 1 else letter
                for letter, p in zip(w, prediction)
            ))
    finally:
        # Release the model even when tagging fails part-way through.
        tagger.close()
    return segmented

