# Morpheme Finder with Neural Network
We will do morpheme segmentation with neural network in this notebook.
## Import & Constants Definition

In [131]:
from json import loads
from math import ceil
from random import sample
from requests import request, ConnectionError
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neural_network import MLPClassifier
from tqdm.notebook import tqdm

In [132]:
# env variables
try:
    with open('.env.json') as f:
        ENV_VARIABLES = loads(f.read())
        f.close()
except FileNotFoundError:
    ENV_VARIABLES = {'DATA_DIR': 'C:\\'}
DATA_DIR = ENV_VARIABLES['DATA_DIR']
FTP_DIR = 'http://m106.nthu.edu.tw/~s106062341/morpheme_finder_data/'

# file accessor
def get_file(filename: str, callback: classmethod) -> bool:
    try:
        with open(f'{DATA_DIR}{filename}', 'r') as f:
            callback(f.read())
            f.close()
            return True
    except FileNotFoundError:
        try:
            res = request('GET', f'{FTP_DIR}{filename}')
            res.encoding = 'Big5'
            callback(res.text)
            return True
        except ConnectionError:
            print('HTTP connection failed')
            return False
        except Exception as e:
            print(f'Load failed: {e}')
            return False

class Word:
    def __init__(self, text, origin_affix_list, affix_list=None) -> None:
        self.text = text
        self.origin_affix_list = origin_affix_list
        self.affix_list = affix_list if affix_list else origin_affix_list
        self.label = None
        self.create_label()
        
    def create_label(self) -> None:
        text = self.text
        label = [0] * len(text)
        pos = 0
        for affix in self.affix_list[1:]:
            prev_pos = text.find(affix, pos)
            label[prev_pos] = 1
            pos = prev_pos + len(affix)
        self.label = label
        
    def get_breakpoints_index(self) -> list:
        return [position for position, label in enumerate(self.label) if label]

## Load Data & Create its Label
1. CELEX.word.and.root.txt

In [133]:
word_dict = {}
bad_celex = []

def celex_word_and_root_callback(content: str) -> any:
    for line in tqdm(content.split('\r\n')):
        word, *origin_affix_list = line.split(' ')
        if word == ''.join(origin_affix_list):
            word_dict[word] = Word(word, origin_affix_list)
        else:
            bad_celex.append(line)
if get_file('CELEX.word.and.root.txt', celex_word_and_root_callback):
    print(f'Load CELEX.word.and.root.txt done [{len(word_dict.keys())} / {len(bad_celex)}]')

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20066.0), HTML(value='')))


Load CELEX.word.and.root.txt done [11770 / 8296]


## Create Feature for Each Character
Features include:
1. character itself
2. character position
3. is character a vowel
4. distance to the previous breakpoint
5. distance to the next breakpoint
6. how many breakpoints in preceding substring
7. how many breakpoints in succeeding substring
8. word length

In [168]:
vowels = {'a', 'e', 'i', 'o', 'u'}
def get_ascii(char: str) -> float:
    return (ord(char.lower()) - 110) / 26

def create_char_feature(word: Word) -> list:
    MAX = 100
    text = word.text
    bps = word.get_breakpoints_index() + [MAX]
    features = []
    for idx, character in enumerate(text):
        dis2prev_bp = -1
        dis2next_bp = -1
        prec_bp_count = 0
        succ_bp_count = 0
        for i, bp in enumerate(bps):
            if idx < bp:
                if i > 0:
                    dis2prev_bp = (idx - bps[i-1])
                if bp < MAX:
                    dis2next_bp = bp - idx
                prec_bp_count = i
                succ_bp_count = len(bps) - 1 - i
                break
            if idx == bp:
                dis2prev_bp = 0
                dis2next_bp = 0
                prec_bp_count = i + 1
                succ_bp_count = len(bps) - 2 - i
                break
        features.append([
            # 1,
            get_ascii(character),
#             (idx * 2 / (len(text) - 1)) - 1,
            int(character in vowels),
            dis2prev_bp,
            dis2next_bp,
            # prec_bp_count,
            # succ_bp_count,
#             len(text)
        ])
        # features.append([
        #     'bias',
        #     f'char={character}',
        #     f'vowel={character in vowels}',
        #     f'dis2prev_and_next_bp={dis2prev_bp}:{dis2next_bp}',
        #     f'prec_and_succ_bp_count={prec_bp_count}:{succ_bp_count}',
        # ])
    return features

## Create Train & Test Data

In [169]:
data_features = [item for w in word_dict.values() for item in create_char_feature(w)]
data_label = [label for w in word_dict.values() for label in w.label]

label_count = len(data_label)
pos_label = list(filter(lambda x: x[1], map(lambda t: (t[0], t[1]), enumerate(data_label))))
pos_label_count = len(pos_label)
neg_label_count = label_count - pos_label_count
if pos_label_count < neg_label_count:
    ratio = 1.5
    diff = ceil((neg_label_count - pos_label_count) * ratio)
    mod = diff % pos_label_count
    quotient = int((diff - mod) / pos_label_count)
    for _ in range(quotient):
        data_label += [1] * pos_label_count
        data_features += [data_features[i] for (i, _) in pos_label]
    data_label += [1] * mod
    data_features += [data_features[i] for (i, _) in sample(pos_label, mod)]
else:
    print('rev')
print(len(data_features), len(data_label))
print(len(list(filter(lambda x: x, data_label))))
    
train_X, test_X, train_y, test_y = train_test_split(data_features, data_label, test_size=0.5)
print(len(list(filter(lambda x: x, train_y))), len(train_y))
# def k_fold_cv(k):
#     p_x = [], n_x = []
#     for y, x in zip(data_label, data_features):
#         if y:
#             p_x.append(x)
#         else:
#             n_x.append(x)
#     p_ratio = len(n_x) / (len(n_x) + len(p_x))
#     n_ratio = len(p_x) / (len(n_x) + len(p_x))
#     p_sample_range = set(range(len(p_x)))
#     n_sample_range = set(range(len(n_x)))
#     sample_set_size = ceil((len(p_x) + len(n_x)) / k)
#     p_test_sample = set(sample(p_sample_range, ceil(p_ratio * sample_set_size)))
#     n_test_sample = set(sample(n_sample_range, ceil(n_ratio * sample_set_size)))
#     test_X = [p_x[i] for i in p_test_sample] + [n_x[i] for i in n_test_sample]
#     test_y = [1] * len(p_test_sample) + [0] * len(n_test_sample)
#     p_sample_range.difference_update(p_test_sample)
#     n_sample_range.difference_update(n_test_sample)
#     train_X = [p_x[i] for i in p_sample_range] + [n_x[i] for i in n_sample_range]
#     train_y = [1] * len(p_sample_range) + [0] * len(n_sample_range)
#     print(len(list(filter(lambda x: x, train_y))), len(train_y))
#     print(len(list(filter(lambda x: x, test_y))), len(test_y))
#     return train_X, train_y, test_X, test_y
# train_X, train_y, test_X, test_y = k_fold_cv(5)

225953 225953
132773
66251 112976


## Start Train & Test

In [170]:
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(train_X, train_y)

test_y_predicted = clf.predict(test_X)
print(len(list(filter(lambda x: x, test_y_predicted))), len(test_y_predicted))
precision, recall, fbeta_score, _ = precision_recall_fscore_support(test_y, test_y_predicted, average='weighted', zero_division=0)
print(precision, recall, fbeta_score)

scores = cross_val_score(clf, train_X + test_X, train_y + test_y, cv=5)
print(scores)

66522 112977
1.0 1.0 1.0
[1. 1. 1. 1. 1.]


In [171]:
def decompose(text: str, idx: int, prec_bp_count: int, succ_bp_count: int) -> tuple:
    max_accuracy, max_accuracy_idx = 0, 0
    if len(text) == 1 or prec_bp_count > 10 or succ_bp_count > 10:
        if prec_bp_count > 10:
            print(f'prec ct = {prec_bp_count}')
        if succ_bp_count > 10:
            print(f'succ ct = {succ_bp_count}')
        return text,
    for i in range(0, len(text)):
        t_y = [0] * len(text)
        t_y[i] = 1
        p_y = []
        for j, char in zip(range(0, len(text)), text):
            dis2prev_bp = j if (j < i) else j - i
            dis2next_bp = i - j if (j < i) else (len(text) - j)
            prec_bp_ct = prec_bp_count + (0 if j < i else 1)
            succ_bp_ct = succ_bp_count + (1 if j < i else 0)
#             pd = [get_ascii(char), ((idx + j) * 2 / (len(text) - 1)) - 1, int(char in vowels), dis2prev_bp, dis2next_bp, prec_bp_ct, succ_bp_ct, len(text)]
            pd = [get_ascii(char), int(char in vowels), dis2prev_bp, dis2next_bp]
#             pd = [1, get_ascii(char), (idx + j) / len(text), int(char in vowels)]
#             print(pd)
            p_y.append(pd)
#         for y in p_y:
#             print(y)
        p_y = clf.predict(p_y)
        acc = accuracy_score(t_y, p_y)
        print(text, t_y, p_y, acc)
        if max_accuracy < acc:
            max_accuracy = acc
            max_accuracy_idx = i
    prec = decompose(text[:max_accuracy_idx], 0, prec_bp_count, succ_bp_count+1) if max_accuracy_idx > 0 else None
    succ = decompose(text[max_accuracy_idx:], idx+max_accuracy_idx, prec_bp_count+1, succ_bp_count) if max_accuracy_idx > 0 and max_accuracy_idx <= (len(text) - 1) else None
    decomposition = (prec, succ)
    print(decomposition, max_accuracy)
    if prec and succ:
        return decomposition
    elif prec:
        return prec,
    elif succ:
        return succ,
    else:
        return text,
        
# input_text = input()
input_features = decompose('international', 0, 0, 0)


international [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769230769231
international [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0] [0 0 0 0 0 0 0 0 0 0 0 0 0] 0.9230769