In [14]:
from io import open
from sklearn.metrics import accuracy_score
from conllu import parse_incr
import random

In [15]:
SYNTAG = 'ud-treebanks-v2.5/UD_Russian-SynTagRus/ru_syntagrus-ud-train.conllu'

In [36]:
from enum import IntEnum, auto

class POS(IntEnum):
    START = 0
    ADJ = auto()
    ADP = auto()
    ADV = auto()
    AUX = auto()
    CCONJ = auto()
    DET = auto()
    INTJ = auto()
    NOUN = auto()
    NUM = auto()
    PART = auto()
    PRON = auto()
    PROPN = auto()
    PUNCT = auto()
    SCONJ = auto()
    SYM = auto()
    VERB = auto()
    X = auto()
    END = auto()

items = POS
items

<enum 'POS'>

In [65]:
def wrap_tags(tags):
    return [POS.START] + tags + [POS.END]

def parse_tags(tt):
    tags = []
    for i, tag in enumerate(tt):
        if tag in POS.__dict__.keys():
            tags.append(POS.__dict__[tag])
        else:
            tags.append(POS.X)

    return wrap_tags(tags)

In [66]:
def add_tag_token_helper(token, tokens, tags):
    tokens.append(token['lemma'].lower())
    tags.append(token['upostag'])
    
def parse_sent(sent):
    tokens = []
    tags = []
    for token in sent:
        if token['lemma'] != '_' and token['upostag'] != '_':
            add_tag_token_helper(token, tokens, tags)

    return tokens, parse_tags(tags)

In [67]:
def read_dataset(data_path):
    data_file = open(data_path, "r", encoding="utf-8")
    ss = parse_incr(data_file)
    return list(ss)

In [68]:
read_dataset(SYNTAG)

[TokenList<Анкета, .>,
 TokenList<Начальник, областного, управления, связи, Семен, Еремеевич, был, человек, простой, ,, приходил, на, работу, всегда, вовремя, ,, здоровался, с, секретаршей, за, руку, и, иногда, даже, писал, в, стенгазету, заметки, под, псевдонимом, ", Муха, ", .>,
 TokenList<В, приемной, его, с, утра, ожидали, посетители, ,, -, кое-кто, с, важными, делами, ,, а, кое-кто, и, с, такими, ,, которые, легко, можно, было, решить, в, нижестоящих, инстанциях, ,, не, затрудняя, Семена, Еремеевича, .>,
 TokenList<Однако, стиль, работы, Семена, Еремеевича, заключался, в, том, ,, чтобы, принимать, всех, желающих, и, лично, вникать, в, дело, .>,
 TokenList<Приемная, была, обставлена, просто, ,, но, по-деловому, .>,
 TokenList<У, двери, стоял, стол, секретарши, ,, на, столе, -, пишущая, машинка, с, широкой, кареткой, .>,
 TokenList<В, углу, висел, репродуктор, и, играло, радио, для, развлечения, ожидающих, и, еще, для, того, ,, чтобы, заглушать, голос, начальника, ,, доносившийся, и

In [69]:
def get_data(data_path):
    sens = read_dataset(data_path)
    res = [parse_sent(sent.tokens) for sent in sens]
    return list(zip([i[0] for i in res], [i[1] for i in res]))

def compute_initial_observer(token_lists, tag_lists):
    c = np.zeros(len(items))
    for tag_list in tag_lists:
        for t in tag_list:
            c[t] += 1

    ob = {}
    for id, tok in enumerate(token_lists):
        for idex, token in enumerate(tok):
            if token not in ob:
                ob[token] = np.zeros(len(items))
            ob[token][tag_lists[id][idex + 1]] += (1 / c[tag_lists[id][idex + 1]])

    return ob

In [70]:
train = get_data(SYNTAG)
for sent, tags in train[:10]:
    print(sent)
    print(tags)
    print()

['анкета', '.']
[<POS.START: 0>, <POS.NOUN: 8>, <POS.PUNCT: 13>, <POS.END: 18>]

['начальник', 'областной', 'управление', 'связь', 'семен', 'еремеевич', 'быть', 'человек', 'простой', ',', 'приходить', 'на', 'работа', 'всегда', 'вовремя', ',', 'здороваться', 'с', 'секретарша', 'за', 'рука', 'и', 'иногда', 'даже', 'писать', 'в', 'стенгазета', 'заметка', 'под', 'псевдоним', '"', 'муха', '"', '.']
[<POS.START: 0>, <POS.NOUN: 8>, <POS.ADJ: 1>, <POS.NOUN: 8>, <POS.NOUN: 8>, <POS.PROPN: 12>, <POS.PROPN: 12>, <POS.AUX: 4>, <POS.NOUN: 8>, <POS.ADJ: 1>, <POS.PUNCT: 13>, <POS.VERB: 16>, <POS.ADP: 2>, <POS.NOUN: 8>, <POS.ADV: 3>, <POS.ADV: 3>, <POS.PUNCT: 13>, <POS.VERB: 16>, <POS.ADP: 2>, <POS.NOUN: 8>, <POS.ADP: 2>, <POS.NOUN: 8>, <POS.CCONJ: 5>, <POS.ADV: 3>, <POS.PART: 10>, <POS.VERB: 16>, <POS.ADP: 2>, <POS.NOUN: 8>, <POS.NOUN: 8>, <POS.ADP: 2>, <POS.NOUN: 8>, <POS.PUNCT: 13>, <POS.NOUN: 8>, <POS.PUNCT: 13>, <POS.PUNCT: 13>, <POS.END: 18>]

['в', 'приемная', 'он', 'с', 'утро', 'ожидать', 'пос

In [102]:
def helper_compute_normalized(transitions):
    s = np.sum(transitions, axis=1)
    s[POS.END] = np.sum(transitions[:, POS.END])
    dx = np.zeros((len(items), len(items)))
    for i in range(len(transitions)):
        if s[i] > 0:
            dx[i][:] = np.divide(transitions[i][:], s[i])
    return dx


def learn_transitions(tag_lists):
    """ [START, tag, tag, ..., END] """
    transitions = np.zeros((len(POS), len(POS)))
    for unit_list in tag_lists:
        for i in range(len(unit_list) - 1):
            transitions[unit_list[i], unit_list[i + 1]] += 1

    s = np.sum(transitions, axis=1)
    s[POS.END] = np.sum(transitions[:, POS.END])

    div = np.zeros((len(POS), len(POS)))
    for i in range(len(transitions)):
        if s[i] > 0:
            div[i][:] = np.divide(transitions[i][:], s[i])

    return div


def train_test_split(data, train: float, shuffle=False):
    if shuffle:
        random.shuffle(data)
    len_ = len(data)
    idx = round(len_ * train)
    train_ = data[:idx]
    test_ = data[idx:]
    return train_, test_


In [103]:
def train_markov(dataset):
    token_lists, tag_lists = zip(*dataset)
    tss = learn_transitions(tag_lists)
    oss = compute_initial_observer(token_lists, tag_lists)
    return tss, oss

def helper_compute_probs(trans, T, tokens, M, B, unknown_tokens, obs):
    for t in range(2, T - 1):
        for s in range(1, len(items) - 1):
            state_probs = M[:, t - 1] * trans[:, s]
            max_prob_state = int(np.argmax(state_probs))
            M[s, t] = state_probs[max_prob_state] * obs[tokens[t - 1]][s]
            B[s, t - 1] = max_prob_state

    state_probs = M[:, T - 2] * trans[:, len(items) - 1]
    max_prob_state = int(np.argmax(state_probs))
    M[len(items) - 1, T - 1] = state_probs[max_prob_state]
    B[len(items ) - 1, T - 2] = max_prob_state

    T = B.shape[1] + 1
    path = np.zeros(T, dtype=np.int)
    path[T - 1] = POS.END
    for t in range(T - 1, 0, -1):
        path[t - 1] = B[path[t], t - 1]
    return path, unknown_tokens

def report_count(sens_acc, tot_acc, unn_acc):
    return [
        'sentence accuracy - {}'.format(sens_acc),
        'token    accuracy - {}'.format(tot_acc),
        'unknown  tokens   - {}'.format(unn_acc),
    ]

def viterbi(tokens, trans, obs):
    unknown_tokens = []
    T = len(tokens) + 2
    X = np.zeros((len(items), T))
    B = np.zeros((len(items), T - 1), dtype=np.int)
    X[0, 0] = 1
    for t in tokens:
        if t not in obs:
            obs[t] = np.ones(len(items))
            unknown_tokens.append(t)
    for s in range(1, len(items)):
        X[s, 1] = trans[0, s] * obs[tokens[0]][s]

    return helper_compute_probs(trans, T, tokens, X, B, unknown_tokens, obs)


def evaluate(data, trans, obs):
    ground_truth, predictions = [], []

    corr = 0
    unknown_tokens = []
    all_tokens_count = 0

    for idx, item in enumerate(data):
        tokens = item[0]
        tags = item[1]

        all_tokens_count += len(tokens)

        predicted_tags, ut = viterbi(tokens, trans, obs)
        unknown_tokens.extend(ut)

        predictions.extend(predicted_tags)
        ground_truth.extend(tags)

        correspond = np.asarray([1 if p == t else 0 for p, t in zip(predicted_tags, tags)])
        corr += np.prod(correspond)

    sens_acc = corr / len(data)
    tot_acc = accuracy_score(ground_truth, predictions)
    unn_acc = len(unknown_tokens) / all_tokens_count
    return report_count(sens_acc, tot_acc, unn_acc)


In [104]:
def run_train_test_eval(train_dataset, fraction=0.8):

    train_data, test_data = train_test_split(train_dataset, fraction, shuffle=True)
    tts, obs = train_markov(train_data)

    results = evaluate(test, tts, obs)
    print('Model Results:')
    print("\n".join(results))
    return tts, obs, results


In [105]:
dataset = get_data(SYNTAG)

In [106]:
train, test = train_test_split(dataset, 0.8, shuffle=True)

In [107]:
results = run_train_test_eval(dataset, 0.8)

Model Results:
sentence accuracy - 0.7115640684215917
token    accuracy - 0.9785820857128158
unknown  tokens   - 0.004260941181853538


In [108]:
import numpy as np
for train_fraction in np.arange(0.1, 1, 0.1):
    print('train fraction {}'.format(train_fraction))
    run_train_test_eval(dataset, train_fraction)
    print('=====')



train fraction 0.1
Model Results:
sentence accuracy - 0.4326538973676124
token    accuracy - 0.9446005041930339
unknown  tokens   - 0.05145157969389856
=====
train fraction 0.2
Model Results:
sentence accuracy - 0.5257605244289665
token    accuracy - 0.958723053969234
unknown  tokens   - 0.036438196335018644
=====
train fraction 0.30000000000000004
Model Results:
sentence accuracy - 0.5889583119942641
token    accuracy - 0.96573030817513
unknown  tokens   - 0.025983162133101506
=====
train fraction 0.4
Model Results:
sentence accuracy - 0.6232715353887125
token    accuracy - 0.970298914441529
unknown  tokens   - 0.019823385417858205
=====
train fraction 0.5
Model Results:
sentence accuracy - 0.6584041790433268
token    accuracy - 0.9734475484899933
unknown  tokens   - 0.014212669579739653
=====
train fraction 0.6
Model Results:
sentence accuracy - 0.6783775478848715
token    accuracy - 0.9756495343931677
unknown  tokens   - 0.010334927135046098
=====
train fraction 0.7000000000000001
M