In [200]:
import pandas as pd
import numpy as np
from tqdm import tqdm

# Silence pandas' SettingWithCopyWarning for the whole notebook; later cells
# assign into a DataFrame that was obtained by selecting columns from another
# frame.
pd.options.mode.chained_assignment = None

def flatten(t):
    """Return one flat list holding the items of every sub-iterable of ``t``.

    Only a single level of nesting is removed; the order of the items is kept.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

In [5]:
# CSV files that are loaded one by one, in this order. The order matters:
# entry i of COLS below names the tag columns taken from file i.
DATA_FILES = [
    "./data/natasha_pymorphy.csv",
    "./data/rnnmorph.csv",
    "./data/deepPavlov.csv",
]
# Text file read later as a single-column table of "word_tag" entries.
TEST_FILE = './data/test.txt'
# Tag column names grouped per data file. Inside every group the names
# alternate "<prefix>_stim_tag", "<prefix>_word_tag"; a later cell relies on
# that alternation to pick out the stimulus-tag columns.
COLS = [
    ["morph_stim_tag", "morph_word_tag", "nata_stim_tag", "nata_word_tag"],
    ["rnn_stim_tag", "rnn_word_tag"],
    ["DP_stim_tag", "DP_word_tag"],
]

In [18]:
# Read every CSV listed in DATA_FILES, preserving the list order.
datas = [pd.read_csv(path) for path in DATA_FILES]

In [19]:
# Column-wise join on the row index: the first frame is kept whole, the second
# and third contribute only the tag columns listed for them in COLS.
merged_data = pd.concat(
    objs=[
        datas[0],
        datas[1].loc[:, COLS[1]],
        datas[2].loc[:, COLS[2]],
    ],
    axis="columns",
)
merged_data.head()

Unnamed: 0,id,stim,word,phrase,morph_stim_tag,morph_word_tag,nata_stim_tag,nata_word_tag,rnn_stim_tag,rnn_word_tag,DP_stim_tag,DP_word_tag
0,1,абсолютный,-,абсолютный -,A,,ADJ,,ADJ,,ADJ,
1,1,абсолютный,0,абсолютный 0,A,,ADJ,,ADJ,,ADJ,
2,1,абсолютный,100%,абсолютный 100%,A,,ADJ,,ADJ,,ADJ,
3,1,абсолютный,100%-ный,абсолютный 100%-ный,A,,ADJ,,ADJ,,ADJ,
4,1,абсолютный,max,абсолютный max,A,,ADJ,,ADJ,,ADJ,


In [62]:
# merged_data[merged_data[COLS[2][0]]=="X"]

In [191]:
# Universal Dependencies POS tag reference:
# https://universaldependencies.org/u/pos/all.html#al-u-pos/PART

# Translation table from a tag found in the prediction columns to the tag used
# for comparison; the distinct values of this dict become the labels of the
# confusion matrices. The string "nan" is the bucket for tags that have no
# comparable counterpart (it is also the fallback for tags missing from here).
# NOTE(review): "PRAEDIC" maps to "PR" while "ADP" (adposition) maps to "nan"
# - confirm that both mappings are intended.
tag2tag = {
    "A": "A",
    "ADJ": "A",
    "ADP": "nan", # adposition (the original note continued with "2,..." - meaning unclear)
    "ADV": "ADV",
    "AUX": "nan", # auxiliary
    "CCONJ": "CONJ",
    "COMP": "COMP", # open question from the author: is it OK to keep this tag as is?
    "CONJ": "CONJ",
    "DET": "nan", # determiner
    "H": "nan", # meaning of this tag is unknown
    "INTJ": "INTJ",
    "NOUN": "S",
    "NUM": "NUM",
    "PART": "PART",
    "PHRASE": "PHRASE",
    "PR": "PR",
    "PRAEDIC": "PR",
    "PRON": "PRO",
    "PROPN": "S",
    "PUNCT": "nan", # PUNCT: punctuation
    "S": "S",
    "SCONJ": "CONJ",
    "SPRO": "PRO",
    "UNKN": "UNKN",
    "V": "V",
    "VERB": "V",
    "X": "nan", # X: other (author was unsure)
    "nan": "nan",
}

In [192]:
# Column names in COLS alternate stimulus tag / word tag, so the entries at
# even positions of the flattened list are the stimulus-tag columns.
stim_tag_cols = [
    name for position, name in enumerate(flatten(COLS)) if position % 2 == 0
]


def split_str(x):
    """Split ``x`` on the ``"||"`` separator and return the list of parts.

    Values that have no ``split`` attribute (anything that is not string-like)
    are returned unchanged inside a one-element list.
    """
    try:
        parts = x.split("||")
    except AttributeError:
        parts = [x]
    return parts


# Distinct tags per stimulus-tag column; cells holding several alternatives
# separated by "||" are split into their parts first.
all_tags = [
    np.unique(flatten(list(merged_data[col].apply(split_str))))
    for col in stim_tag_cols
]
# Distinct tags over all stimulus-tag columns together.
all_tags_u = np.unique(flatten(all_tags))
# Helper kept from the original cell (prints a dict-literal skeleton):
# for tag in all_tags:
#     print(f'\'{tag}\':\'\',')
all_tags_u

array(['A', 'ADJ', 'ADP', 'ADV', 'AUX', 'COMP', 'CONJ', 'DET', 'H',
       'INTJ', 'NOUN', 'NUM', 'PART', 'PRAEDIC', 'PRON', 'PROPN', 'PUNCT',
       'S', 'SCONJ', 'SPRO', 'V', 'VERB', 'X'], dtype='<U7')

In [193]:
# Every line of the file is one "word_tag" entry; keep the raw entry and add
# the part before the first "_" as the word and the part after it as the tag.
test = pd.read_csv(TEST_FILE, header=None)
test.columns = ['test']
test = test.assign(
    word=test['test'].map(lambda entry: entry.split('_')[0]),
    tag=test['test'].map(lambda entry: entry.split('_')[1]),
)
test

Unnamed: 0,test,word,tag
0,абсолютный_A,абсолютный,A
1,автомат_S,автомат,S
2,авторитет_S,авторитет,S
3,агрессивный_A,агрессивный,A
4,адвокат_S,адвокат,S
...,...,...,...
335,коммунист_S,коммунист,S
336,комната_S,комната,S
337,комфорт_S,комфорт,S
338,конец_S,конец,S


In [183]:
# Stimulus word plus one predicted-tag column per tagger.
# An explicit copy is taken because later cells add and fill "*_check" columns
# on this frame; writing into a bare column selection of merged_data is the
# pattern pandas reports as SettingWithCopyWarning.
checked_dataset = merged_data[['stim'] + stim_tag_cols].copy()
checked_dataset

Unnamed: 0,stim,morph_stim_tag,nata_stim_tag,rnn_stim_tag,DP_stim_tag
0,абсолютный,A,ADJ,ADJ,ADJ
1,абсолютный,A,ADJ,ADJ,ADJ
2,абсолютный,A,ADJ,ADJ,ADJ
3,абсолютный,A,ADJ,ADJ,ADJ
4,абсолютный,A,ADJ,ADJ,ADJ
...,...,...,...,...,...
167884,ярость,S,VERB,NOUN,NOUN
167885,ярость,S,VERB,NOUN,NOUN
167886,ярость,S,VERB,NOUN,NOUN
167887,ярость,S,VERB,NOUN,NOUN


In [184]:
# One result column per stimulus-tag column, named "<column>_check".
checked_cols = [f'{col}_check' for col in stim_tag_cols]

# Create the result columns empty; they are filled in by the scoring loop.
for check_col in checked_cols:
    checked_dataset.loc[:, [check_col]] = None
checked_dataset

Unnamed: 0,stim,morph_stim_tag,nata_stim_tag,rnn_stim_tag,DP_stim_tag,morph_stim_tag_check,nata_stim_tag_check,rnn_stim_tag_check,DP_stim_tag_check
0,абсолютный,A,ADJ,ADJ,ADJ,,,,
1,абсолютный,A,ADJ,ADJ,ADJ,,,,
2,абсолютный,A,ADJ,ADJ,ADJ,,,,
3,абсолютный,A,ADJ,ADJ,ADJ,,,,
4,абсолютный,A,ADJ,ADJ,ADJ,,,,
...,...,...,...,...,...,...,...,...,...
167884,ярость,S,VERB,NOUN,NOUN,,,,
167885,ярость,S,VERB,NOUN,NOUN,,,,
167886,ярость,S,VERB,NOUN,NOUN,,,,
167887,ярость,S,VERB,NOUN,NOUN,,,,


In [199]:
def make_matrix():
    """Build an all-zero integer confusion matrix.

    Rows and columns are both labelled with the sorted distinct values of the
    module-level ``tag2tag`` mapping. A new DataFrame is created on every call.
    """
    labels = np.unique(list(tag2tag.values()))
    size = len(labels)
    zeros = np.zeros((size, size), dtype=int)
    return pd.DataFrame(zeros, index=labels, columns=labels)

In [201]:
def check(pred):
    """Score one predicted tag string against the current gold tags.

    ``pred`` may contain several alternatives separated by ``"||"``. Every
    alternative is translated through ``tag2tag``; tags missing from that
    mapping are counted as ``"nan"``. For each (gold tag, mapped prediction)
    pair the matching confusion-matrix cell is incremented by one.

    This function reads module-level names that the caller has to bind before
    each call:

    * ``tags``   - iterable of gold tags for the word being scored
    * ``matrix`` - DataFrame with gold tags as row labels and mapped
      predictions as column labels; it is updated in place
    * ``tag2tag`` - the tag translation dict

    Returns True when at least one mapped prediction is among the gold tags.
    Raises KeyError when a gold tag or mapped prediction is not a label of
    ``matrix``.
    """
    predicted_tags = {tag2tag.get(raw_tag, "nan") for raw_tag in pred.split("||")}
    true_tags = set(tags)
    for t in true_tags:
        for p in predicted_tags:
            # Address the cell in a single indexing step. The chained form
            # ``matrix.loc[t][p] += 1`` may increment a temporary row instead
            # of ``matrix`` itself, in which case the count is silently lost.
            matrix.at[t, p] += 1
    return len(predicted_tags & true_tags) > 0


# One fresh confusion matrix per stimulus-tag column, keyed by column name.
matrixes = {col: make_matrix() for col in stim_tag_cols}

# check() reads the module-level names `tags` and `matrix`, so both are
# rebound here before every .apply(check) call.
for word, tag in tqdm(zip(test["word"], test["tag"]), total=len(test)):
    # A test entry may list several acceptable tags separated by "||".
    tags = tag.split("||")
    # The row mask depends only on the word, so it is built once per test
    # entry instead of once per tag column.
    row_flag = checked_dataset.stim == word
    for col in stim_tag_cols:
        matrix = matrixes[col]
        checked_dataset.loc[row_flag, [f"{col}_check"]] = checked_dataset.loc[
            row_flag, col
        ].apply(check)

checked_dataset

340it [00:30, 11.25it/s]


Unnamed: 0,stim,morph_stim_tag,nata_stim_tag,rnn_stim_tag,DP_stim_tag,morph_stim_tag_check,nata_stim_tag_check,rnn_stim_tag_check,DP_stim_tag_check
0,абсолютный,A,ADJ,ADJ,ADJ,True,True,True,True
1,абсолютный,A,ADJ,ADJ,ADJ,True,True,True,True
2,абсолютный,A,ADJ,ADJ,ADJ,True,True,True,True
3,абсолютный,A,ADJ,ADJ,ADJ,True,True,True,True
4,абсолютный,A,ADJ,ADJ,ADJ,True,True,True,True
...,...,...,...,...,...,...,...,...,...
167884,ярость,S,VERB,NOUN,NOUN,,,,
167885,ярость,S,VERB,NOUN,NOUN,,,,
167886,ярость,S,VERB,NOUN,NOUN,,,,
167887,ярость,S,VERB,NOUN,NOUN,,,,


In [202]:
# Keep only the rows that received a check result, then report, per check
# column, the share of those rows where the check is True.
has_result = checked_dataset.DP_stim_tag_check.notna()
check_data = checked_dataset[has_result]
n = len(check_data)
check_data[checked_cols].sum(axis=0) / n

morph_stim_tag_check         1.0
nata_stim_tag_check     0.934627
rnn_stim_tag_check       0.98406
DP_stim_tag_check       0.981462
dtype: object

In [205]:
# Show each confusion matrix without the rows and columns that are all zero.
for col in stim_tag_cols:
    print(col)
    m = matrixes[col]
    nonzero = m != 0
    # Boolean masks must go through .loc to filter rows and columns together.
    display(m.loc[nonzero.any(axis=1), nonzero.any(axis=0)])

morph_stim_tag


Unnamed: 0,A,ADV,CONJ,INTJ,PART,PR,PRO,S,V
A,12203,339,0,0,0,0,0,825,120
ADV,1124,1597,0,0,0,0,0,226,0
CONJ,329,329,131,0,131,0,0,131,0
INTJ,0,0,0,172,0,0,0,0,172
PRO,0,0,0,0,0,0,185,0,0
S,1333,498,320,172,131,223,0,33197,1968
V,0,0,189,172,0,0,0,571,11603


nata_stim_tag


Unnamed: 0,A,ADV,PART,PRO,S,V,nan
A,10995,1,15,66,514,265,347
ADV,26,1127,15,66,244,2,117
CONJ,0,0,15,66,131,0,117
INTJ,0,0,0,0,0,169,3
PRO,0,0,0,185,0,0,0
S,1986,8,3,132,30326,929,126
V,203,154,1,0,484,10608,153


rnn_stim_tag


Unnamed: 0,A,ADV,PART,PRO,S,V,nan
A,11623,0,0,103,16,0,461
ADV,45,1142,1,103,211,0,95
CONJ,11,0,0,103,120,0,95
INTJ,0,0,0,0,0,172,0
PRO,0,0,0,185,0,0,0
S,931,0,0,0,31723,670,186
V,0,0,0,0,219,11384,0


DP_stim_tag


Unnamed: 0,A,ADV,INTJ,PART,PRO,S,V,nan
A,11652,0,0,9,116,123,0,303
ADV,22,1191,0,9,116,186,0,73
CONJ,1,0,0,9,116,130,0,73
INTJ,0,0,0,0,0,0,171,1
PRO,0,0,0,0,185,0,0,0
S,911,37,1,0,1,31661,736,163
V,0,0,0,0,0,299,11220,84
