In [142]:
import pandas as pd
import xml.etree.ElementTree as ET
import numpy as np

In [43]:
def getTags(xml_path=None, xml_string=None, sentence_tag="sentence"):
    tree = None
    if xml_path:
        tree = ET.parse(xml_path)
    else:
        tree = ET.ElementTree(ET.fromstring(xml_string))
    root = tree.getroot()
    df = pd.DataFrame(columns=["orth", "base", "ctag", "sentence"])
    i = 0
    if sentence_tag == "sentence":
        for sentence in root.iter('sentence'):
            df = pd.concat([df, getSentenceTags(sentence, i)], ignore_index=True)
            i += 1
    elif sentence_tag == "chunk":
        for chunk in root.iter('chunk'):
            if chunk.attrib["type"] == "s":
                df = pd.concat([df, getSentenceTags(chunk, i)], ignore_index=True)
                i += 1
    else:
        raise ValueError("Wrong tag or not implemented analysis")
    return df

def getSentenceTags(sentence, s_idx):
    sentence_df = pd.DataFrame(columns=["orth", "base", "ctag", "sentence"])
    for tok in sentence.iter('tok'):
        orth = tok.find('orth').text
        for lex in tok.iter('lex'):
            if "disamb" in lex.attrib and lex.attrib['disamb']=="1":
                base = lex.find('base').text
                ctag = lex.find('ctag').text
                record = pd.DataFrame(data={"orth": [orth], "base": [base], "ctag": [ctag], "sentence": s_idx})
                sentence_df = pd.concat([sentence_df, record], ignore_index=True)
    return sentence_df

In [44]:
clarin_df = getTags(xml_path="dataset/clarin/clarin-task-c.xml", sentence_tag="sentence")

In [47]:
clarin_df.to_csv("tagers_results/claring.csv")

In [4]:
clarin_df = pd.read_csv("tagers_results/claring.csv")

In [54]:
print(len(set(clarin_df["sentence"])))
clarin_df.iloc[:10]

1663


Unnamed: 0,orth,base,ctag,sentence
0,Moje,moja,subst:pl:nom:f,0
1,niefortunne,niefortunny,adj:sg:nom:n:pos,0
2,pudła,pudło,subst:sg:gen:n,0
3,przygnębiły,przygnębić,praet:pl:m2:perf,0
4,mnie,ja,ppron12:sg:gen:m1:pri:akc,0
5,do,do,prep:gen,0
6,reszty,reszta,subst:sg:gen:f,0
7,.,.,interp,0
8,Zawsze,zawsze,adv:pos,1
9,miałem,miał,subst:sg:inst:m3,1


In [48]:
gold_df = getTags(xml_path="dataset/PolEval/gold-task-c.xml", sentence_tag="chunk")

In [51]:
gold_df.to_csv("tagers_results/gold.csv")

In [38]:
gold_df = pd.read_csv("tagers_results/gold.csv")

In [55]:
print(len(set(gold_df["sentence"])))
gold_df.iloc[:10]

1671


Unnamed: 0,orth,base,ctag,sentence
0,Moje,mój,adj:pl:nom:n:pos,0
1,niefortunne,niefortunny,adj:pl:nom:n:pos,0
2,pudła,pudło,subst:pl:nom:n,0
3,przygnębiły,przygnębić,praet:pl:n:perf,0
4,mnie,ja,ppron12:sg:acc:m1:pri:akc,0
5,do,do,prep:gen,0
6,reszty,reszta,subst:sg:gen:f,0
7,.,.,interp,0
8,Zawsze,zawsze,adv,1
9,miał,mieć,praet:sg:m1:imperf,1


In [128]:
def checkResult(pred_df, true_df, skip_ctag=False):
    correct = []
    incorrect = []
    bad_sentence_len = []
    for s_idx in set(pred_df["sentence"]):
        true_s = true_df[true_df["sentence"] == s_idx]
        pred_s = pred_df[pred_df["sentence"] == s_idx]
        if skip_ctag:
            true_s = true_s.drop(['ctag'], axis=1)
            pred_s = pred_s.drop(['ctag'], axis=1)
        t_s_len = len(true_s)
        p_s_len = len(pred_s)
        cor = []
        incor = []
        shift = 0
        if len(pred_s) != len(true_s):
            bad_sentence_len.append(s_idx)
        for i in range(p_s_len):
            ts_s_i = i + shift
            if ts_s_i < t_s_len and\
                    pred_s.iloc[i].equals(true_s.iloc[ts_s_i]):
                cor.append(pred_s.index.tolist()[i])
            elif ts_s_i + 1 < t_s_len and\
                    pred_s.iloc[i].equals(true_s.iloc[ts_s_i + 1]):
                cor.append(pred_s.index.tolist()[i])
                shift += 1
            elif 0 <= ts_s_i - 1 < t_s_len and\
                    pred_s.iloc[i].equals(true_s.iloc[ts_s_i - 1]):
                cor.append(pred_s.index.tolist()[i])
                shift -= 1
            else:
                incor.append(pred_s.index.tolist()[i])
        correct.append(cor)
        incorrect.append(incor)
    return correct, incorrect, bad_sentence_len

def countNotNone(listOfLists):
    list_len = 0
    for list_ in listOfLists:
        if len(list_) != 0:
            list_len += 1
    return list_len
def printComparisonResult(correct, incorrect, bad_sentence_len):
    print("Correct: " + str(countNotNone(correct)))
    print("Incorrect: " + str(countNotNone(incorrect)))
    print("Bad lenght sentence: " + str(len(bad_sentence_len)))

In [129]:
correct, incorrect, bad_sentence_len = checkResult(clarin_df, gold_df, skip_ctag=True)

In [130]:
printComparisonResult(correct, incorrect, bad_sentence_len)

Correct: 493
Incorrect: 1648
Bad lenght sentence: 1560


In [132]:
incorrect[:10]

[[0],
 [9, 14, 15, 30],
 [47, 50],
 [63, 66, 72],
 [77, 87, 94, 95, 96, 97, 98, 99],
 [101, 102, 103, 104, 105, 106, 107, 108],
 [110, 111, 112, 113, 114, 115, 116, 117],
 [119, 120, 121, 123, 124, 125],
 [126, 127, 128, 129, 130, 131, 132],
 [134, 135, 137, 138, 139, 140, 141, 142, 143, 144, 145]]

In [139]:
def dataframe_difference(df1, df2, which=None):
    """Find rows which are different between two DataFrames."""
    comparison_df = df1.merge(df2,
                              indicator=True,
                              how='outer')
    if which is None:
        diff_df = comparison_df[comparison_df['_merge'] != 'both']
    else:
        diff_df = comparison_df[comparison_df['_merge'] == which]
    return diff_df
idx = 4
#dataframe_difference(clarin_df[clarin_df["sentence"]==idx], gold_df[gold_df["sentence"]==idx])
print(clarin_df[clarin_df["sentence"]==idx])
print(gold_df[gold_df["sentence"]==idx])

        orth      base                              ctag sentence
75         -         -                            interp        4
76     Pokaż   pokazać                  impt:sg:sec:perf        4
77        mi        mi                    subst:sg:nom:n        4
78  strzelbę  strzelba                    subst:sg:acc:f        4
79         -         -                            interp        4
80  poprosił  poprosić                  praet:sg:m1:perf        4
81         ,         ,                            interp        4
82         a         a                              conj        4
83       gdy       gdy                           adv:pos        4
84     podał     podać                  praet:sg:m1:perf        4
85        em       być            aglt:sg:pri:imperf:wok        4
86        mu        on  ppron3:sg:dat:m1:ter:nakc:npraep        4
87    mojego      moje                    subst:sg:gen:n        4
88   mauzera    mauzer                   subst:sg:gen:m2        4
89        

# Wersja bez sentencji

In [262]:
def checkResult(pred_df, true_df, skip_ctag=False, n_neigh=2):
    comparison = pd.DataFrame(columns=["pred_idx", "true_idx"])
    shift = 0
    list_true_idx = -1
    if skip_ctag:
        pred_df = pred_df.drop(['ctag'], axis=1)
        true_df = true_df.drop(['ctag'], axis=1)
    for i in range(len(pred_df)):
        true_i, shift = checkNeighbours(pred_df, true_df, n_neigh, i, shift, list_true_idx)
        if true_i is not None:
            record = pd.DataFrame(data={"pred_idx": [i], "true_idx": [true_i]})
            comparison = pd.concat([comparison, record], ignore_index=True)
            list_true_idx = true_i
    return comparison

def checkNeighbours(pred_df, true_df, n_neigh, i, shift, last_idx):
    t_len = len(true_df)
    t_index = i + shift
    for neighbour in range(-n_neigh, n_neigh + 1):
        if 0 < t_index + neighbour < t_len and\
                pred_df.iloc[i].equals(true_df.iloc[t_index + neighbour]) and\
                t_index + neighbour > last_idx:
            return t_index + neighbour, shift + neighbour
    return None, shift

def countNotNone(array):
    correct = 0
    incorrect = 0
    for cell in array:
        if cell is not None:
            correct += 1
        else:
            incorrect += 1
    print("Correct: " + str(correct))
    print("Incorrect: " + str(incorrect))

In [254]:
clarin_orto = clarin_df[["orth"]]
gold_orto = gold_df[["orth"]]
correct_orto = checkResult(clarin_orto, gold_orto, n_neigh=2)
correct_orto

orth    niefortunne
Name: 1, dtype: object
orth    niefortunne
Name: 1, dtype: object


orth    pudła
Name: 2, dtype: object
orth    pudła
Name: 2, dtype: object


orth    przygnębiły
Name: 3, dtype: object
orth    przygnębiły
Name: 3, dtype: object


orth    mnie
Name: 4, dtype: object
orth    mnie
Name: 4, dtype: object


orth    do
Name: 5, dtype: object
orth    do
Name: 5, dtype: object


orth    reszty
Name: 6, dtype: object
orth    reszty
Name: 6, dtype: object


orth    .
Name: 7, dtype: object
orth    .
Name: 7, dtype: object


orth    Zawsze
Name: 8, dtype: object
orth    Zawsze
Name: 8, dtype: object


orth    pretensje
Name: 10, dtype: object
orth    pretensje
Name: 11, dtype: object


orth    ,
Name: 11, dtype: object
orth    ,
Name: 12, dtype: object


orth    że
Name: 12, dtype: object
orth    że
Name: 13, dtype: object


orth    jestem
Name: 13, dtype: object
orth    jestem
Name: 14, dtype: object


orth    dobrym
Name: 14, dtype: object
orth    dobrym
Name: 15, dtype: o

Unnamed: 0,pred_idx,true_idx
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
...,...,...
2165,27418,27480
2166,27419,27481
2167,27442,27506
2168,27460,27525


In [255]:
print(clarin_df.loc[clarin_df.index.isin(correct_orto["pred_idx"]), ["orth"]])
print(gold_df.loc[gold_df.index.isin(correct_orto["true_idx"]), ["orth"]])

              orth
1      niefortunne
2            pudła
3      przygnębiły
4             mnie
5               do
...            ...
27418            ,
27419           że
27442            .
27460            ,
27469            ,

[2170 rows x 1 columns]
              orth
1      niefortunne
2            pudła
3      przygnębiły
4             mnie
5               do
...            ...
27480            ,
27481           że
27506            .
27525            ,
27534            ,

[2170 rows x 1 columns]


In [263]:
pred_df = clarin_df.loc[clarin_df.index.isin(correct_orto["pred_idx"]), ["orth", "base"]]
true_df = gold_df.loc[gold_df.index.isin(correct_orto["true_idx"]), ["orth", "base"]]

comparison = checkResult(pred_df, true_df, n_neigh=0)
comparison

Unnamed: 0,pred_idx,true_idx
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
...,...,...
2092,2165,2165
2093,2166,2166
2094,2167,2167
2095,2168,2168


In [264]:
pred_ctag_df = clarin_df.loc[clarin_df.index.isin(correct_orto["pred_idx"]), ["orth", "base", "ctag"]]
true_ctag_df = gold_df.loc[gold_df.index.isin(correct_orto["true_idx"]), ["orth", "base", "ctag"]]

comparison_ctag = checkResult(pred_ctag_df, true_ctag_df, n_neigh=0)
comparison_ctag

Unnamed: 0,pred_idx,true_idx
0,4,4
1,5,5
2,6,6
3,9,9
4,10,10
...,...,...
1735,2165,2165
1736,2166,2166
1737,2167,2167
1738,2168,2168


In [236]:
pred_df = pd.DataFrame(columns=["orth", "base", "ctag", "sentence"])
true_df = pd.DataFrame(columns=["orth", "base", "ctag", "sentence"])
pred_idx = correct_orto["pred_idx"]
true_idx = correct_orto["true_idx"]
for i in range(len(pred_idx)):
    pred_df = pd.concat([pred_df, clarin_df.iloc[pred_idx[i]]])
    true_df = pd.concat([true_df, gold_df.iloc[true_idx[i]]])