# MORFEUSZ & CONCRAFT EVALUATION

### IMPORTS, VARIABLES

In [1]:
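# Concraft-pl, the tagger used in this notebook: https://github.com/kawu/concraft-pl
# Its Python bindings (added to sys.path and imported as concraft_pl2 below):
# https://github.com/kawu/concraft-pl/tree/master/bindings/python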

In [2]:
import sys
import pandas as pd
import os
# make the Concraft Python bindings importable; the path is relative to the working
# directory, so the concraft-pl checkout has to live one level above it
sys.path.append('../concraft-pl/bindings/python/')

# never truncate displayed DataFrames: every row of a displayed table is rendered
pd.set_option('display.max_rows', None)

In [3]:
from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# Concraft server object built on the SGJP model file
# NOTE(review): presumably this launches a background tagger process that the Concraft()
# client created later talks to — confirm against the concraft_pl2 bindings
server = Server(model_path='../concraft-pl/model-SGJP.gz')

# CoNLL-U file evaluated in the "EXECUTION - HISTORICAL" section
file = '../data/memoirs_3k_corrected.conllu'

# CoNLL-U file evaluated in the "EXECUTION - MODERN" section (UD Polish-PDB test split)
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'

In [4]:
# morphological analyser; its analyse() output is what Concraft disambiguates later
# NOTE(review): expand_tags=True presumably splits compressed tag alternatives into
# separate interpretations — confirm in the Morfeusz documentation
morfeusz = Morfeusz(expand_tags=True)
# disambiguation client created with default settings
# NOTE(review): presumably connects to the Server instance created earlier — confirm
concraft = Concraft()

In [5]:
from functions import *
from preproc_bert import remove_ranges

### FUNCTIONS AND CLASSES

In [6]:
class MorfeuszPredictions():
    '''Retrieves and stores the Morfeusz+Concraft annotations of a text, aligned with a given tokenisation.

    Every input token receives exactly one annotation. Tokens for which no interpretation with an
    identical surface form could be found (the analyser segmented the text differently) are kept
    and marked with the string 'MISPARSED' in every annotation field.

    Attributes:
        correct_preds (list[list[dict]]): Per sentence, one dict per input token with the keys
            'form', 'lemma', 'xpos' and 'probability'.
        morfeusz: The Morfeusz instance used.
        concraft: The Concraft instance used.
        lemma (list[list[str]]): The lemmas assigned to the tokens, per sentence.
        xpos (list[list[str]]): The XPOS tags assigned to the tokens, per sentence.
        probability (list[list]): The probability stored with each chosen interpretation, per
            sentence; the string 'MISPARSED' for tokens without a match.
        tokens (list[list[str]]): The input tokens.
        sents (list[str]): The input sentences.
    '''

    # placeholder written into every annotation field of a token that could not be matched
    MISPARSED = 'MISPARSED'

    def __init__(self, test_tokens, sents, morfeusz, concraft):
        '''The init method of the class. Retrieves the annotations and stores them appropriately.

        Arguments:
            test_tokens (list[list[str]]): The input tokens, one list per sentence.
            sents (list[str]): The input sentences, parallel to test_tokens.
            morfeusz: The Morfeusz instance used (its analyse() method is called per sentence).
            concraft: The Concraft instance used (its disamb() method is called per analysis).
        '''
        # setting up the attributes
        self.correct_preds = []
        self.morfeusz = morfeusz
        self.concraft = concraft
        self.tokens = test_tokens
        self.sents = sents

        # retrieving annotations
        for i, sent in enumerate(tqdm(self.sents)):
            dag = self.morfeusz.analyse(sent)
            dag_disamb = self.concraft.disamb(dag)
            best_inter = best_interpretation(dag_disamb)
            # matching the analyser's segments against the gold tokens of the same sentence
            self.correct_preds.append(self._align_sentence(self.tokens[i], best_inter))

        # making lists of tags
        self.__make_lists()

    @staticmethod
    def _clean_lemma(raw_lemma):
        '''Drops everything from the first colon of a lemma onwards (e.g. "nie:q" -> "nie").

        A lemma that starts with a colon (including the bare ":" token) is returned unchanged,
        so that the result is never an empty string.
        '''
        head = raw_lemma.split(':')[0]
        return head or raw_lemma

    def _align_sentence(self, sentence_tokens, interpretations):
        '''Assigns to each token the first not-yet-used interpretation with the same surface form.

        Arguments:
            sentence_tokens (list[str]): The tokens of one sentence.
            interpretations (list): The chosen interpretations of the same sentence, in text
                order. Each is indexed as inter[2][0] (form), inter[2][1] (lemma),
                inter[2][2] (tag) and inter[3] (probability).

        Returns:
            list[dict]: One annotation dict per token, in token order.
        '''
        aligned = []
        next_free = 0  # index of the first interpretation that has not been consumed yet

        for token in sentence_tokens:
            match = None
            for idx in range(next_free, len(interpretations)):
                inter = interpretations[idx]
                if inter[2][0] == token:
                    match = {
                        'form': token,
                        'lemma': self._clean_lemma(inter[2][1]),
                        'xpos': inter[2][2],
                        'probability': inter[3],
                    }
                    # the matched interpretation is used up: the next token must start after it,
                    # otherwise a repeated token would be paired with the same interpretation again
                    next_free = idx + 1
                    break

            if match is None:  # no interpretation has this form - the data was misparsed
                match = {
                    'form': token,
                    'lemma': self.MISPARSED,
                    'xpos': self.MISPARSED,
                    'probability': self.MISPARSED,
                }

            aligned.append(match)

        return aligned

    def __make_lists(self):
        '''A method of the class that exposes the annotations as per-sentence lists (lemma, xpos, probability).'''
        self.lemma = [[entry['lemma'] for entry in sent] for sent in self.correct_preds]
        self.xpos = [[entry['xpos'] for entry in sent] for sent in self.correct_preds]
        self.probability = [[entry['probability'] for entry in sent] for sent in self.correct_preds]

### EXECUTION - MODERN

In [7]:
# importing the UD test data
test_tokens_upos, test_sents = extract_conllu_data(test_file, 'upos', sentences=True, combined=True)
test_tokens_xpos, _ = extract_conllu_data(test_file, 'xpos', sentences=True, combined=True)
test_tokens_lemmas, _ = extract_conllu_data(test_file, 'lemma', sentences=True, combined=True)

# transforming it to a tagging-friendly format
test_tokens, test_upos = make_tagger_friendly(test_tokens_upos)
_, test_xpos = make_tagger_friendly(test_tokens_xpos)
_, test_lemmas = make_tagger_friendly(test_tokens_lemmas)

In [8]:
# analyse and disambiguate every UD test sentence, aligning the output with the gold tokens
test_preds = MorfeuszPredictions(test_tokens, test_sents, morfeusz, concraft)

100%|███████████████████████████████████████████████████████████████████████████████| 2215/2215 [00:29<00:00, 75.17it/s]


In [9]:
# XPOS quality on the UD test set: gold tags vs. predicted tags
# ('MISPARSED' placeholders are part of the prediction lists and are passed on as they are)
get_measures(test_xpos, test_preds.xpos)

MEASURES:
Accuracy: 94.43%
Precision (weighted): 95.36%
Recall (weighted): 94.43%
F1 (weighted): 94.56%
Matthew's Correlation Coefficient: 94.20%


In [10]:
# lemma accuracy on the UD test set with the helper's default settings
get_lemma_measures(test_lemmas, test_preds.lemma)

Accuracy: 97.77%


In [11]:
# same comparison with lowercase=True
# NOTE(review): presumably lowercases both sides so capitalisation differences are ignored — confirm
get_lemma_measures(test_lemmas, test_preds.lemma, lowercase=True)

Accuracy: 98.37%


### EXECUTION - HISTORICAL

In [12]:
# importing the test data
tokens_upos, sents = extract_conllu_data(file, 'upos', sentences=True, combined=True)
tokens_xpos, _ = extract_conllu_data(file, 'xpos', sentences=True, combined=True)
tokens_lemmas, _ = extract_conllu_data(file, 'lemma', sentences=True, combined=True)

# transforming it to a tagging-friendly format
tokens, upos = make_tagger_friendly(tokens_upos)
_, xpos = make_tagger_friendly(tokens_xpos)
_, lemmas = make_tagger_friendly(tokens_lemmas)

In [13]:
# analyse and disambiguate every historical sentence, aligning the output with the gold tokens
hist_preds = MorfeuszPredictions(tokens, sents, morfeusz, concraft)

100%|█████████████████████████████████████████████████████████████████████████████████| 115/115 [00:03<00:00, 30.99it/s]


In [14]:
# XPOS quality on the historical data: gold tags vs. predicted tags
# ('MISPARSED' placeholders are part of the prediction lists and are passed on as they are)
get_measures(xpos, hist_preds.xpos)

MEASURES:
Accuracy: 84.26%
Precision (weighted): 86.83%
Recall (weighted): 84.26%
F1 (weighted): 84.64%
Matthew's Correlation Coefficient: 83.76%


In [15]:
# table built by get_full_table from the gold XPOS tags, the predicted XPOS tags, the tokens
# and the probabilities stored with the predictions (historical data)
full_xpos = get_full_table(xpos, hist_preds.xpos, tokens, hist_preds.probability)
# to_excel does not create missing directories, so make sure the target folder exists first
os.makedirs('../data/results', exist_ok=True)
full_xpos.to_excel('../data/results/Morfeusz_XPOS.xlsx')

In [16]:
# table built by get_comparison from the gold XPOS tags, the predicted XPOS tags, the tokens
# and the probabilities stored with the predictions (historical data)
xpos_comparison = get_comparison(xpos, hist_preds.xpos, tokens, hist_preds.probability)
# to_excel does not create missing directories, so make sure the target folder exists first
os.makedirs('../data/mistakes', exist_ok=True)
xpos_comparison.to_excel('../data/mistakes/Morfeusz_XPOS_mistakes.xlsx')

In [17]:
# display the XPOS comparison table; display.max_rows is None, so every row is rendered
xpos_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction,Confidence
0,Dziad,Dziad mój,subst:sg:nom:m1,subst:sg:acc:f,0.5388
1,mój,Dziad mój Melchior,adj:sg:nom:m1:pos,adj:sg:nom:m3:pos,0.7220
2,Melchior,mój Melchior urodzony,subst:sg:nom:m1,subst:sg:nom:m3,0.7171
3,urodzony,Melchior urodzony roku,ppas:sg:nom:m1:perf:aff,adj:sg:nom:m3:pos,0.7244
4,1741,roku 1741 we,adj:sg:gen:m3:pos,dig,1.0000
5,parafii,Godziszewo parafii Rypnin,subst:sg:loc:f,subst:sg:gen:f,0.9997
6,Rypnin,parafii Rypnin syn,subst:sg:nom:m3,subst:sg:nom:m1,1.0000
7,Komornika,Jana Komornika ziemi,subst:sg:gen:m1,subst:sg:acc:m1,0.9969
8,Antoniego,nie Antoniego –,subst:sg:gen:m1,subst:sg:acc:m1,0.9998
9,p,. p .,brev:pun,brev:npun,1.0000


In [18]:
# lemma accuracy on the historical data with the helper's default settings
get_lemma_measures(lemmas, hist_preds.lemma)

Accuracy: 91.01%


In [19]:
# same comparison with lowercase=True
# NOTE(review): presumably lowercases both sides so capitalisation differences are ignored — confirm
get_lemma_measures(lemmas, hist_preds.lemma, lowercase=True)

Accuracy: 94.22%


In [20]:
# to_excel does not create missing directories, so make sure the target folder exists first
os.makedirs('../data/results', exist_ok=True)

# table built by get_full_table from the gold lemmas, the predicted lemmas and the tokens
full_lemmas = get_full_table(lemmas, hist_preds.lemma, tokens)
full_lemmas.to_excel('../data/results/Morfeusz_lemmas.xlsx')

# the same table requested with lowercase=True
full_lemmas_lowercase = get_full_table(lemmas, hist_preds.lemma, tokens, lowercase=True)
full_lemmas_lowercase.to_excel('../data/results/Morfeusz_lowercase_lemmas.xlsx')

In [21]:
# to_excel does not create missing directories, so make sure the target folder exists first
os.makedirs('../data/mistakes', exist_ok=True)

# table built by get_lemma_comparison from the gold lemmas, the predicted lemmas and the tokens
lemma_comparison = get_lemma_comparison(lemmas, hist_preds.lemma, tokens)
lemma_comparison.to_excel('../data/mistakes/Morfeusz_lemma_mistakes.xlsx')

# the same table requested with lowercase=True
lemma_comparison_lowercase = get_lemma_comparison(lemmas, hist_preds.lemma, tokens, lowercase=True)
lemma_comparison_lowercase.to_excel('../data/mistakes/Morfeusz_lowercase_lemma_mistakes.xlsx')

In [22]:
# display the lemma comparison table; display.max_rows is None, so every row is rendered
lemma_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Dziad,Dziad mój,dziad,Dziad
1,Komornika,Jana Komornika ziemi,komornik,Komornik
2,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,dobrzyńska,Dobrrzyńskiej
3,ś,bratem ś .,świętej,ś
4,p,. p .,pamięci,List_świętego_Piotra
5,wdokumentach,później wdokumentach się,wdokument,wdokumentach
6,Pinińskich,z Pinińskich właścicieli,Piniński,Pinińskich
7,Dóbr,właścicieli Dóbr Strużewo,dobra,Dobra
8,adlinencjami,z adlinencjami Puszczanki,adlinencja,adlinencjami
9,Puszczanki,"adlinencjami Puszczanki ,",Puszczanka,puszczanka


In [23]:
# display the lowercase=True lemma comparison table; display.max_rows is None, so every row is rendered
lemma_comparison_lowercase

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Dobrrzyńskiej,ziemi Dobrrzyńskiej (,dobrzyńska,dobrrzyńskiej
1,ś,bratem ś .,świętej,ś
2,p,. p .,pamięci,list_świętego_piotra
3,wdokumentach,później wdokumentach się,wdokument,wdokumentach
4,Pinińskich,z Pinińskich właścicieli,piniński,pinińskich
5,adlinencjami,z adlinencjami Puszczanki,adlinencja,adlinencjami
6,śp,że śp Dziad,świętej pamięci,śp
7,śp,miał śp .,świętej pamięci,świętej_pamięci
8,Głuską,z Głuską z,głuska,głuski
9,Floyrana,i Floyrana .,floyran,floyrana
