# MORFEUSZ & CONCRAFT EVALUATION

### IMPORTS, VARIABLES

In [1]:
# https://github.com/kawu/concraft-pl
# https://github.com/kawu/concraft-pl/tree/master/bindings/python

In [2]:
import sys
import pandas as pd
import os
sys.path.append('../concraft-pl/bindings/python/')

In [3]:
from morfeusz2 import Morfeusz
from concraft_pl2 import Concraft, Server

# Concraft server handle, pointed at the SGJP model archive.
# NOTE(review): presumably this launches the concraft-pl backend process - confirm against the Python bindings.
server = Server(model_path='../concraft-pl/model-SGJP.gz')

# Historical evaluation set (read in the "EXECUTION - HISTORICAL" section).
file = '../data/memoirs_3k_corrected.conllu'

# Modern evaluation set: the UD Polish-PDB test split (read in the "EXECUTION - MODERN" section).
test_file = '../data/ud-treebanks/UD_Polish-PDB/pl_pdb-ud-test.conllu'

In [4]:
# Morphological analyser instance; both objects below are handed to ConlluFormatter later on.
# NOTE(review): expand_tags=True presumably expands compact tag alternatives into individual tags - confirm in the Morfeusz docs.
morfeusz = Morfeusz(expand_tags=True)
# Tagger client. NOTE(review): presumably connects to the `server` created in the previous cell with default settings - confirm.
concraft = Concraft()

In [5]:
from functions import *
from preproc_bert import remove_ranges

### FUNCTIONS AND CLASSES

In [6]:
class OriginalAnnotationsFromConllu:
    '''A class intended to process and store the tokens and their respective annotations from the original text. Unlike in
    preprocessing with Morfeusz, this time we only need to store some kinds of the annotation so that the ConlluFormatter class
    can work with it.

    Attributes:
        tokens (list[str]): A flat list of all the tokens (without annotation) in the original data, in sentence order.
        gold_standard (list[str]): A flat list of all the original annotations (without tokens), aligned with `tokens`.
        sentences (list[str]): A list of the original sentences (either truly original, if available, or reconstructed).
        simple_sentences_tokenized (list[list[str]]): A list of lists representing the tokenized, unannotated sentences, with no regard
        as to whether the tokens were written together or not.
        simple_gold_standard_tokenized (list[list[str]]): A list of lists representing the gold standard annotations per sentence,
        with no regard as to whether the tokens were written together or not.
    '''
    def __init__(self, sentences: list, simple_sentences_tokenized: list, simple_gold_standard_tokenized: list):
        '''The __init__ method of the class.
        Stores the per-sentence inputs and builds the flat token and tag lists from them.

        Args:
            sentences (list[str]): A list of the original sentences (either truly original, if available, or reconstructed).
            simple_sentences_tokenized (list[list[str]]): A list of lists representing the tokenized, unannotated sentences, with no regard
            as to whether the tokens were written together or not.
            simple_gold_standard_tokenized (list[list[str]]): A list of lists representing the gold standard annotations per sentence,
            with no regard as to whether the tokens were written together or not.
        '''
        self.sentences = sentences
        self.simple_sentences_tokenized = simple_sentences_tokenized
        self.simple_gold_standard_tokenized = simple_gold_standard_tokenized

        # Flatten the nested per-sentence lists; sentence order and in-sentence order are preserved.
        self.tokens = [x for sentence in self.simple_sentences_tokenized for x in sentence]
        self.gold_standard = [x for sentence in self.simple_gold_standard_tokenized for x in sentence]

    def __len__(self):
        '''The __len__ magic method of the class.

        Returns:
            The total number of tokens across all sentences.
        '''
        return len(self.tokens)

    def __getitem__(self, index: int):
        '''The __getitem__ magic method of the class.

        Args:
            index (int): The position of the desired element in the flat token list.

        Returns:
            A string of the form '<token>_<annotation>' for the element at `index`.
        '''
        token = self.tokens[index]
        annotation = self.gold_standard[index]
        item = '_'.join([token, annotation])

        return item

    def frequencies(self):
        '''A method of the class intended for displaying raw and relative frequencies of word classes in the annotation.

        Returns:
            A dataframe indexed by 'POS' with the columns 'raw' (number of occurrences) and 'relative'
            (share of all annotations), sorted by 'relative' in descending order. Tags with equal
            frequency keep the order of their first occurrence. Empty when there are no annotations.
        '''
        # Local import keeps the notebook's import cell untouched.
        from collections import Counter

        # One pass over the annotations instead of one list.count() scan per distinct tag.
        counts = Counter(self.gold_standard)
        total = len(self.gold_standard)

        # No division happens when `counts` is empty, so empty input yields an empty frame.
        freqs = [[tag, raw, raw / total] for tag, raw in counts.items()]

        # mergesort is stable, which makes the order of tied rows reproducible between runs.
        freq_pd = (
            pd.DataFrame(freqs, columns=['POS', 'raw', 'relative'])
            .sort_values('relative', ascending=False, kind='mergesort')
            .set_index('POS')
        )

        return freq_pd

In [7]:
def make_tagger_friendly(tokens_tags):
    '''Applies remove_ranges and then split_tags_and_tokens to every sentence of a nested list.

    Arguments:
        tokens_tags (list[list]): A list of lists representing sentences with annotations.

    Returns:
        Two separate lists of lists: the tokens per sentence and the annotations per sentence,
        in the same order as the input sentences.'''
    all_tokens, all_tags = [], []
    # map() is lazy, so each sentence is cleaned and split right before its results are stored.
    cleaned_sentences = map(remove_ranges, tokens_tags)
    for sentence_tokens, sentence_tags in map(split_tags_and_tokens, cleaned_sentences):
        all_tokens.append(sentence_tokens)
        all_tags.append(sentence_tags)

    return all_tokens, all_tags

In [8]:
def reject_misparsed(standards, preds, tokens):
    '''Keeps only the sentences whose prediction has as many elements as its gold standard.

    A message with the sentence index is printed for every sentence that is dropped.

    Arguments:
        standards (list[list]): Gold-standard annotations, one list per sentence.
        preds (list[list]): Predicted annotations, one list per sentence, aligned with standards.
        tokens (list[list]): Tokens, one list per sentence, aligned with standards.

    Returns:
        Three lists (gold standards, predictions, tokens) holding only the retained sentences.'''
    kept_standards, kept_preds, kept_tokens = [], [], []

    for position, gold in enumerate(standards):
        predicted = preds[position]

        # A length mismatch means gold and predicted items cannot be paired one-to-one.
        if len(gold) != len(predicted):
            print(f'Deleted entry number {position} due to a parsing error.')
            continue

        kept_standards.append(gold)
        kept_preds.append(predicted)
        kept_tokens.append(tokens[position])

    return kept_standards, kept_preds, kept_tokens
            

### EXECUTION - MODERN

In [9]:
# importing the UD test data: one extraction per annotation layer (UPOS, XPOS, lemma).
# Only the first call keeps the second return value (`sents`); the other two discard it.
test_tokens_upos, sents = extract_conllu_data(test_file, 'upos', sentences=True, combined=True)
test_tokens_xpos, _ = extract_conllu_data(test_file, 'xpos', sentences=True, combined=True)
test_tokens_lemmas, _ = extract_conllu_data(test_file, 'lemma', sentences=True, combined=True)

# transforming it to a tagging-friendly format: per-sentence token lists plus the matching
# per-sentence annotation lists. The token lists are kept from the UPOS pass only.
test_tokens, test_upos = make_tagger_friendly(test_tokens_upos)
_, test_xpos = make_tagger_friendly(test_tokens_xpos)
_, test_lemmas = make_tagger_friendly(test_tokens_lemmas)

In [10]:
# Bundle sentences, tokens and UPOS tags; the UPOS lists are passed as the object's gold standard.
test_original = OriginalAnnotationsFromConllu(sents, test_tokens, test_upos)

In [11]:
# Process the modern test data with Morfeusz and Concraft (the saved progress bar shows 2215 iterations).
# NOTE(review): ConlluFormatter is defined outside this notebook - confirm what it computes and stores.
test_anns = ConlluFormatter(test_original, morfeusz, concraft)

100%|███████████████████████████████████████████████████████████████████████████████| 2215/2215 [00:23<00:00, 94.54it/s]


In [12]:
# Fetch the XPOS layer from the formatter.
# NOTE(review): the first value is used below as per-sentence prediction lists; the exact return shapes should be confirmed.
test_xpos_preds, test_forms = test_anns.retrieve_anns('xpos')

In [13]:
# Drop sentences whose predicted tag count differs from the gold tag count.
# NOTE(review): test_xpos and test_xpos_preds are rebound to the filtered lists, so re-running this cell
# pairs already-filtered lists with the unfiltered test_tokens; the filtered tokens are discarded (`_`).
test_xpos, test_xpos_preds, _ = reject_misparsed(test_xpos, test_xpos_preds, test_tokens)

Deleted entry number 251 due to a parsing error.
Deleted entry number 619 due to a parsing error.
Deleted entry number 646 due to a parsing error.
Deleted entry number 688 due to a parsing error.
Deleted entry number 693 due to a parsing error.
Deleted entry number 804 due to a parsing error.
Deleted entry number 906 due to a parsing error.
Deleted entry number 968 due to a parsing error.
Deleted entry number 1049 due to a parsing error.
Deleted entry number 1127 due to a parsing error.
Deleted entry number 1186 due to a parsing error.
Deleted entry number 1196 due to a parsing error.
Deleted entry number 1296 due to a parsing error.
Deleted entry number 1403 due to a parsing error.
Deleted entry number 1461 due to a parsing error.
Deleted entry number 1495 due to a parsing error.
Deleted entry number 1496 due to a parsing error.
Deleted entry number 1634 due to a parsing error.
Deleted entry number 1681 due to a parsing error.
Deleted entry number 1698 due to a parsing error.
Deleted 

In [14]:
# Flatten the per-sentence lists so gold and predicted XPOS tags are scored item by item.
get_measures([x for sentence in test_xpos for x in sentence], [x for sentence in test_xpos_preds for x in sentence])

MEASURES:
Accuracy: 94.58%
Precision (weighted): 95.39%
Recall (weighted): 94.58%
F1 (weighted): 94.66%
Matthew's Correlation Coefficient: 94.37%


In [None]:
# Fetch the lemma layer from the formatter (rebinds test_forms as well).
# NOTE(review): this cell is saved without an execution count (In [None]) although the following cell
# needs test_lemma_preds - re-run the notebook top to bottom before sharing it.
test_lemma_preds, test_forms = test_anns.retrieve_anns('lemma')

In [15]:
# Same length-based filtering as for XPOS, applied to the lemma layer; test_tokens is still the full,
# unfiltered list here. NOTE(review): the names are rebound, so the cell is not idempotent.
test_lemmas, test_lemma_preds, _ = reject_misparsed(test_lemmas, test_lemma_preds, test_tokens)

Deleted entry number 251 due to a parsing error.
Deleted entry number 619 due to a parsing error.
Deleted entry number 646 due to a parsing error.
Deleted entry number 688 due to a parsing error.
Deleted entry number 693 due to a parsing error.
Deleted entry number 804 due to a parsing error.
Deleted entry number 881 due to a parsing error.
Deleted entry number 906 due to a parsing error.
Deleted entry number 968 due to a parsing error.
Deleted entry number 1049 due to a parsing error.
Deleted entry number 1127 due to a parsing error.
Deleted entry number 1186 due to a parsing error.
Deleted entry number 1196 due to a parsing error.
Deleted entry number 1296 due to a parsing error.
Deleted entry number 1403 due to a parsing error.
Deleted entry number 1461 due to a parsing error.
Deleted entry number 1495 due to a parsing error.
Deleted entry number 1496 due to a parsing error.
Deleted entry number 1634 due to a parsing error.
Deleted entry number 1681 due to a parsing error.
Deleted e

In [16]:
# Lemma score on the retained sentences; unlike get_measures above, the nested per-sentence lists are passed as they are.
get_lemma_measures(test_lemmas, test_lemma_preds)

Accuracy: 98.23%


### EXECUTION - HISTORICAL

In [17]:
# importing the historical data (`file`): one extraction per annotation layer (UPOS, XPOS, lemma).
# NOTE(review): `sents` is rebound here, replacing the sentences loaded in the modern section.
tokens_upos, sents = extract_conllu_data(file, 'upos', sentences=True, combined=True)
tokens_xpos, _ = extract_conllu_data(file, 'xpos', sentences=True, combined=True)
tokens_lemmas, _ = extract_conllu_data(file, 'lemma', sentences=True, combined=True)

# transforming it to a tagging-friendly format: per-sentence token lists plus the matching
# per-sentence annotation lists. The token lists are kept from the UPOS pass only.
tokens, upos = make_tagger_friendly(tokens_upos)
_, xpos = make_tagger_friendly(tokens_xpos)
_, lemmas = make_tagger_friendly(tokens_lemmas)

In [18]:
# Bundle sentences, tokens and UPOS tags; the UPOS lists are passed as the object's gold standard.
original = OriginalAnnotationsFromConllu(sents, tokens, upos)

In [19]:
# Process the historical data with Morfeusz and Concraft (the saved progress bar shows 115 iterations).
# NOTE(review): ConlluFormatter is defined outside this notebook - confirm what it computes and stores.
anns = ConlluFormatter(original, morfeusz, concraft)

100%|█████████████████████████████████████████████████████████████████████████████████| 115/115 [00:03<00:00, 38.32it/s]


In [20]:
# Fetch the XPOS layer from the formatter.
# NOTE(review): the first value is used below as per-sentence prediction lists; the exact return shapes should be confirmed.
xpos_preds, forms = anns.retrieve_anns('xpos')

In [21]:
# Drop sentences whose predicted tag count differs from the gold tag count. Unlike in the modern run,
# the filtered tokens are kept (new_tokens) because the mistake table below needs them.
# NOTE(review): xpos and xpos_preds are rebound to the filtered lists, so the cell is not idempotent.
xpos, xpos_preds, new_tokens = reject_misparsed(xpos, xpos_preds, tokens)

Deleted entry number 48 due to a parsing error.
Deleted entry number 49 due to a parsing error.


In [22]:
# Flatten the per-sentence lists into flat, aligned lists of gold tags, predicted tags and tokens.
# NOTE(review): the same names are rebound, so this cell must run exactly once - a second run would
# iterate over the individual strings and split them into characters.
xpos = [x for sentence in xpos for x in sentence]
xpos_preds = [x for sentence in xpos_preds for x in sentence]
new_tokens = [x for sentence in new_tokens for x in sentence]

In [23]:
# XPOS scores on the historical data, computed on the flattened lists from the previous cell.
get_measures(xpos, xpos_preds)

MEASURES:
Accuracy: 87.85%
Precision (weighted): 89.79%
Recall (weighted): 87.85%
F1 (weighted): 88.17%
Matthew's Correlation Coefficient: 87.46%


In [24]:
# Build the comparison table (Token, Context, Gold Standard, Prediction columns in the saved output)
# and export it to Excel. NOTE(review): presumably only mismatching tokens are listed - confirm in
# get_comparison. The target directory ../data/mistakes/ is expected to exist already.
xpos_comparison = get_comparison(xpos, xpos_preds, new_tokens)
xpos_comparison.to_excel('../data/mistakes/Morfeusz_XPOS_mistakes.xlsx')

In [25]:
# Display the XPOS comparison table built in the previous cell.
xpos_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Dziad,Dziad mój,subst:sg:nom:m1,subst:sg:acc:f
1,mój,Dziad mój Melchior,adj:sg:nom:m1:pos,adj:sg:nom:m3:pos
2,Melchior,mój Melchior urodzony,subst:sg:nom:m1,subst:sg:nom:m3
3,urodzony,Melchior urodzony roku,adj:sg:nom:m1:pos,adj:sg:nom:m3:pos
4,1741,roku 1741 we,adj:sg:gen:m3:pos,dig
...,...,...,...,...
389,secundo,Julia secundo voto,ign,subst:sg:nom:f
390,voto,secundo voto Szołayska,ign,part
391,ich,miała ich też,ppron3:pl:acc:m3:ter:akc:npraep,ppron3:pl:acc:m1:ter:akc:npraep
392,Lesniowic,do Lesniowic przyległe,subst:pl:gen:n:pt,subst:sg:gen:n:ncol


In [None]:
# Fetch the lemma layer from the formatter (rebinds forms as well).
# NOTE(review): this cell is saved without an execution count (In [None]) although the following cell
# needs lemma_preds - re-run the notebook top to bottom before sharing it.
lemma_preds, forms = anns.retrieve_anns('lemma')

In [26]:
# Same length-based filtering as for XPOS, applied to the lemma layer against the full `tokens` list.
# NOTE(review): new_tokens is rebound here to the nested, lemma-filtered token lists, replacing the
# flattened XPOS version; lemmas and lemma_preds are rebound too, so the cell is not idempotent.
lemmas, lemma_preds, new_tokens = reject_misparsed(lemmas, lemma_preds, tokens)

Deleted entry number 48 due to a parsing error.
Deleted entry number 49 due to a parsing error.


In [27]:
# Lemma score on the retained historical sentences; the nested per-sentence lists are passed as they are.
get_lemma_measures(lemmas, lemma_preds)

Accuracy: 95.31%


In [28]:
# Build the lemma comparison table from the nested (per-sentence) lists and export it to Excel.
# NOTE(review): presumably only mismatching lemmas are listed - confirm in get_lemma_comparison.
# The target directory ../data/mistakes/ is expected to exist already.
lemma_comparison = get_lemma_comparison(lemmas, lemma_preds, new_tokens)
lemma_comparison.to_excel('../data/mistakes/Morfeusz_lemma_mistakes.xlsx')

In [29]:
# Display the lemma comparison table built in the previous cell.
lemma_comparison

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,dobrrzyńskiej,ziemi dobrrzyńskiej (,dobrzyńska,dobrrzyńskiej
1,p,. p .,p,list_świętego_piotra
2,wdokumentach,później wdokumentach się,dokument,wdokumentach
3,pinińskich,z pinińskich właścicieli,Piniński,Pinińskich
4,adlinencjami,z adlinencjami puszczanki,adlinencja,adlinencjami
...,...,...,...,...
147,ciepłey,ale ciepłey wdowy,ciepła,ciepłey
148,szołayskiego,pana szołayskiego młodzika,Szołayski,Szołayskiego
149,procesa,lubiła procesa –,proces,procesa
150,lesniowic,do lesniowic przyległe,Lesniowice,Lesniowic
