# MARMOT EVALUATION

### IMPORTS, VARIABLES

In [1]:
import conllu
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import time

# Lift pandas' row-display limit so the DataFrames shown further down are rendered in full instead of truncated.
pd.set_option('display.max_rows', None)

In [2]:
# Marmot predictions for the historical test set, one file per tagset (XPOS / UPOS).
xpos_predictions = './marmot/test_hist_XPOS.out.txt'
upos_predictions = './marmot/test_hist_UPOS.out.txt'

# Gold-standard annotations for the historical test set, again one file per tagset.
xpos_standard = './marmot/test_hist_XPOS.txt'
upos_standard = './marmot/test_hist_UPOS.txt'

# Marmot predictions and gold standard for the modern (contemporary) test set.
test_predictions = './marmot/test.out.txt'
test_standard = './marmot/test.txt'

In [3]:
from functions import get_measures, extract_conllu_data, get_comparison

### FUNCTIONS

In [4]:
def get_marmot_tags_and_tokens(filename: str):
    """Read a whitespace-separated Marmot file and return its first three columns.

    Every non-blank line is expected to hold at least three columns:
    token, UPOS tag and XPOS tag (in that order). Blank or whitespace-only
    lines (e.g. sentence separators) are skipped, so the three returned lists
    are flat, token-level sequences of equal length.

    Args:
        filename: Path to the file to read. The file is decoded as UTF-8 so
            that non-ASCII tokens are read identically on every platform
            instead of depending on the locale's default encoding.

    Returns:
        A tuple ``(tokens, upos, xpos)`` of three lists of strings.

    Raises:
        IndexError: If a non-blank line has fewer than three columns. The
            message names the file and the line number; silently skipping
            such a line would misalign the tags with the predictions they
            are later compared against.
    """
    tokens, upos, xpos = [], [], []

    with open(filename, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # Split once per line; an empty result means the line was blank.
            columns = line.split()
            if not columns:
                continue
            if len(columns) < 3:
                raise IndexError(
                    f'{filename}, line {line_number}: expected at least 3 columns '
                    f'(token, UPOS, XPOS), found {len(columns)}'
                )
            tokens.append(columns[0])
            upos.append(columns[1])
            xpos.append(columns[2])

    return tokens, upos, xpos

### EXECUTION - MODERN
Unlike the BERT-based tagger, Marmot does not report any training, evaluation, or test measures, so I compute them myself on the contemporary (modern) test data.

In [5]:
# Marmot returns files in a CoNLL-2009 format; they can still be parsed using conllu and my function, but the names of
# the fields do not match: here 'feats' corresponds to UPOS tags and 'deprel' to XPOS tags.
# NOTE: the prediction file is parsed once per field, and test_tokens is assigned by both calls, so the value
# from the second call is the one that remains.
test_tokens, test_upos_preds = extract_conllu_data(test_predictions, 'feats', sentences=False, fulltext=False)
test_tokens, test_xpos_preds = extract_conllu_data(test_predictions, 'deprel', sentences=False, fulltext=False)
# Due to the field mismatch mentioned above, UPOS tags need to be further processed: each 'feats' entry is handled
# as a dict and only its first key is kept (presumably the bare tag ends up as that key, because the column holds
# a plain tag rather than key=value features -- TODO confirm against the parser output).
test_upos_preds = [list(x.keys())[0] for x in test_upos_preds]

In [6]:
# Retrieving the gold standard tags for the modern test set. The token list (first return value) is discarded
# because only the UPOS and XPOS tag lists are needed here.
_, test_upos_standard, test_xpos_standard = get_marmot_tags_and_tokens(test_standard)

In [7]:
# UPOS scores on the modern test set (gold tags first, predictions second). details=True appears to be what adds
# the per-class precision/recall listing to the report.
get_measures(test_upos_standard, test_upos_preds, details=True)

MEASURES:
Accuracy: 97.73%
Precision (weighted): 97.75%
Recall (weighted): 97.73%
F1 (weighted): 97.73%
Matthew's Correlation Coefficient: 97.38%

MEASURES PER CLASS:
Precision:
	ADJ: 97.25%
	ADP: 99.46%
	ADV: 95.59%
	AUX: 91.67%
	CCONJ: 96.17%
	DET: 98.44%
	INTJ: 46.15%
	NOUN: 98.23%
	NUM: 98.04%
	PART: 93.49%
	PRON: 99.05%
	PROPN: 91.30%
	PUNCT: 99.95%
	SCONJ: 96.49%
	SYM: 100.00%
	VERB: 97.96%
	X: 89.33%
Recall:
	ADJ: 97.71%
	ADP: 99.74%
	ADV: 95.33%
	AUX: 95.60%
	CCONJ: 96.26%
	DET: 96.93%
	INTJ: 60.00%
	NOUN: 98.04%
	NUM: 94.34%
	PART: 92.42%
	PRON: 98.31%
	PROPN: 94.09%
	PUNCT: 99.95%
	SCONJ: 96.21%
	SYM: 25.00%
	VERB: 97.43%
	X: 86.73%



In [8]:
# XPOS scores on the modern test set (gold tags first, predictions second), summary measures only.
get_measures(test_xpos_standard, test_xpos_preds)  # measures per class are not as informative here

MEASURES:
Accuracy: 89.27%
Precision (weighted): 88.95%
Recall (weighted): 89.27%
F1 (weighted): 88.81%
Matthew's Correlation Coefficient: 88.83%


### EXECUTION - HISTORICAL

In [9]:
# The same is done with the historical data. Here the UPOS and XPOS predictions live in two separate files,
# so the tokens returned for each file are kept under their own name.
hist_upos_tokens, hist_upos_preds = extract_conllu_data(upos_predictions, 'feats', sentences=False, fulltext=False)
hist_xpos_tokens, hist_xpos_preds = extract_conllu_data(xpos_predictions, 'deprel', sentences=False, fulltext=False)

# Same post-processing as for the modern data: keep only the first key of each 'feats' entry as the UPOS tag.
hist_upos_preds = [list(x.keys())[0] for x in hist_upos_preds]

In [10]:
# Retrieving the gold standard tags - since we have two different files for XPOS and UPOS annotations, we need two
# function calls. From each file only the relevant tag list is kept: UPOS (second return value) from the UPOS file
# and XPOS (third return value) from the XPOS file; everything else is discarded.
_, hist_upos_standard, _ = get_marmot_tags_and_tokens(upos_standard)
_, _, hist_xpos_standard = get_marmot_tags_and_tokens(xpos_standard)

In [11]:
# UPOS scores on the historical test set (gold tags first, predictions second), with the per-class listing requested.
get_measures(hist_upos_standard, hist_upos_preds, details=True)

MEASURES:
Accuracy: 90.19%
Precision (weighted): 90.35%
Recall (weighted): 90.19%
F1 (weighted): 90.18%
Matthew's Correlation Coefficient: 88.86%

MEASURES PER CLASS:
Precision:
	ADJ: 80.48%
	ADP: 99.49%
	ADV: 85.03%
	AUX: 82.26%
	CCONJ: 97.20%
	DET: 93.47%
	NOUN: 89.12%
	NUM: 96.97%
	PART: 76.38%
	PRON: 90.87%
	PROPN: 78.65%
	PUNCT: 100.00%
	SCONJ: 86.26%
	VERB: 91.32%
	X: 63.16%
Recall:
	ADJ: 84.61%
	ADP: 98.74%
	ADV: 83.78%
	AUX: 85.83%
	CCONJ: 95.94%
	DET: 74.82%
	NOUN: 90.79%
	NUM: 72.73%
	PART: 73.43%
	PRON: 85.71%
	PROPN: 86.04%
	PUNCT: 100.00%
	SCONJ: 91.00%
	VERB: 91.88%
	X: 50.70%



In [12]:
# XPOS scores on the historical test set (gold tags first, predictions second), summary measures only.
get_measures(hist_xpos_standard, hist_xpos_preds)

MEASURES:
Accuracy: 78.78%
Precision (weighted): 79.73%
Recall (weighted): 78.78%
F1 (weighted): 78.39%
Matthew's Correlation Coefficient: 78.11%


In [13]:
# Build the gold-vs-prediction comparison table for the historical UPOS tags and save it as a spreadsheet.
# NOTE(review): the target folder '../data/mistakes/' presumably has to exist before this cell runs -- confirm.
comparison_upos = get_comparison(hist_upos_standard, hist_upos_preds, hist_upos_tokens)
comparison_upos.to_excel('../data/mistakes/marmot_UPOS_mistakes.xlsx')

In [14]:
# Build the gold-vs-prediction comparison table for the historical XPOS tags and save it as a spreadsheet.
# NOTE(review): the target folder '../data/mistakes/' presumably has to exist before this cell runs -- confirm.
comparison_xpos = get_comparison(hist_xpos_standard, hist_xpos_preds, hist_xpos_tokens)
comparison_xpos.to_excel('../data/mistakes/marmot_XPOS_mistakes.xlsx')

In [15]:
# Display the UPOS comparison table (token, context, gold tag, predicted tag) for manual inspection.
comparison_upos

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Melchior,mój Melchior urodzony,PROPN,NOUN
1,Komornika,Jana Komornika ziemi,NOUN,PROPN
2,ś,bratem ś .,X,AUX
3,Pinińskich,z Pinińskich właścicieli,PROPN,ADJ
4,Dóbr,właścicieli Dóbr Strużewo,NOUN,PROPN
5,Dąbrowy,", Dąbrowy części",PROPN,ADJ
6,śp,że śp Dziad,X,NOUN
7,Dobrzyńskiej,ziemi Dobrzyńskiej za,PROPN,ADJ
8,Panna,siostra Panna Urszula,NOUN,PROPN
9,śp,miał śp .,X,NOUN


In [16]:
# Display the XPOS comparison table (token, context, gold tag, predicted tag) for manual inspection.
comparison_xpos

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Dziad,Dziad mój,subst:sg:nom:m1,subst:sg:nom:m3
1,mój,Dziad mój Melchior,adj:sg:nom:m1:pos,adj:sg:nom:m3:pos
2,Melchior,mój Melchior urodzony,subst:sg:nom:m1,subst:sg:nom:m3
3,urodzony,Melchior urodzony roku,adj:sg:nom:m1:pos,ppas:sg:nom:m3:perf:aff
4,Godziszewo,wsi Godziszewo parafii,subst:sg:nom:n:ncol,subst:sg:acc:n:ncol
5,parafii,Godziszewo parafii Rypnin,subst:sg:loc:f,subst:sg:gen:f
6,Rypnin,parafii Rypnin syn,subst:sg:nom:m3,subst:sg:nom:m1
7,starszym,był starszym bratem,adj:sg:inst:m1:com,adj:sg:inst:m3:pos
8,bratem,starszym bratem ś,subst:sg:inst:m1,subst:sg:inst:m3
9,ś,bratem ś .,brev:pun,aglt:sg:sec:imperf:nwok
