# BERT EVALUATION

### IMPORTS, VARIABLES

In [1]:
import time
import warnings

import conllu
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.metrics

# Lift pandas' row limit so displayed DataFrames are shown in full instead of truncated.
pd.set_option('display.max_rows', None)

In [2]:
# Prediction files of the XPOS and UPOS models (read with get_preds further down).
xpos_predictions = './bert/polXPOS-model/test_predictions.txt'
upos_predictions = './bert/polUPOS-model/test_predictions.txt'
# CoNLL-U files passed to extract_conllu_data further down: the 3k file supplies
# the XPOS gold standard, the 10k file the UPOS gold standard.
file_3k = '../data/memoirs_3k_corrected.conllu'
file_10k = '../data/memoirs_10k_corrected.conllu'

# Label lists of the two tag sets (read with get_labels further down).
labels_xpos = './bert/data_XPOS/labels.txt'
labels_upos = './bert/data_UPOS/labels.txt'

In [3]:
from functions import extract_conllu_data, get_measures

### FUNCTIONS

In [4]:
def get_labels(filename: str):
    '''Read a label list from a text file with one label per line.

    The first line of the file is skipped (presumably a header or placeholder
    entry -- confirm against the label files). Surrounding whitespace is
    removed from every remaining line; blank lines are kept as empty strings.

    Args:
        filename (str): Path to the label file.

    Returns:
        A list of label strings in file order, without the first line.
    '''
    with open(filename) as handle:
        raw_lines = handle.readlines()

    return [line.strip() for line in raw_lines[1:]]

In [5]:
def get_preds(filename: str):
    '''Read the predicted tag of every token from a predictions file.

    Every non-blank line is split on whitespace and its second field is taken
    as the prediction; blank or whitespace-only lines are skipped.

    Args:
        filename (str): Path to the predictions file (expected to be UTF-8).

    Returns:
        A flat list with one predicted tag per non-blank line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    '''
    preds = []
    # Decode explicitly as UTF-8: the first field of each line is presumably the
    # token, and the tokens of this corpus contain Polish diacritics, which a
    # non-UTF-8 platform default encoding would fail on or mis-decode.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if fields:
                preds.append(fields[1])

    return preds

In [6]:
def get_comparison(standard: list, predictions: list, tokens: list):
    '''Collect the tokens whose predicted tag differs from the gold standard.

    The three lists are compared position by position. If their lengths
    differ, a warning is issued and only the positions present in all three
    lists are compared.

    Args:
        standard (list): Gold standard tags, one per token.
        predictions (list): Predicted tags, one per token.
        tokens (list): The tokens the tags belong to.

    Returns:
        A Pandas dataframe with the columns 'Token', 'Gold Standard' and
        'Prediction', holding one row per mismatch in input order.
    '''

    if not (len(standard) == len(predictions) == len(tokens)):
        # Unequal lengths usually mean the inputs are not aligned; surface
        # that instead of failing with an IndexError or truncating silently.
        warnings.warn(
            'get_comparison: input lengths differ '
            f'(standard={len(standard)}, predictions={len(predictions)}, '
            f'tokens={len(tokens)}); comparing the common prefix only.',
            stacklevel=2,
        )

    problematic = [
        (token, gold, pred)
        for token, gold, pred in zip(tokens, standard, predictions)
        if gold != pred
    ]

    return pd.DataFrame(problematic, columns=['Token', 'Gold Standard', 'Prediction'])

### EXECUTION

In [7]:
# Load the label lists that are later handed to get_measures for the per-class report.
labels_upos_list = get_labels(labels_upos)
labels_xpos_list = get_labels(labels_xpos)

In [8]:
# Tokens and gold standard tags for both tag sets; the third return value is not used.
# sentences=False presumably yields flat, token-level lists -- confirm in functions.py.
# NOTE(review): the UPOS gold standard is read from the 10k file while XPOS uses the
# 3k file; the UPOS list is later cut to 3280 entries to match the predictions --
# confirm that the first entries of the 10k file are the evaluated tokens.
upos_token_list, upos_standard_list, _ = extract_conllu_data(file_10k,'upos', sentences=False)
xpos_token_list, xpos_standard_list, _ = extract_conllu_data(file_3k, 'xpos', sentences=False)

In [9]:
# Predicted tags of both models, one entry per non-blank line of the prediction files.
upos_predictions_list = get_preds(upos_predictions)
xpos_predictions_list = get_preds(xpos_predictions)

In [10]:
# Sanity check: the XPOS gold standard and the XPOS predictions must be equally long.
# TODO: get upos preds to work (only the XPOS lengths are checked here).
len(xpos_standard_list) == len(xpos_predictions_list)

True

In [11]:
# Number of tokens extracted for the XPOS evaluation.
print(len(xpos_token_list))

3280


In [12]:
# Eyeball the first ten gold standard tags against the first ten predictions.
print(xpos_standard_list[:10])
print(xpos_predictions_list[:10])

['subst:sg:nom:m1', 'adj:sg:nom:m1:pos', 'subst:sg:nom:m1', 'adj:sg:nom:m1:pos', 'subst:sg:gen:m3', 'adj:sg:gen:m3', 'prep:loc:wok', 'subst:sg:loc:f', 'subst:sg:nom:n:ncol', 'subst:sg:loc:f']
['subst:sg:nom:m1', 'adj:sg:nom:m1:pos', 'subst:sg:nom:m1', 'ppas:sg:nom:m1:perf:aff', 'subst:sg:gen:m3', 'adj:sg:gen:m3:pos', 'prep:loc:wok', 'subst:sg:loc:f', 'subst:sg:nom:n:ncol', 'subst:sg:gen:f']


In [13]:
# Replace missing gold tags (None) with the '_' label so that the gold list holds
# only strings, like the predictions. extract_conllu_data presumably returns None
# where the CoNLL-U field is '_' -- confirm in functions.py.
xpos_standard_list = ['_' if tag is None else tag for tag in xpos_standard_list]

In [14]:
# Overall XPOS accuracy: share of tokens whose predicted tag equals the gold tag.
sklearn.metrics.accuracy_score(xpos_standard_list, xpos_predictions_list)

0.8484756097560976

In [15]:
# Full XPOS report from the project helper: overall measures plus per-class values
# for every label in labels_xpos_list.
# NOTE(review): the _warn_prf lines in the output are presumably sklearn's
# undefined-metric warnings for labels that never occur -- confirm.
get_measures(xpos_standard_list, xpos_predictions_list, labels=labels_xpos_list)

MEASURES:
Accuracy: 84.85%
Matthew's Correlation Coefficient: 84.37%

MEASURES PER CLASS:
Precision:
	_: 90.00%
	adj:pl:acc:f:com: 0.00%
	adj:pl:acc:f:pos: 0.00%
	adj:pl:acc:f:sup: 0.00%
	adj:pl:acc:m1:com: 0.00%
	adj:pl:acc:m1:pos: 0.00%
	adj:pl:acc:m1:sup: 0.00%
	adj:pl:acc:m2:pos: 0.00%
	adj:pl:acc:m2:sup: 0.00%
	adj:pl:acc:m3:com: 0.00%
	adj:pl:acc:m3:pos: 75.00%
	adj:pl:acc:m3:sup: 0.00%
	adj:pl:acc:n:com: 0.00%
	adj:pl:acc:n:pos: 0.00%
	adj:pl:acc:n:sup: 0.00%
	adj:pl:dat:f:com: 0.00%
	adj:pl:dat:f:pos: 0.00%
	adj:pl:dat:m1:com: 0.00%
	adj:pl:dat:m1:pos: 0.00%
	adj:pl:dat:m1:sup: 0.00%
	adj:pl:dat:m2:pos: 0.00%
	adj:pl:dat:m3:com: 0.00%
	adj:pl:dat:m3:pos: 0.00%
	adj:pl:dat:n:pos: 0.00%
	adj:pl:dat:n:sup: 0.00%
	adj:pl:gen:f:com: 0.00%
	adj:pl:gen:f:pos: 100.00%
	adj:pl:gen:f:sup: 0.00%
	adj:pl:gen:m1:com: 0.00%
	adj:pl:gen:m1:pos: 37.50%
	adj:pl:gen:m1:sup: 0.00%
	adj:pl:gen:m2:pos: 100.00%
	adj:pl:gen:m2:sup: 0.00%
	adj:pl:gen:m3:com: 0.00%
	adj:pl:gen:m3:pos: 50.00%
	adj:pl:ge

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# Full UPOS report. The UPOS gold standard list is longer than the predictions, so
# it is cut to the number of predictions (rather than a hard-coded 3280) before
# the comparison.
get_measures(
    upos_standard_list[:len(upos_predictions_list)],
    upos_predictions_list,
    labels=labels_upos_list,
)

MEASURES:
Accuracy: 93.26%
Matthew's Correlation Coefficient: 92.32%

MEASURES PER CLASS:
Precision:
	ADJ: 89.67%
	ADP: 99.70%
	ADV: 90.91%
	AUX: 86.36%
	CCONJ: 99.35%
	DET: 93.94%
	INTJ: 0.00%
	NOUN: 95.07%
	NUM: 97.22%
	PART: 81.82%
	PRON: 90.54%
	PROPN: 73.12%
	PUNCT: 99.09%
	SCONJ: 93.06%
	SYM: 0.00%
	VERB: 96.46%
	X: 79.31%
	_: 54.55%
Recall:
	ADJ: 89.30%
	ADP: 98.53%
	ADV: 93.02%
	AUX: 83.82%
	CCONJ: 93.90%
	DET: 82.67%
	INTJ: 0.00%
	NOUN: 94.25%
	NUM: 81.40%
	PART: 85.71%
	PRON: 93.06%
	PROPN: 87.26%
	PUNCT: 99.77%
	SCONJ: 97.10%
	SYM: 0.00%
	VERB: 96.22%
	X: 60.53%
	_: 66.67%



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
# NOTE(review): this prints the complete UPOS token list; a slice such as [:10]
# (as used for the XPOS lists earlier) would keep the output readable.
print(upos_token_list)

['Dziad', 'mój', 'Melchior', 'urodzony', 'roku', '1741', 'we', 'wsi', 'Godziszewo', 'parafii', 'Rypnin', 'syn', 'komornika', 'ziemskiego', 'Jana', 'Komornika', 'ziemi', 'Dobrrzyńskiej', '(', 'a', 'nie', 'Antoniego', '–', 'Antoni', 'był', 'starszym', 'bratem', 'ś', '.', 'p', '.', 'Melchiora', '–', 'i', 'nie', 'rozumiem', 'skąd', 'się', 'wzięła', 'ta', 'myłka', ',', 'która', 'później', 'wdokumentach', 'się', 'powtarza', ')', 'i', 'matki', 'Ewy', 'z', 'Pinińskich', 'właścicieli', 'Dóbr', 'Strużewo', 'z', 'adlinencjami', 'Puszczanki', ',', 'Dąbrowy', 'części', ',', 'Ronantowizna', ',', 'Żółtowizna', ',', 'Będowszczyazna', ',', 'Kmińszczyzna', ',', 'Bęklowizna', ',', 'Ruszkowizna', '.', 'A', 'że', 'śp', 'Dziad', 'mój', 'był', 'najmłodszy', 'i', 'piąty', 'z', 'pomiędzy', 'Rodzeństwa', '–', 'odstąpił', 'swoją', 'część', 'najstarszemu', 'bratu', 'swojemu', 'Mateuszowi', 'komornikowi', 'ziemi', 'Dobrzyńskiej', 'za', 'ośm', 'tysięcy', 'tynfów', '.', 'Byli', 'jeszcze', 'bracia', 'Wawrzyniec', ','

In [18]:
# UPOS mismatches. The gold standard tags and tokens are longer than the
# predictions, so both are cut to the number of predictions explicitly instead of
# relying on get_comparison to ignore the surplus entries.
n_upos_preds = len(upos_predictions_list)
comparison_upos = get_comparison(
    upos_standard_list[:n_upos_preds],
    upos_predictions_list,
    upos_token_list[:n_upos_preds],
)
# To export the mismatches, call comparison_upos.to_excel(...) with a target path.

In [19]:
# XPOS mismatches: one row per token whose predicted tag differs from the gold tag.
comparison_xpos = get_comparison(xpos_standard_list, xpos_predictions_list, xpos_token_list)
# NOTE(review): leftover export line -- `comparison` is not defined in this notebook
# and the file name refers to Stanza; adjust both before re-enabling.
#comparison.to_excel('../data/stanza_mistakes.xlsx')

In [20]:
# Display the UPOS mismatches (token, gold standard tag, predicted tag).
comparison_upos

Unnamed: 0,Token,Gold Standard,Prediction
0,Melchior,PROPN,NOUN
1,wdokumentach,NOUN,ADV
2,części,NOUN,PROPN
3,że,SCONJ,PART
4,Dobrzyńskiej,PROPN,ADJ
5,jedną,NUM,ADJ
6,było,AUX,VERB
7,było,AUX,VERB
8,Niewiem,VERB,NOUN
9,jego,DET,PRON


In [21]:
# Display the XPOS mismatches (token, gold standard tag, predicted tag).
comparison_xpos

Unnamed: 0,Token,Gold Standard,Prediction
0,urodzony,adj:sg:nom:m1:pos,ppas:sg:nom:m1:perf:aff
1,1741,adj:sg:gen:m3,adj:sg:gen:m3:pos
2,parafii,subst:sg:loc:f,subst:sg:gen:f
3,Rypnin,subst:sg:nom:m3,subst:sg:nom:m1
4,nie,part,conj
5,starszym,adj:sg:inst:m1:com,adj:sg:inst:m1:pos
6,p,brev:npun,brev:pun
7,wdokumentach,subst:pl:loc:m3,subst:pl:dat:m1
8,Bęklowizna,subst:sg:nom:f,subst:sg:inst:f
9,Ruszkowizna,subst:sg:nom:f,subst:sg:gen:m3
