# BERT EVALUATION

### IMPORTS, VARIABLES

In [1]:
import conllu
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import time

# Lift pandas' row-display limit so DataFrames shown in this notebook are
# rendered in full rather than truncated.
pd.set_option('display.max_rows', None)

In [2]:
# Tagger predictions that are scored in the "EXECUTION - PREDICTIONS" section.
xpos_predictions = './bert/polXPOS-model/test_predictions.txt'
upos_predictions = './bert/polUPOS-model/test_predictions.txt'

# Gold-standard annotations the predictions above are compared against.
# NOTE(review): the `hist_test_*` directory names suggest a historical-text
# test set — confirm.
xpos_standard = './bert/hist_test_XPOS/test.txt'
upos_standard = './bert/hist_test_UPOS/test.txt'

# Tag inventories, read with get_labels().
labels_xpos = './bert/data_XPOS/labels.txt'
labels_upos = './bert/data_UPOS/labels.txt'

# Gold-standard annotations used in the "EXECUTION - STANDARD" section.
test_upos_standard = './bert/data_UPOS/test.txt'
test_xpos_standard = './bert/data_XPOS/test.txt'

# Tagger predictions scored against the two gold-standard files directly above.
test_upos_predictions = './bert/test_UPOS/test_predictions.txt'
test_xpos_predictions = './bert/test_XPOS/test_predictions.txt'

In [3]:
from functions import get_measures, get_comparison

### FUNCTIONS

In [4]:
def get_labels(filename: str, encoding: str = 'utf-8'):
    '''A function that extracts labels/tags from a .txt file.

    The file is expected to hold one label per line. The first line is
    skipped and surrounding whitespace is stripped from every remaining line.
    Blank lines after the first one are kept as empty strings.

    Args:
        filename (str): The name of the file.
        encoding (str): Text encoding used to decode the file. Defaults to
            UTF-8 so the result does not depend on the platform's locale.

    Returns:
        A list of tags, in file order, without the first line of the file.
    '''
    with open(filename, encoding=encoding) as f:
        # NOTE(review): the first line is deliberately dropped (as in the
        # original implementation) — presumably a header/placeholder entry;
        # confirm against the labels files. The default keeps an empty file
        # from raising StopIteration.
        next(f, None)
        return [line.strip() for line in f]

In [5]:
def get_tags_and_tokens(filename: str, encoding: str = 'utf-8'):
    '''A function that extracts tokens and the corresponding tags from a .txt file.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the second as its tag. Any further fields are ignored and
    blank (or whitespace-only) lines are skipped.

    Note that, despite the function name, the token list is returned first.

    Args:
        filename (str): The name of the file.
        encoding (str): Text encoding used to decode the file. Defaults to
            UTF-8 so that non-ASCII tokens are decoded the same way on every
            platform instead of depending on the locale.

    Returns:
        A list of tokens and a list of tags (in that order), aligned by index.

    Raises:
        IndexError: If a non-blank line contains fewer than two fields.
    '''
    tokens = []
    tags = []
    with open(filename, encoding=encoding) as f:
        for line in f:
            # str.split() without arguments ignores leading/trailing
            # whitespace, so an empty result means the line was blank.
            fields = line.split()
            if not fields:
                continue
            tokens.append(fields[0])
            tags.append(fields[1])

    return tokens, tags

### EXECUTION - STANDARD
Although the BERT tagger reports its own measures, it is not clear what kind of averaging it uses for precision, recall, and F1. For the sake of comparability, I recompute these measures here with `get_measures`, which reports weighted averages.

In [6]:
# Read the UPOS and XPOS tag inventories from their label files.
labels_upos_list, labels_xpos_list = map(get_labels, (labels_upos, labels_xpos))

In [7]:
# Gold-standard tags for the UPOS and XPOS test files. Only the tag list (the
# second return value) is needed here; indexing it, rather than unpacking the
# tokens into `_`, avoids shadowing IPython's `_` (last displayed output).
test_upos_standard_list = get_tags_and_tokens(test_upos_standard)[1]
test_xpos_standard_list = get_tags_and_tokens(test_xpos_standard)[1]

In [8]:
# Predicted tags for the UPOS and XPOS test files. Only the tag list (the
# second return value) is needed here; indexing it, rather than unpacking the
# tokens into `_`, avoids shadowing IPython's `_` (last displayed output).
test_upos_predictions_list = get_tags_and_tokens(test_upos_predictions)[1]
test_xpos_predictions_list = get_tags_and_tokens(test_xpos_predictions)[1]

In [9]:
# Score the UPOS predictions against the gold-standard tags. With details=True
# the report also lists precision and recall per class (see the cell output).
get_measures(test_upos_standard_list, test_upos_predictions_list, details=True)

MEASURES:
Accuracy: 99.05%
Precision (weighted): 99.05%
Recall (weighted): 99.05%
F1 (weighted): 99.05%
Matthew's Correlation Coefficient: 98.91%

MEASURES PER CLASS:
Precision:
	ADJ: 99.08%
	ADP: 99.86%
	ADV: 97.01%
	AUX: 98.46%
	CCONJ: 97.47%
	DET: 98.94%
	INTJ: 81.82%
	NOUN: 99.27%
	NUM: 97.76%
	PART: 97.44%
	PRON: 99.62%
	PROPN: 94.83%
	PUNCT: 99.93%
	SCONJ: 98.24%
	SYM: 100.00%
	VERB: 99.69%
	X: 95.00%
Recall:
	ADJ: 99.44%
	ADP: 99.91%
	ADV: 97.98%
	AUX: 98.93%
	CCONJ: 98.36%
	DET: 99.41%
	INTJ: 90.00%
	NOUN: 99.19%
	NUM: 98.87%
	PART: 94.91%
	PRON: 99.50%
	PROPN: 95.34%
	PUNCT: 99.95%
	SCONJ: 97.52%
	SYM: 25.00%
	VERB: 99.61%
	X: 92.23%



In [10]:
# Score the XPOS predictions against the gold-standard tags, passing the XPOS
# tag inventory via `labels`; only the summary measures are reported here.
# NOTE(review): how get_measures uses `labels` is not visible in this notebook — confirm in functions.py.
get_measures(test_xpos_standard_list, test_xpos_predictions_list, labels=labels_xpos_list)

MEASURES:
Accuracy: 95.35%
Precision (weighted): 94.79%
Recall (weighted): 95.35%
F1 (weighted): 94.95%
Matthew's Correlation Coefficient: 95.16%


### EXECUTION - PREDICTIONS

In [11]:
# Gold-standard tokens and tags for this section. The tokens are kept as well
# because get_comparison() takes them as its third argument further down.
upos_token_list, upos_standard_list = get_tags_and_tokens(upos_standard)
xpos_token_list, xpos_standard_list = get_tags_and_tokens(xpos_standard)

In [12]:
# Predicted tags for the UPOS and XPOS files of this section. Only the tag
# list (the second return value) is needed here; indexing it, rather than
# unpacking the tokens into `_`, avoids shadowing IPython's `_` (last
# displayed output).
upos_predictions_list = get_tags_and_tokens(upos_predictions)[1]
xpos_predictions_list = get_tags_and_tokens(xpos_predictions)[1]

In [13]:
# Score the UPOS predictions against the gold-standard tags. With details=True
# the report also lists precision and recall per class (see the cell output).
get_measures(upos_standard_list, upos_predictions_list, details=True)

MEASURES:
Accuracy: 93.20%
Precision (weighted): 93.39%
Recall (weighted): 93.20%
F1 (weighted): 93.23%
Matthew's Correlation Coefficient: 92.29%

MEASURES PER CLASS:
Precision:
	ADJ: 90.29%
	ADP: 99.24%
	ADV: 88.99%
	AUX: 88.66%
	CCONJ: 98.47%
	DET: 92.31%
	NOUN: 94.31%
	NUM: 98.21%
	PART: 78.32%
	PRON: 93.71%
	PROPN: 77.29%
	PUNCT: 100.00%
	SCONJ: 90.24%
	VERB: 94.06%
	X: 86.54%
Recall:
	ADJ: 90.39%
	ADP: 98.74%
	ADV: 88.20%
	AUX: 83.07%
	CCONJ: 94.83%
	DET: 82.66%
	NOUN: 94.81%
	NUM: 83.33%
	PART: 85.51%
	PRON: 90.41%
	PROPN: 87.75%
	PUNCT: 100.00%
	SCONJ: 92.50%
	VERB: 95.06%
	X: 63.38%



In [14]:
# Score the XPOS predictions against the gold-standard tags, passing the XPOS
# tag inventory via `labels`; only the summary measures are reported here.
# NOTE(review): how get_measures uses `labels` is not visible in this notebook — confirm in functions.py.
get_measures(xpos_standard_list, xpos_predictions_list, labels=labels_xpos_list)

MEASURES:
Accuracy: 85.51%
Precision (weighted): 85.83%
Recall (weighted): 85.51%
F1 (weighted): 85.02%
Matthew's Correlation Coefficient: 85.04%


In [15]:
# Build the UPOS comparison table (columns Token / Gold Standard / Prediction,
# as displayed further down) and save it as a spreadsheet.
# NOTE(review): presumably the table holds only the mismatches, and to_excel
# assumes ../data/mistakes/ already exists — confirm both.
comparison_upos = get_comparison(upos_standard_list, upos_predictions_list, upos_token_list)
comparison_upos.to_excel('../data/mistakes/bert_UPOS_mistakes.xlsx')

In [16]:
# Build the XPOS comparison table (columns Token / Gold Standard / Prediction,
# as displayed further down) and save it as a spreadsheet.
# NOTE(review): presumably the table holds only the mismatches, and to_excel
# assumes ../data/mistakes/ already exists — confirm both.
comparison_xpos = get_comparison(xpos_standard_list, xpos_predictions_list, xpos_token_list)
comparison_xpos.to_excel('../data/mistakes/bert_XPOS_mistakes.xlsx')

In [17]:
# Display the UPOS comparison table (untruncated, since display.max_rows is None).
comparison_upos

Unnamed: 0,Token,Gold Standard,Prediction
0,Melchior,PROPN,NOUN
1,nie,PART,CCONJ
2,wdokumentach,NOUN,ADV
3,Ronantowizna,PROPN,NOUN
4,Żółtowizna,PROPN,NOUN
5,Kmińszczyzna,PROPN,NOUN
6,Dobrzyńskiej,PROPN,ADJ
7,jedną,NUM,ADJ
8,było,AUX,VERB
9,było,AUX,VERB


In [18]:
# Display the XPOS comparison table (untruncated, since display.max_rows is None).
comparison_xpos

Unnamed: 0,Token,Gold Standard,Prediction
0,urodzony,adj:sg:nom:m1:pos,ppas:sg:nom:m1:perf:aff
1,parafii,subst:sg:loc:f,subst:sg:gen:f
2,Rypnin,subst:sg:nom:m3,subst:sg:nom:m1
3,nie,part,conj
4,starszym,adj:sg:inst:m1:com,adj:sg:inst:m1:pos
5,p,brev:npun,brev:pun
6,wdokumentach,subst:pl:loc:m3,adj:pl:nom:m1:pos
7,adlinencjami,subst:pl:inst:f,subst:pl:inst:n:ncol
8,Ronantowizna,subst:sg:nom:f,subst:sg:gen:n:ncol
9,Żółtowizna,subst:sg:nom:f,subst:sg:inst:f
