# BERT EVALUATION

### IMPORTS, VARIABLES

In [1]:
import conllu
import sklearn.metrics
import pandas as pd
import matplotlib.pyplot as plt
import time

# Lift pandas' row limit so displayed tables are shown in full rather than truncated.
pd.set_option('display.max_rows', None)

In [2]:
# Prediction files compared against the historical gold standard below
# (used in the "EXECUTION - HISTORICAL" cells).
xpos_predictions = './bert/polXPOS-model/test_predictions.txt'
upos_predictions = './bert/polUPOS-model/test_predictions.txt'

# Gold-standard files for the historical test data.
xpos_standard = './bert/hist_test_XPOS/test.txt'
upos_standard = './bert/hist_test_UPOS/test.txt'

# Label inventories, read with get_labels().
labels_xpos = './bert/data_XPOS/labels.txt'
labels_upos = './bert/data_UPOS/labels.txt'

# Gold-standard files used in the "EXECUTION - MODERN" cells.
test_upos_standard = './bert/data_UPOS/test.txt'
test_xpos_standard = './bert/data_XPOS/test.txt'

# Prediction files used in the "EXECUTION - MODERN" cells.
test_upos_predictions = './bert/test_UPOS/test_predictions.txt'
test_xpos_predictions = './bert/test_XPOS/test_predictions.txt'

In [3]:
from functions import *

### FUNCTIONS

In [4]:
def get_labels(filename: str) -> list:
    '''Read the label/tag inventory from a .txt file.

    The first line of the file is skipped; every following line is stripped
    of surrounding whitespace and returned as one label, in file order.

    Args:
        filename (str): The name of the file. It is read as UTF-8.

    Returns:
        A list of tags (empty if the file has fewer than two lines).
    '''
    # Explicit encoding so the result does not depend on the platform's
    # default locale encoding.
    with open(filename, encoding='utf-8') as f:
        # NOTE(review): the first line is dropped, exactly as before -
        # confirm that it is a header/placeholder and not a real label.
        next(f, None)
        return [line.strip() for line in f]

In [5]:
def get_tags_and_tokens(filename: str):
    '''Read a .txt file and split its lines into tokens and their tags.

    The raw lines are handed to split_tags_and_tokens(), which is defined
    outside this section (presumably in the `functions` module); the exact
    line format it expects is determined there.

    Args:
        filename (str): The name of the file. It is read as UTF-8.

    Returns:
        A list of tokens and a list of tags, as returned by
        split_tags_and_tokens().
    '''
    # Explicit encoding: the tokens contain non-ASCII (Polish) characters, so
    # relying on the platform default could garble them or raise on decode.
    with open(filename, encoding='utf-8') as f:
        lines = f.readlines()

    # The file is already closed here; parsing only needs the lines.
    tokens, tags = split_tags_and_tokens(lines)
    return tokens, tags

### EXECUTION - MODERN
Although the BERT tagger reports its own measures, it is not clear which kind of averaging it uses for precision, recall, and F1. For the sake of comparability, I recompute these measures here.

In [6]:
# Load the UPOS and XPOS label inventories (get_labels skips the first line of each file).
labels_upos_list = get_labels(labels_upos)
labels_xpos_list = get_labels(labels_xpos)

In [7]:
# Gold-standard tags for the modern test data; the tokens are not needed here and are discarded.
_, test_upos_standard_list = get_tags_and_tokens(test_upos_standard)
_, test_xpos_standard_list = get_tags_and_tokens(test_xpos_standard)

In [8]:
# Predicted tags for the modern test data; again only the tags are kept.
_, test_upos_predictions_list = get_tags_and_tokens(test_upos_predictions)
_, test_xpos_predictions_list = get_tags_and_tokens(test_xpos_predictions)

In [9]:
# UPOS measures, gold vs. predicted; with details=True the output below also
# lists per-class precision and recall. get_measures is defined outside this section.
get_measures(test_upos_standard_list, test_upos_predictions_list, details=True)

MEASURES:
Accuracy: 99.20%
Precision (weighted): 99.20%
Recall (weighted): 99.20%
F1 (weighted): 99.20%
Matthew's Correlation Coefficient: 99.08%

MEASURES PER CLASS:
Precision:
	ADJ: 99.11%
	ADP: 99.77%
	ADV: 97.73%
	AUX: 98.93%
	CCONJ: 97.64%
	DET: 98.82%
	INTJ: 87.50%
	NOUN: 99.58%
	NUM: 97.06%
	PART: 97.14%
	PRON: 99.62%
	PROPN: 95.88%
	PUNCT: 99.96%
	SCONJ: 98.68%
	SYM: 50.00%
	VERB: 99.72%
	X: 95.62%
Recall:
	ADJ: 99.55%
	ADP: 99.91%
	ADV: 98.44%
	AUX: 98.93%
	CCONJ: 97.99%
	DET: 99.06%
	INTJ: 70.00%
	NOUN: 99.26%
	NUM: 99.62%
	PART: 95.12%
	PRON: 99.44%
	PROPN: 98.12%
	PUNCT: 99.98%
	SCONJ: 98.40%
	SYM: 25.00%
	VERB: 99.72%
	X: 91.91%



In [10]:
# XPOS measures, gold vs. predicted; `details` is not passed and the output below
# shows only the aggregate measures.
# NOTE(review): the XPOS label list is passed explicitly - how get_measures uses
# `labels` is not visible in this section; confirm in its definition.
get_measures(test_xpos_standard_list, test_xpos_predictions_list, labels=labels_xpos_list)

MEASURES:
Accuracy: 95.65%
Precision (weighted): 95.13%
Recall (weighted): 95.65%
F1 (weighted): 95.29%
Matthew's Correlation Coefficient: 95.47%


### EXECUTION - HISTORICAL

In [11]:
# Historical gold standard: keep the tokens as well, since the tables built further down need them.
upos_token_list, upos_standard_list = get_tags_and_tokens(upos_standard)
xpos_token_list, xpos_standard_list = get_tags_and_tokens(xpos_standard)

In [12]:
# Predicted tags for the historical data; tokens are discarded (they are taken from the gold files).
_, upos_predictions_list = get_tags_and_tokens(upos_predictions)
_, xpos_predictions_list = get_tags_and_tokens(xpos_predictions)

In [13]:
# UPOS measures on the historical data, including the per-class breakdown (details=True).
get_measures(upos_standard_list, upos_predictions_list, details=True)

MEASURES:
Accuracy: 94.36%
Precision (weighted): 94.58%
Recall (weighted): 94.36%
F1 (weighted): 94.40%
Matthew's Correlation Coefficient: 93.61%

MEASURES PER CLASS:
Precision:
	ADJ: 94.12%
	ADP: 99.66%
	ADV: 87.61%
	AUX: 89.26%
	CCONJ: 98.84%
	DET: 93.73%
	NOUN: 95.51%
	NUM: 98.21%
	PART: 79.45%
	PRON: 93.69%
	PROPN: 83.58%
	PUNCT: 99.59%
	SCONJ: 87.44%
	VERB: 95.42%
	X: 82.76%
Recall:
	ADJ: 93.41%
	ADP: 98.65%
	ADV: 89.68%
	AUX: 84.05%
	CCONJ: 93.92%
	DET: 84.07%
	NOUN: 95.04%
	NUM: 82.71%
	PART: 83.65%
	PRON: 90.91%
	PROPN: 96.85%
	PUNCT: 100.00%
	SCONJ: 94.95%
	VERB: 95.76%
	X: 69.57%



In [14]:
# XPOS measures on the historical data; aggregate measures only, with the XPOS
# label list passed explicitly (its exact effect is defined in get_measures - confirm there).
get_measures(xpos_standard_list, xpos_predictions_list, labels=labels_xpos_list)

MEASURES:
Accuracy: 89.21%
Precision (weighted): 89.55%
Recall (weighted): 89.21%
F1 (weighted): 88.79%
Matthew's Correlation Coefficient: 88.86%


In [15]:
# Build the UPOS table from gold tags, predicted tags and tokens, then export it to Excel.
# get_full_table is defined outside this section; presumably it returns a pandas DataFrame.
full_upos = get_full_table(upos_standard_list, upos_predictions_list, upos_token_list)
full_upos.to_excel('../data/results/bert_UPOS.xlsx')

In [16]:
# Build the UPOS comparison table and export it to Excel. Judging by the rows displayed
# further down, it lists tokens whose prediction differs from the gold standard
# (get_comparison is defined outside this section - confirm there).
comparison_upos = get_comparison(upos_standard_list, upos_predictions_list, upos_token_list)
comparison_upos.to_excel('../data/mistakes/bert_UPOS_mistakes.xlsx')

In [17]:
# Same as for UPOS: build the XPOS table from gold tags, predictions and tokens, then export it to Excel.
full_xpos = get_full_table(xpos_standard_list, xpos_predictions_list, xpos_token_list)
full_xpos.to_excel('../data/results/bert_XPOS.xlsx')

In [18]:
# Same as for UPOS: build the XPOS comparison table (presumably the mismatches only) and export it to Excel.
comparison_xpos = get_comparison(xpos_standard_list, xpos_predictions_list, xpos_token_list)
comparison_xpos.to_excel('../data/mistakes/bert_XPOS_mistakes.xlsx')

In [19]:
# Display the UPOS comparison table in the notebook.
comparison_upos

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,Komornika,Jana Komornika ziemi,NOUN,PROPN
1,Dobrzyńskiej,ziemi Dobrzyńskiej za,PROPN,ADJ
2,ośm,za ośm tysięcy,NUM,X
3,jedną,i jedną ciotkę,NUM,ADJ
4,Ciotka,– Ciotka za,NOUN,PROPN
5,było,Melchiora było 17,AUX,VERB
6,było,Antonich było dwóch,AUX,VERB
7,Niewiem,. Niewiem o,VERB,INTJ
8,ale,– ale mi,CCONJ,PART
9,jego,Dziad jego a,DET,PRON


In [20]:
# Display the XPOS comparison table in the notebook.
comparison_xpos

Unnamed: 0,Token,Context,Gold Standard,Prediction
0,parafii,Godziszewo parafii Rypnin,subst:sg:loc:f,subst:sg:gen:f
1,starszym,był starszym bratem,adj:sg:inst:m1:com,adj:sg:inst:m1:pos
2,p,. p .,brev:npun,brev:pun
3,Pinińskich,z Pinińskich właścicieli,subst:pl:gen:m1,adj:pl:gen:m1:pos
4,Dóbr,właścicieli Dóbr Strużewo,subst:pl:gen:n:pt,subst:pl:gen:m3
5,Ronantowizna,", Ronantowizna ,",subst:sg:nom:f,subst:sg:gen:f
6,Żółtowizna,", Żółtowizna ,",subst:sg:nom:f,subst:sg:gen:n:ncol
7,Będowszczyazna,", Będowszczyazna ,",subst:sg:nom:f,subst:sg:gen:n:ncol
8,Bęklowizna,", Bęklowizna ,",subst:sg:nom:f,subst:sg:gen:m3
9,Ruszkowizna,", Ruszkowizna .",subst:sg:nom:f,subst:sg:gen:n:ncol
