# BERT EVALUATION

### IMPORTS, VARIABLES

In [1]:
import pandas as pd

In [2]:
# Load the annotated error spreadsheets, one per annotation layer
# (lemmas, XPOS tags, UPOS tags). Paths are relative to the notebook's
# working directory. get_error_stats() below reads the 'Error Type'
# column of each table.
lemma_errors = pd.read_excel('../data/results/all_lemmas_annotated.xlsx')
xpos_errors = pd.read_excel('../data/results/all_xpos_annotated.xlsx')
upos_errors = pd.read_excel('../data/results/all_upos_annotated.xlsx')

### FUNCTIONS

In [3]:
def get_error_stats(errors: pd.DataFrame) -> pd.DataFrame:
    '''Compute raw and relative frequencies of manually annotated error types.

    Args:
        errors (DataFrame): a Pandas DataFrame created by reading an .xlsx file
            of manually annotated errors. It must contain an 'Error Type' column.

    Returns:
        A DataFrame indexed by error type (index name 'error') with two columns:
        'raw', the number of rows carrying that error type, and 'relative',
        that count divided by the total number of rows in `errors`. Rows are
        ordered from the most to the least frequent error type. Rows whose
        'Error Type' is missing (NaN/None) are not reported as a category of
        their own, but they still count towards the total used for 'relative'.

    Raises:
        KeyError: if `errors` has no 'Error Type' column.
    '''
    # Count every label in a single pass. value_counts() drops missing labels
    # and already returns the counts sorted in descending order, so no
    # per-label lookup (which fails on NaN/None) and no extra sort are needed.
    raw = errors['Error Type'].value_counts()

    # The denominator is the full row count, so 'relative' is the share of
    # all annotated rows. For an empty input `raw` is empty as well, hence
    # the division never actually divides by zero.
    total = len(errors)

    stats_pd = pd.DataFrame({'raw': raw, 'relative': raw / total})
    # Newer pandas versions name the value_counts() index after the source
    # column; normalise it so the result always exposes an 'error' index.
    stats_pd.index.name = 'error'

    return stats_pd

### EXECUTION 

In [4]:
# Display the error-type frequency table for the lemma annotations.
get_error_stats(lemma_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
y,35,0.241379
proper name,30,0.206897
nie,19,0.131034
spelling,12,0.082759
surname,12,0.082759
capitalization,8,0.055172
abbreviation,8,0.055172
e,7,0.048276
name,3,0.02069
unidentified,3,0.02069


In [5]:
# Display the error-type frequency table for the XPOS annotations.
get_error_stats(xpos_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
ambiguous,198,0.380038
unidentified,65,0.12476
proper name,52,0.099808
y,39,0.074856
digits,25,0.047985
problematic,22,0.042226
nie,20,0.038388
spelling,18,0.034549
archaic,17,0.03263
foreign,15,0.028791


In [6]:
# Display the error-type frequency table for the UPOS annotations.
get_error_stats(upos_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
ambiguous,208,0.218029
capitalization,199,0.208595
y,109,0.114256
unidentified,62,0.06499
archaic,59,0.061845
UD,58,0.060797
e,41,0.042977
surname,41,0.042977
nie,28,0.02935
ending,24,0.025157
