# ERROR STATISTICS

### IMPORTS, VARIABLES

In [1]:
import pandas as pd

In [2]:
# Load the manually annotated error spreadsheets (lemmas, lowercased lemmas,
# XPOS tags and UPOS tags), one DataFrame per file.
lemma_errors = pd.read_excel(
    io='../data/results/all_lemmas_annotated.xlsx',
)
lowercase_lemma_errors = pd.read_excel(
    io='../data/results/all_lowercase_lemmas_annotated.xlsx',
)
xpos_errors = pd.read_excel(
    io='../data/results/all_xpos_annotated.xlsx',
)
upos_errors = pd.read_excel(
    io='../data/results/all_upos_annotated.xlsx',
)

In [3]:
# Load the "_generalized" variants of the same four annotated spreadsheets,
# one DataFrame per file.
lemma_errors_general = pd.read_excel(
    io='../data/results/all_lemmas_annotated_generalized.xlsx',
)
lowercase_lemma_errors_general = pd.read_excel(
    io='../data/results/all_lowercase_lemmas_annotated_generalized.xlsx',
)
xpos_errors_general = pd.read_excel(
    io='../data/results/all_xpos_annotated_generalized.xlsx',
)
upos_errors_general = pd.read_excel(
    io='../data/results/all_upos_annotated_generalized.xlsx',
)

### FUNCTIONS

In [4]:
def get_error_stats(errors: pd.DataFrame) -> pd.DataFrame:
    """Create statistics for manually annotated errors.

    Args:
        errors (DataFrame): a Pandas DataFrame created by reading from an .xlsx
            file containing manually annotated errors. It must contain an
            'Error Type' column.

    Returns:
        A DataFrame indexed by error type (index name 'error') with two
        columns: 'raw', the number of rows annotated with that error type, and
        'relative', that count as a share of all rows in `errors`, formatted as
        a percentage string with two decimals. Rows are sorted by 'raw' in
        descending order.

    Raises:
        KeyError: if `errors` has no 'Error Type' column.
    """
    # Count every error type in a single pass. value_counts() leaves out
    # missing values, so rows without an annotation are skipped instead of
    # raising a KeyError when their count is looked up.
    counts = errors['Error Type'].value_counts()

    # The denominator is the total number of rows, including unannotated ones.
    total = len(errors)

    stats_pd = pd.DataFrame(
        {
            'raw': counts.to_numpy(),
            'relative': ["{:.2%}".format(raw / total) for raw in counts],
        },
        index=pd.Index(counts.index, name='error'),
    )

    # A stable sort keeps tied error types in the order value_counts() produced
    # them, rather than in an arbitrary order.
    return stats_pd.sort_values('raw', ascending=False, kind='stable')

### EXECUTION - DETAILED

In [5]:
# Error-type frequencies for the annotated lemma errors.
get_error_stats(lemma_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
y,39,26.17%
proper name,30,20.13%
nie,19,12.75%
surname,12,8.05%
spelling,12,8.05%
abbreviation,8,5.37%
capitalization,8,5.37%
e,7,4.70%
ambiguous,3,2.01%
name,3,2.01%


In [6]:
# Error-type frequencies for the annotated lowercase-lemma errors.
get_error_stats(lowercase_lemma_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
y,38,32.20%
proper name,25,21.19%
nie,18,15.25%
spelling,12,10.17%
abbreviation,8,6.78%
e,7,5.93%
ambiguous,3,2.54%
unidentified,3,2.54%
problematic,2,1.69%
surname,1,0.85%


In [7]:
# Error-type frequencies for the annotated XPOS errors.
get_error_stats(xpos_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
ambiguous,199,38.20%
unidentified,65,12.48%
proper name,52,9.98%
y,39,7.49%
digits,25,4.80%
problematic,22,4.22%
nie,20,3.84%
spelling,18,3.45%
archaic,17,3.26%
foreign,16,3.07%


In [8]:
# Error-type frequencies for the annotated UPOS errors.
get_error_stats(upos_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
ambiguous,208,21.80%
capitalization,199,20.86%
y,109,11.43%
unidentified,63,6.60%
UD,58,6.08%
archaic,53,5.56%
surname,41,4.30%
e,41,4.30%
nie,28,2.94%
spelling,27,2.83%


### EXECUTION - GENERAL

In [9]:
# Error-type frequencies for the generalized lemma error annotations.
get_error_stats(lemma_errors_general)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
spelling,85,57.05%
name,45,30.20%
abbreviation,8,5.37%
ambiguous,5,3.36%
unidentified,3,2.01%
vocabulary,2,1.34%
grammar,1,0.67%


In [10]:
# Error-type frequencies for the generalized lowercase-lemma error annotations.
get_error_stats(lowercase_lemma_errors_general)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
spelling,75,63.56%
name,26,22.03%
abbreviation,8,6.78%
ambiguous,5,4.24%
unidentified,3,2.54%
grammar,1,0.85%


In [11]:
# Error-type frequencies for the generalized UPOS error annotations.
get_error_stats(upos_errors_general)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
spelling,404,42.35%
ambiguous,327,34.28%
vocabulary,79,8.28%
name,64,6.71%
unidentified,63,6.60%
abbreviation,11,1.15%
grammar,6,0.63%


In [12]:
# Error-type frequencies for the generalized XPOS error annotations.
get_error_stats(xpos_errors_general)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
ambiguous,254,48.75%
spelling,84,16.12%
name,66,12.67%
unidentified,65,12.48%
vocabulary,43,8.25%
grammar,7,1.34%
abbreviation,2,0.38%
