# ERROR STATISTICS

### IMPORTS, VARIABLES

In [1]:
import pandas as pd

In [2]:
# Load each annotated mistakes spreadsheet into its own DataFrame.
lemma_errors = pd.read_excel(
    io='../data/mistakes/annotated/stanza_lemma_mistakes_annotated.xlsx'
)
lowercase_lemma_errors = pd.read_excel(
    io='../data/mistakes/annotated/stanza_lowercase_lemma_mistakes_annotated.xlsx'
)
xpos_errors = pd.read_excel(
    io='../data/mistakes/annotated/stanza_XPOS_mistakes_annotated.xlsx'
)
upos_errors = pd.read_excel(
    io='../data/mistakes/annotated/stanza_UPOS_mistakes_annotated.xlsx'
)

### FUNCTIONS

In [3]:
def get_error_stats(errors: pd.DataFrame) -> pd.DataFrame:
    '''Build a frequency table of the error types in manually annotated errors.

    Args:
        errors (DataFrame): a Pandas DataFrame created by reading from an .xlsx file
            containing manually annotated errors. It must have an 'Error Type' column.

    Returns:
        A DataFrame indexed by error type (index name 'error'), sorted by descending
        raw frequency, with two columns: 'raw' (number of rows carrying that error
        type) and 'relative' (that count divided by the total number of rows in
        `errors`, formatted as a percentage with two decimals).

    Raises:
        KeyError: if `errors` has no 'Error Type' column.
    '''
    # Count every label exactly once. value_counts() already sorts by descending
    # frequency and leaves out missing labels (NaN/None), so rows without an
    # annotation are skipped instead of triggering a failed lookup.
    counts = errors['Error Type'].value_counts()

    # The denominator is the total number of rows, so results for fully
    # annotated input are unchanged.
    total = len(errors)
    relative = ["{:.2%}".format(raw / total) for raw in counts]

    return pd.DataFrame(
        {'raw': counts.to_numpy(), 'relative': relative},
        index=pd.Index(counts.index, name='error'),
    )

### EXECUTION - DETAILED

In [4]:
# Error-type statistics for the annotated lemma mistakes.
get_error_stats(lemma_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
unidentified,210,39.11%
stanza,97,18.06%
spelling,95,17.69%
name,61,11.36%
ambiguous,37,6.89%
vocabulary,20,3.72%
grammar,9,1.68%
abbreviation,8,1.49%


In [5]:
# Error-type statistics for the annotated lowercase lemma mistakes.
get_error_stats(lowercase_lemma_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
unidentified,210,47.73%
spelling,95,21.59%
name,61,13.86%
ambiguous,37,8.41%
vocabulary,20,4.55%
grammar,9,2.05%
abbreviation,8,1.82%


In [6]:
# Error-type statistics for the annotated XPOS mistakes.
get_error_stats(xpos_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
ambiguous,199,49.50%
spelling,61,15.17%
name,55,13.68%
unidentified,54,13.43%
vocabulary,21,5.22%
abbreviation,5,1.24%
annotation,4,1.00%
grammar,3,0.75%


In [7]:
# Error-type statistics for the annotated UPOS mistakes.
get_error_stats(upos_errors)

Unnamed: 0_level_0,raw,relative
error,Unnamed: 1_level_1,Unnamed: 2_level_1
spelling,301,43.75%
ambiguous,244,35.47%
name,55,7.99%
unidentified,52,7.56%
vocabulary,29,4.22%
abbreviation,6,0.87%
grammar,1,0.15%
