# ERROR STATISTICS

### IMPORTS, VARIABLES

In [1]:
import pandas as pd

In [2]:
# Load the annotated XPOS and UPOS mistake spreadsheets into DataFrames.
# Paths are relative, so they resolve against the current working directory.
xpos_errors = pd.read_excel('../data/results/new_stanza_XPOS_mistakes_annotated.xlsx')
upos_errors = pd.read_excel('../data/results/new_stanza_UPOS_mistakes_annotated.xlsx')

In [3]:
# Load the "generalized" variants of the annotated XPOS and UPOS mistake
# spreadsheets into DataFrames (same directory as the detailed ones).
xpos_errors_general = pd.read_excel('../data/results/new_stanza_XPOS_mistakes_annotated_generalized.xlsx')
upos_errors_general = pd.read_excel('../data/results/new_stanza_UPOS_mistakes_annotated_generalized.xlsx')

### FUNCTIONS

In [4]:
def get_error_stats(errors: pd.DataFrame) -> pd.DataFrame:
    '''Build a frequency table of the error types found in an annotation sheet.

    Args:
        errors (DataFrame): a Pandas DataFrame created by reading from an .xlsx
            file containing manually annotated errors. It must contain an
            'Error Type' column.

    Returns:
        A DataFrame indexed by error type (index name 'error') and sorted by
        descending frequency, with two columns: 'raw', the number of rows
        carrying that error type, and 'relative', that count as a share of
        all rows in ``errors``, formatted as a percentage string with two
        decimals.

    Raises:
        KeyError: if ``errors`` has no 'Error Type' column.
    '''
    # Count every label in a single pass. value_counts() returns the counts in
    # descending order and leaves out missing (NaN) labels, so rows without an
    # annotation no longer cause a failed lookup.
    counts = errors['Error Type'].value_counts()

    # The denominator is the full row count, so unannotated rows still count
    # toward the total even though they get no row of their own.
    total = len(errors)

    return pd.DataFrame(
        {
            'raw': counts.to_numpy(),
            'relative': ["{:.2%}".format(raw / total) for raw in counts],
        },
        index=counts.index.rename('error'),
    )

### EXECUTION - DETAILED

In [5]:
# Error-type frequency table for the detailed XPOS annotations.
get_error_stats(xpos_errors)

error,raw,relative
ambiguous,148,37.85%
unidentified,77,19.69%
proper name,46,11.76%
y,27,6.91%
digits,18,4.60%
foreign,13,3.32%
problematic,12,3.07%
spelling,11,2.81%
archaic,9,2.30%
nie,9,2.30%


In [6]:
# Error-type frequency table for the detailed UPOS annotations.
get_error_stats(upos_errors)

error,raw,relative
capitalization,185,30.73%
ambiguous,138,22.92%
y,58,9.63%
UD,55,9.14%
unidentified,37,6.15%
spelling,28,4.65%
archaic,17,2.82%
foreign,17,2.82%
problematic,16,2.66%
digits,14,2.33%


### EXECUTION - GENERAL

In [7]:
# Error-type frequency table for the generalized UPOS annotations.
get_error_stats(upos_errors_general)

error,raw,relative
spelling,293,48.67%
ambiguous,223,37.04%
unidentified,37,6.15%
vocabulary,35,5.81%
name,7,1.16%
abbreviation,4,0.66%
grammar,3,0.50%


In [8]:
# Error-type frequency table for the generalized XPOS annotations.
get_error_stats(xpos_errors_general)

error,raw,relative
ambiguous,184,47.06%
unidentified,77,19.69%
spelling,51,13.04%
name,49,12.53%
vocabulary,22,5.63%
grammar,4,1.02%
abbreviation,4,1.02%
