## Inspect Error Tag Distributions of GEC Datasets

In [1]:
import os

### BEA19

In [2]:
# Directory holding the W&I+LOCNESS (BEA19) M2 annotation files.
DATA_DIR = "datasets_train/wi+locness/m2/"

# Error categories that get a counter.
# NOTE(review): "K" looks like the three-letter tag "UNK" after
# parse_error_tags drops the first two characters of every tag - confirm.
tag_list = [
    "ADJ", "ADJ:FORM", "ADV", "CONJ", "CONTR",
    "DET", "K", "MORPH",
    "NOUN", "NOUN:INFL", "NOUN:NUM", "NOUN:POSS",
    "ORTH", "OTHER", "PART", "PREP", "PRON", "PUNCT", "SPELL",
    "VERB", "VERB:FORM", "VERB:INFL", "VERB:SVA", "VERB:TENSE",
    "WO",
]

# One zeroed counter per category, with the "noop" counter appended last.
tag_count_dict = {tag: 0 for tag in [*tag_list, "noop"]}
print(tag_count_dict)

{'ADJ': 0, 'ADJ:FORM': 0, 'ADV': 0, 'CONJ': 0, 'CONTR': 0, 'DET': 0, 'K': 0, 'MORPH': 0, 'NOUN': 0, 'NOUN:INFL': 0, 'NOUN:NUM': 0, 'NOUN:POSS': 0, 'ORTH': 0, 'OTHER': 0, 'PART': 0, 'PREP': 0, 'PRON': 0, 'PUNCT': 0, 'SPELL': 0, 'VERB': 0, 'VERB:FORM': 0, 'VERB:INFL': 0, 'VERB:SVA': 0, 'VERB:TENSE': 0, 'WO': 0, 'noop': 0}


In [3]:
def parse_error_tags(filepath, counts=None):
    """Tally the error tags of one M2 file into a counter dict.

    Every line that starts with ``A`` is treated as an annotation whose second
    ``|||``-separated field is the error tag. A ``noop`` tag is counted under
    ``"noop"``; any other tag is counted under the tag with its first two
    characters removed (``R:VERB:SVA`` is counted as ``VERB:SVA``).

    NOTE(review): a three-letter tag without an operation prefix, such as
    ``UNK``, is reduced to ``K`` by that slicing - presumably the origin of the
    ``K`` entry in ``tag_list``; confirm before renaming either side.

    Args:
        filepath: Path of the M2 file to read.
        counts: Dict mapping tag -> running count, updated in place. When
            omitted, the notebook-level ``tag_count_dict`` is used, which is
            the behaviour existing callers rely on.

    Raises:
        KeyError: If a parsed tag is not already a key of ``counts``.
    """
    if counts is None:
        counts = tag_count_dict
    # Explicit encoding: the default depends on the platform locale, and the
    # annotated text may contain non-ASCII characters.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if not line.startswith('A'):
                continue
            error_tag = line.split('|||')[1]
            key = "noop" if error_tag == "noop" else error_tag[2:]
            counts[key] += 1

In [4]:
# Start from zero so that re-running this cell does not stack a second pass
# on top of the counts already accumulated in tag_count_dict.
for tag in tag_count_dict:
    tag_count_dict[tag] = 0

# os.listdir returns entries in arbitrary order; sorting makes the traversal
# reproducible. Sub-directories are skipped because parse_error_tags can only
# open regular files.
# NOTE(review): every recorded count is even, which would be consistent with
# each annotation being read twice (e.g. a combined file stored next to its
# parts) - confirm the directory contents.
for filename in sorted(os.listdir(DATA_DIR)):
    filepath = os.path.join(DATA_DIR, filename)
    if os.path.isfile(filepath):
        parse_error_tags(filepath)
print(tag_count_dict)

{'ADJ': 2164, 'ADJ:FORM': 338, 'ADV': 2152, 'CONJ': 736, 'CONTR': 440, 'DET': 15922, 'K': 3640, 'MORPH': 2674, 'NOUN': 6208, 'NOUN:INFL': 176, 'NOUN:NUM': 5662, 'NOUN:POSS': 892, 'ORTH': 6782, 'OTHER': 18214, 'PART': 1194, 'PREP': 13946, 'PRON': 3712, 'PUNCT': 24814, 'SPELL': 5542, 'VERB': 8270, 'VERB:FORM': 5008, 'VERB:INFL': 64, 'VERB:SVA': 3136, 'VERB:TENSE': 8672, 'WO': 2272, 'noop': 25582}


In [5]:
# Turn the raw tag counts into relative frequencies (each count divided by
# the total number of counted annotations, "noop" included).
tag_count_sum = sum(tag_count_dict.values())
tag_prob_dict = dict.fromkeys([*tag_list, "noop"], 0.0)
tag_prob_dict.update(
    (tag, count / tag_count_sum) for tag, count in tag_count_dict.items()
)
print(tag_prob_dict)


{'ADJ': 0.012864718331629135, 'ADJ:FORM': 0.0020093691294319075, 'ADV': 0.01279337978265522, 'CONJ': 0.004375431003733384, 'CONTR': 0.0026157467957101752, 'DET': 0.09465436473022139, 'K': 0.02163935985542054, 'MORPH': 0.015896606663020472, 'NOUN': 0.03690580933583811, 'NOUN:INFL': 0.00104629871828407, 'NOUN:NUM': 0.03365990535752503, 'NOUN:POSS': 0.005302832140394264, 'ORTH': 0.04031816992842366, 'OTHER': 0.10828002758423894, 'PART': 0.00709818562290443, 'PREP': 0.08290728366585023, 'PRON': 0.022067391149264023, 'PUNCT': 0.14751622951989157, 'SPELL': 0.03294651986778589, 'VERB': 0.049164150001188976, 'VERB:FORM': 0.029771954438446723, 'VERB:INFL': 0.0003804722611942073, 'VERB:SVA': 0.01864314079851616, 'VERB:TENSE': 0.05155399139181509, 'WO': 0.01350676527239436, 'noop': 0.15208189665422206}


In [6]:
# Make probability of noop much lower since
# 1) Empirically it hurts recall a lot more than boost in precision
# 2) We preprocess out very short sentences, which are mostly noop, so the current probability is inflated
NOOP_PROB_OVERRIDE = 0.03

# Rebuild the raw frequencies from the counts instead of editing the existing
# tag_prob_dict: the override and re-normalisation below would otherwise be
# applied again on every re-run of this cell and drift further each time.
_tag_count_total = sum(tag_count_dict.values())
tag_prob_dict = {
    tag: count / _tag_count_total for tag, count in tag_count_dict.items()
}
tag_prob_dict["noop"] = NOOP_PROB_OVERRIDE

# Re-normalize the distribution. The override is applied before this step, so
# the final noop probability ends up slightly above 0.03 (about 0.0342 in the
# recorded output).
tag_prob_sum = sum(tag_prob_dict.values())
for tag in tag_prob_dict.keys():
    tag_prob_dict[tag] /= tag_prob_sum
print(tag_prob_dict)

{'ADJ': 0.014653665623936019, 'ADJ:FORM': 0.0022887888081748495, 'ADV': 0.01457240684968129, 'CONJ': 0.004983871487623341, 'CONTR': 0.002979488389340041, 'DET': 0.10781685030698211, 'K': 0.024648494857267608, 'MORPH': 0.018107163529761973, 'NOUN': 0.042037872547779485, 'NOUN:INFL': 0.0011917953557360162, 'NOUN:NUM': 0.03834059831918934, 'NOUN:POSS': 0.00604023555293481, 'ORTH': 0.045924750582964, 'OTHER': 0.12333727618963523, 'PART': 0.008085248038345474, 'PREP': 0.09443623881303684, 'PRON': 0.02513604750279598, 'PUNCT': 0.16802960202973585, 'SPELL': 0.03752801057664206, 'VERB': 0.056000838590550314, 'VERB:FORM': 0.033911995122306644, 'VERB:INFL': 0.00043338012935855135, 'VERB:SVA': 0.021235626338569017, 'VERB:TENSE': 0.05872300752808371, 'WO': 0.015384994592228574, 'noop': 0.034171752337340926}


### Check Error Tag Distribution of C4_200M After Filtering

In [7]:
from collections import defaultdict

In [8]:
def pretty_print_dist(d, target, decimals=4):
    """Print a tag distribution next to its deviation from a target.

    One line is printed per tag as ``tag:<TAB>observed<TAB>target - observed``,
    followed by a ``Total error`` line holding the sum of absolute deviations.

    Tags are printed in the iteration order of ``d``. A tag that is missing
    from ``target`` is compared against 0.0 instead of raising ``KeyError``.
    Tags that only exist in ``target`` are printed afterwards with an observed
    value of 0.0, so the total also accounts for target mass the sample never
    produced. When both dicts have the same keys the output is unchanged.

    Args:
        d: Dict mapping tag -> observed probability.
        target: Dict mapping tag -> target probability.
        decimals: Number of decimal places used for every printed number.
    """
    total_error = 0
    for key, value in d.items():
        diff = target.get(key, 0.0) - value
        total_error += abs(diff)
        print(f"{key}:\t{value:.{decimals}f}\t{diff:.{decimals}f}")
    # Target tags that never occurred in the observed distribution.
    for key, expected in target.items():
        if key not in d:
            total_error += abs(expected)
            print(f"{key}:\t{0.0:.{decimals}f}\t{expected:.{decimals}f}")
    print(f"Total error: {total_error:.{decimals}f}")

def get_distribution(tags, target, decimals=4):
    """Compute the relative frequency of every tag and print it against a target.

    Args:
        tags: Iterable of two-item entries; the second item of each entry is
            an iterable of tags. The first item is ignored.
        target: Dict mapping tag -> target probability, passed on to
            ``pretty_print_dist``.
        decimals: Number of decimal places for the printed comparison.

    Returns:
        Dict mapping each observed tag to its share of all tag occurrences,
        in order of first appearance.
    """
    occurrences = defaultdict(int)
    every_tag = (tag for _index, tag_set in tags for tag in tag_set)
    for tag in every_tag:
        occurrences[tag] += 1

    denominator = sum(occurrences.values())
    dist = {}
    for tag in occurrences:
        dist[tag] = occurrences[tag] / denominator

    pretty_print_dist(dist, target, decimals)
    return dist

In [9]:
# Target distribution: the BEA19 tag frequencies as printed before the noop
# override, copied in as literals.
target_dist = {
    "ADJ": 0.012864718331629135,
    "ADJ:FORM": 0.0020093691294319075,
    "ADV": 0.01279337978265522,
    "CONJ": 0.004375431003733384,
    "CONTR": 0.0026157467957101752,
    "DET": 0.09465436473022139,
    "K": 0.02163935985542054,
    "MORPH": 0.015896606663020472,
    "NOUN": 0.03690580933583811,
    "NOUN:INFL": 0.00104629871828407,
    "NOUN:NUM": 0.03365990535752503,
    "NOUN:POSS": 0.005302832140394264,
    "ORTH": 0.04031816992842366,
    "OTHER": 0.10828002758423894,
    "PART": 0.00709818562290443,
    "PREP": 0.08290728366585023,
    "PRON": 0.022067391149264023,
    "PUNCT": 0.14751622951989157,
    "SPELL": 0.03294651986778589,
    "VERB": 0.049164150001188976,
    "VERB:FORM": 0.029771954438446723,
    "VERB:INFL": 0.0003804722611942073,
    "VERB:SVA": 0.01864314079851616,
    "VERB:TENSE": 0.05155399139181509,
    "WO": 0.01350676527239436,
    # Not one of the BEA19 categories counted above; listed with zero mass so
    # that a SPACE tag in the checked file has a target entry to compare with.
    "SPACE": 0.0,
    "noop": 0.15208189665422206,
}

In [10]:
tag_path = "/media/tailen/My Passport/c4200m/val_12M_incorrectly_filtered_tags.tsv"

# Stream the file instead of calling readlines(): the raw lines of a
# multi-million-line file no longer have to sit in memory next to the parsed
# tag sets. The encoding is pinned so the read does not depend on the locale.
tags = []
with open(tag_path, "r", encoding="utf-8") as f:
    for index, line in enumerate(f):
        tag_set = line.strip().split("\t")
        # A blank line splits into [""]; it is counted as a single "noop" tag.
        if tag_set == [""]:
            tag_set = ["noop"]
        # get_distribution expects (index, tag_set) pairs.
        tags.append((index, tag_set))
get_distribution(tags, target_dist)

noop:	0.0887	0.0634
NOUN:	0.0711	-0.0342
OTHER:	0.1871	-0.0788
PUNCT:	0.1235	0.0241
PREP:	0.0834	-0.0005
ADV:	0.0153	-0.0025
VERB:TENSE:	0.0331	0.0185
NOUN:NUM:	0.0374	-0.0037
ORTH:	0.0543	-0.0140
VERB:	0.0505	-0.0014
SPELL:	0.0615	-0.0285
NOUN:POSS:	0.0071	-0.0018
DET:	0.0710	0.0237
MORPH:	0.0209	-0.0051
PRON:	0.0154	0.0067
CONJ:	0.0068	-0.0025
WO:	0.0092	0.0043
ADJ:	0.0137	-0.0008
NOUN:INFL:	0.0033	-0.0022
VERB:FORM:	0.0237	0.0061
PART:	0.0057	0.0014
VERB:SVA:	0.0122	0.0064
VERB:INFL:	0.0010	-0.0006
ADJ:FORM:	0.0010	0.0010
CONTR:	0.0023	0.0003
SPACE:	0.0006	-0.0006
K:	0.0003	0.0213
Total error: 0.3544


{'noop': 0.08869236428068565,
 'NOUN': 0.07113804095903306,
 'OTHER': 0.1871121693205399,
 'PUNCT': 0.12346298156658941,
 'PREP': 0.0833914733281126,
 'ADV': 0.01526511046599008,
 'VERB:TENSE': 0.0331015643745534,
 'NOUN:NUM': 0.03740840643679616,
 'ORTH': 0.054333077570090996,
 'VERB': 0.05052299627956768,
 'SPELL': 0.06147810060196648,
 'NOUN:POSS': 0.007059592835465133,
 'DET': 0.07096928995377805,
 'MORPH': 0.020948855261735445,
 'PRON': 0.015354759437531805,
 'CONJ': 0.00683810714106793,
 'WO': 0.009156851422650077,
 'ADJ': 0.013666194691198844,
 'NOUN:INFL': 0.0032753515426214943,
 'VERB:FORM': 0.023658363589861228,
 'PART': 0.0057138035685564146,
 'VERB:SVA': 0.012234975227879776,
 'VERB:INFL': 0.0009961582778959914,
 'ADJ:FORM': 0.0009845566462847093,
 'CONTR': 0.0023103067313193955,
 'SPACE': 0.0006280701476834969,
 'K': 0.00029847834054480207}

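# Wrong Distribution!!!

The total error against the BEA19 target is 0.3544, with the largest gaps on `OTHER`, `noop`, `NOUN` and `SPELL`. The next section repeats the check after the tag extraction bug fix.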

### Check Distribution After Tag Extraction Bug Fix

In [11]:
# Target distribution: the BEA19 tag frequencies as printed after the noop
# override and re-normalisation, copied in as literals.
target_dist = {
    "ADJ": 0.014653665623936019,
    "ADJ:FORM": 0.0022887888081748495,
    "ADV": 0.01457240684968129,
    "CONJ": 0.004983871487623341,
    "CONTR": 0.002979488389340041,
    "DET": 0.10781685030698211,
    "K": 0.024648494857267608,
    "MORPH": 0.018107163529761973,
    "NOUN": 0.042037872547779485,
    "NOUN:INFL": 0.0011917953557360162,
    "NOUN:NUM": 0.03834059831918934,
    "NOUN:POSS": 0.00604023555293481,
    "ORTH": 0.045924750582964,
    "OTHER": 0.12333727618963523,
    "PART": 0.008085248038345474,
    "PREP": 0.09443623881303684,
    "PRON": 0.02513604750279598,
    "PUNCT": 0.16802960202973585,
    "SPELL": 0.03752801057664206,
    "VERB": 0.056000838590550314,
    "VERB:FORM": 0.033911995122306644,
    "VERB:INFL": 0.00043338012935855135,
    "VERB:SVA": 0.021235626338569017,
    "VERB:TENSE": 0.05872300752808371,
    "WO": 0.015384994592228574,
    # Not one of the BEA19 categories counted above; listed with zero mass so
    # that a SPACE tag in the checked file has a target entry to compare with.
    "SPACE": 0.0,
    "noop": 0.034171752337340926,
}

In [12]:
tag_path = "/media/tailen/My Passport/c4200m/val_10M_filtered_tags.tsv"

# Stream the file instead of calling readlines(): the raw lines of a
# multi-million-line file no longer have to sit in memory next to the parsed
# tag sets. The encoding is pinned so the read does not depend on the locale.
tags = []
with open(tag_path, "r", encoding="utf-8") as f:
    for index, line in enumerate(f):
        tag_set = line.strip().split("\t")
        # A blank line splits into [""]; it is counted as a single "noop" tag.
        if tag_set == [""]:
            tag_set = ["noop"]
        # get_distribution expects (index, tag_set) pairs.
        tags.append((index, tag_set))
get_distribution(tags, target_dist)

PUNCT:	0.1615	0.0065
VERB:TENSE:	0.0590	-0.0002
SPELL:	0.0414	-0.0039
PREP:	0.1016	-0.0072
DET:	0.1134	-0.0056
VERB:	0.0595	-0.0035
WO:	0.0151	0.0003
ORTH:	0.0451	0.0008
VERB:FORM:	0.0376	-0.0037
NOUN:	0.0480	-0.0060
OTHER:	0.1259	-0.0026
NOUN:NUM:	0.0434	-0.0050
VERB:SVA:	0.0207	0.0005
PART:	0.0091	-0.0010
MORPH:	0.0228	-0.0047
CONTR:	0.0037	-0.0007
ADV:	0.0180	-0.0035
PRON:	0.0261	-0.0009
ADJ:	0.0179	-0.0033
NOUN:POSS:	0.0096	-0.0036
NOUN:INFL:	0.0045	-0.0033
CONJ:	0.0087	-0.0037
noop:	0.0026	0.0315
ADJ:FORM:	0.0016	0.0007
VERB:INFL:	0.0016	-0.0011
K:	0.0005	0.0241
SPACE:	0.0009	-0.0009
Total error: 0.1288


{'PUNCT': 0.16154378217133689,
 'VERB:TENSE': 0.058956627148458715,
 'SPELL': 0.04141206630557498,
 'PREP': 0.10164202832652858,
 'DET': 0.11344106523772288,
 'VERB': 0.05952182231712151,
 'WO': 0.015112622895858793,
 'ORTH': 0.045147432711446236,
 'VERB:FORM': 0.03762826803416939,
 'NOUN': 0.04801196596227299,
 'OTHER': 0.12592708229984592,
 'NOUN:NUM': 0.04338460684845387,
 'VERB:SVA': 0.02074482566644363,
 'PART': 0.009112714117042422,
 'MORPH': 0.022769089560864042,
 'CONTR': 0.003727843009283401,
 'ADV': 0.01804439650368952,
 'PRON': 0.026081735120955057,
 'ADJ': 0.017905213666814156,
 'NOUN:POSS': 0.00960173489525316,
 'NOUN:INFL': 0.00446607629946692,
 'CONJ': 0.008673065628939497,
 'noop': 0.0026350696548970965,
 'ADJ:FORM': 0.0015653367025611053,
 'VERB:INFL': 0.0015700388254285163,
 'K': 0.0005224058505693566,
 'SPACE': 0.0008510842390013819}