In [1]:
!pip install language_tool_python --upgrade

Requirement already up-to-date: language_tool_python in /u/jm8wx/.conda/envs/torch/lib/python3.7/site-packages (2.2.1)


In [2]:
import language_tool_python
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

  from pandas import Panel


In [3]:
data_path = '../case_study_examples.csv'
df = pd.read_csv(data_path)

In [4]:
df.head()

Unnamed: 0,USE_similarity,dataset,original_class,original_text,perturbed_class,perturbed_text,run_type
0,0.930254,movie_review,0,davis is so enamored of her own creation that ...,1,davis is even enamored of her own creation tha...,jin
1,0.808168,movie_review,0,"somewhere in the middle, the film compels, as ...",1,"somewhere in the eastwards, the film commits, ...",jin
2,0.930277,movie_review,1,"a muckraking job, the cinematic equivalent of ...",0,"a muckraking job, the cinematic equivalent of ...",jin
3,0.951098,movie_review,0,it's hard to understand why anyone in his righ...,1,it's intense to understand why anyone in his r...,jin
4,0.912998,movie_review,0,never having seen the first two films in the s...,1,never having seen the first two cinema in the ...,jin


In [5]:
lang_tool = language_tool_python.LanguageTool("en-US")

100%|██████████| 217M/217M [00:14<00:00, 15.2MB/s] 


Unzipping /tmp/tmp9wm08sns.zip to /u/jm8wx/.cache/language_tool_python/.
Downloaded https://www.languagetool.org/download/LanguageTool-4.9.zip to /u/jm8wx/.cache/language_tool_python/.


In [6]:
def get_errors(text):
    return lang_tool.check(text)

def get_error_count(text):
    return len(get_errors(text))

In [7]:
df['original_text_errors'] = df['original_text'].progress_map(get_errors)
df['perturbed_text_errors'] = df['perturbed_text'].progress_map(get_errors)

100%|██████████| 3115/3115 [16:46<00:00,  3.09it/s]
100%|██████████| 3115/3115 [16:48<00:00,  3.09it/s]


In [8]:
df['original_text_error_count'] = df['original_text_errors'].map(len)
df['perturbed_text_error_count'] = df['perturbed_text_errors'].map(len)

In [9]:
original_text_errors = [e for l in df['original_text_errors'] for e in l]
perturbed_text_errors = [e for l in df['perturbed_text_errors'] for e in l]
print('Original errors:', len(original_text_errors))
print('Perturbed errors:', len(perturbed_text_errors))

Original errors: 49895
Perturbed errors: 52832


In [10]:
print('Added errors:', (df['original_text_error_count'] < df['perturbed_text_error_count']).sum())
print('Same # errors:', (df['original_text_error_count'] == df['perturbed_text_error_count']).sum())
print('Removed errors:', (df['original_text_error_count'] > df['perturbed_text_error_count']).sum())

Added errors: 1396
Same # errors: 1508
Removed errors: 211


In [11]:
df['run'] = df['dataset'] + '_' + df['run_type']
print(df['run'].unique())

['movie_review_jin' 'yelp_polarity_jin' 'imdb_jin' 'imdb_alzantot']


In [12]:
original_text_error_rule_ids = pd.Series([x.ruleId for x in original_text_errors])
perturbed_text_error_rule_ids = pd.Series([x.ruleId for x in perturbed_text_errors])

original_text_error_rule_freqs = original_text_error_rule_ids.value_counts().to_dict()
perturbed_text_error_rule_freqs = perturbed_text_error_rule_ids.value_counts().to_dict()

print('Original errors:', original_text_error_rule_ids.value_counts())
print()
print('Perturbed errors:', perturbed_text_error_rule_ids.value_counts())

Original errors: MORFOLOGIK_RULE_EN_US           20147
UPPERCASE_SENTENCE_START         9368
I_LOWERCASE                      9041
COMMA_PARENTHESIS_WHITESPACE     3853
EN_QUOTES                        1818
                                ...  
ENTIRELY_COMPRISED_OF               1
IT_SELF                             1
LESS_COMPARATIVE                    1
DAY_TO_DAY_HYPHEN                   1
BUY_TWO_GET_ONE_FREE                1
Length: 418, dtype: int64

Perturbed errors: MORFOLOGIK_RULE_EN_US           21299
UPPERCASE_SENTENCE_START         9414
I_LOWERCASE                      8734
COMMA_PARENTHESIS_WHITESPACE     3856
EN_QUOTES                        1818
                                ...  
DAY_TO_DAY_HYPHEN                   1
BLIND_FOR_FROM                      1
REGARDING_TO                        1
HE_THE                              1
SHORT_SUPERLATIVES                  1
Length: 456, dtype: int64


In [13]:
err_data = []

for rule_id, pert_freq in perturbed_text_error_rule_freqs.items():
    orig_freq = original_text_error_rule_freqs.get(rule_id, 0)
    freq_diff = pert_freq - orig_freq
    freq_ratio = pert_freq / orig_freq if orig_freq else None
    err_data.append({
        'freq_diff':  freq_diff,
        'orig_freq':  orig_freq,
        'pert_freq':  pert_freq,
        'freq_ratio': freq_ratio,
        'rule_id': rule_id
    })
    
err_data = pd.DataFrame(err_data)
err_data['freq_diff_perc'] = err_data['freq_diff'] / float(len(df))
err_data.head()

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
0,1152,20147,21299,1.05718,MORFOLOGIK_RULE_EN_US,0.369823
1,46,9368,9414,1.00491,UPPERCASE_SENTENCE_START,0.014767
2,-307,9041,8734,0.966044,I_LOWERCASE,-0.098555
3,3,3853,3856,1.000779,COMMA_PARENTHESIS_WHITESPACE,0.000963
4,0,1818,1818,1.0,EN_QUOTES,0.0


In [14]:
len(df)

3115

In [15]:
pd.set_option('display.max_rows', None)

err_data.sort_values(by='freq_diff', ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
0,1152,20147,21299,1.05718,MORFOLOGIK_RULE_EN_US,0.369823
5,403,99,502,5.070707,EN_A_VS_AN,0.129374
10,303,25,328,13.12,DID_BASEFORM,0.097271
11,274,20,294,14.7,A_PLURAL,0.087961
18,121,2,123,61.5,TO_NON_BASE,0.038844
21,109,3,112,37.333333,PRP_VBG,0.034992
17,104,20,124,6.2,THIS_NNS,0.033387
27,71,7,78,11.142857,NON3PRS_VERB,0.022793
30,67,6,73,12.166667,PRP_VB,0.021509
24,64,31,95,3.064516,BEEN_PART_AGREEMENT,0.020546


In [16]:
err_data[err_data['freq_ratio'].isna()].sort_values(by='freq_diff', ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
50,29,0,29,,ARTICLE_ADJECTIVE_OF,0.00931
116,9,0,9,,DONT_NEEDS,0.002889
136,6,0,6,,BE_TO_VBG,0.001926
143,6,0,6,,MENTION_ABOUT,0.001926
156,5,0,5,,WAS_BEEN,0.001605
171,4,0,4,,COMPARISONS_AS_ADJECTIVE_AS,0.001284
180,4,0,4,,IT_ITS,0.001284
228,3,0,3,,SAY_TELL,0.000963
233,3,0,3,,IS_EVEN_WORST,0.000963
225,3,0,3,,PRONOUN_NOUN,0.000963


In [17]:
err_data.sort_values(by=['pert_freq'], ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
0,1152,20147,21299,1.05718,MORFOLOGIK_RULE_EN_US,0.369823
1,46,9368,9414,1.00491,UPPERCASE_SENTENCE_START,0.014767
2,-307,9041,8734,0.966044,I_LOWERCASE,-0.098555
3,3,3853,3856,1.000779,COMMA_PARENTHESIS_WHITESPACE,0.000963
4,0,1818,1818,1.0,EN_QUOTES,0.0
5,403,99,502,5.070707,EN_A_VS_AN,0.129374
6,-17,450,433,0.962222,COMMA_COMPOUND_SENTENCE,-0.005457
7,0,428,428,1.0,PUNCTUATION_PARAGRAPH_END,0.0
8,-65,491,426,0.867617,EN_SPECIFIC_CASE,-0.020867
9,-91,508,417,0.820866,EN_COMPOUNDS,-0.029213


In [18]:
len(df)

3115

In [19]:
err_data.sort_values(by=['freq_ratio', 'freq_diff'], ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc
18,121,2,123,61.5,TO_NON_BASE,0.038844
21,109,3,112,37.333333,PRP_VBG,0.034992
11,274,20,294,14.7,A_PLURAL,0.087961
10,303,25,328,13.12,DID_BASEFORM,0.097271
30,67,6,73,12.166667,PRP_VB,0.021509
57,22,2,24,12.0,PRP_PAST_PART,0.007063
39,42,4,46,11.5,PRP_MD_NN,0.013483
27,71,7,78,11.142857,NON3PRS_VERB,0.022793
110,8,1,9,9.0,COULDVE_IRREGULAR_VERB,0.002568
40,39,5,44,8.8,SUPERLATIVE_THAN,0.01252


In [20]:
perturbed_text_errors[0]

Match({'ruleId': 'UPPERCASE_SENTENCE_START', 'message': 'This sentence does not start with an uppercase letter', 'replacements': ['Davis'], 'context': 'davis is even enamored of her own creation th...', 'offset': 0, 'errorLength': 5, 'category': 'CASING', 'ruleIssueType': 'typographical'})

In [21]:
for err in perturbed_text_errors:
    if err.ruleId == 'MORFOLOGIK_RULE_EN_US':
        badword = err.context[err.offset:err.offset+err.errorLength]
        goodwords = err.replacements
        print(err.message + ':', 'Replace',badword,'with one of',goodwords)
        print(err.category)
        print()
        break

Possible spelling mistake found.: Replace exper with one of ['lemme']
TYPOS



In [31]:
def yield_errs(err_code, num_to_print=1):
    n = 0
    for err in perturbed_text_errors:
        if n >= num_to_print: 
            break
        if err.ruleId == err_code: 
            print('err:', err)
            badword = err.context[err.offset:err.offset+err.errorLength]
            goodwords = err.replacements
            err_str = err_code + ': ' + err.message + ' || Replace '+badword+' with one of ' + '[' +','.join(goodwords) + ']'
            context = err.context
            cat = err.category
            yield err_str
            n += 1

def get_err(err_code):
    ans = list(yield_errs(err_code, num_to_print=1))
    return ans[0] if len(ans) else None

def get_errs(*args):
    return list(yield_errs(*args))

def print_errs(*args):
    print(get_errs(*args))

print_errs('I_AM')

[]


In [32]:
def yield_errs_context(err_code, num_to_print=1):
    n = 0
    for err in perturbed_text_errors:
        if n >= num_to_print: 
            break
        if err.ruleId == err_code: 
            badword = err.context[err.offset:err.offset+err.errorLength]
            goodwords = err.replacements
            err_str = err_code + ': ' + err.message + ' || Replace '+badword+' with one of ' + '[' +','.join(goodwords) + ']'
            context = err.context
            cat = err.category
            yield context
            n += 1

def get_err_context(err_code):
    ans = list(yield_errs_context(err_code, num_to_print=1))
    return ans[0] if len(ans) else None

def get_errs_context(*args):
    return list(yield_errs_context(*args))

def print_errs_context(*args):
    print(get_errs_context(*args))

print_errs('I_AM')

[]


In [33]:
get_err('DID_BASEFORM')

err: Offset 62, length 8, Rule ID: DID_BASEFORM
Message: The verb 'can't' requires base form of this verb: "compare"
Suggestion: compare
...first two cinema in the series, i can't compares friday after next to them, but nothing ...
                                                              ^^^^^^^^


'DID_BASEFORM: The verb \'can\'t\' requires base form of this verb: "compare" || Replace er next  with one of [compare]'

In [34]:
get_errs('DID_BASEFORM', 20)

err: Offset 62, length 8, Rule ID: DID_BASEFORM
Message: The verb 'can't' requires base form of this verb: "compare"
Suggestion: compare
...first two cinema in the series, i can't compares friday after next to them, but nothing ...
                                                              ^^^^^^^^
err: Offset 16, length 6, Rule ID: DID_BASEFORM
Message: The verb 'doesn't' requires base form of this verb: "say"
Suggestion: say
while i doesn't saying it's on par with the first one, stuart ...
                ^^^^^^
err: Offset 60, length 11, Rule ID: DID_BASEFORM
Message: The verb 'cannot' requires the base form of the verb: "contribute"
Suggestion: contribute
...irking there's a part of we that cannot contributes be entertained by the sight of someone ...
                                                            ^^^^^^^^^^^
err: Offset 10, length 8, Rule ID: DID_BASEFORM
Message: The verb 'can't' requires base form of this verb: "repulse"
Suggestion: repulse
one can't repulsed its

['DID_BASEFORM: The verb \'can\'t\' requires base form of this verb: "compare" || Replace er next  with one of [compare]',
 'DID_BASEFORM: The verb \'doesn\'t\' requires base form of this verb: "say" || Replace saying with one of [say]',
 'DID_BASEFORM: The verb \'cannot\' requires the base form of the verb: "contribute" || Replace tertained b with one of [contribute]',
 'DID_BASEFORM: The verb \'can\'t\' requires base form of this verb: "repulse" || Replace repulsed with one of [repulse]',
 'DID_BASEFORM: The verb \'could\' requires the base form of the verb: "become" || Replace d becom with one of [become]',
 'DID_BASEFORM: The verb \'doesn\'t\' requires base form of this verb: "make" || Replace  with one of [make]',
 'DID_BASEFORM: The verb \'ll\' requires the base form of the verb: "take" || Replace  with one of [take]',
 'DID_BASEFORM: The verb \'can\'t\' requires base form of this verb: "behave" || Replace behaving with one of [behave]',
 'DID_BASEFORM: The verb \'will\' requires

In [35]:
pd.options.display.max_colwidth = 500
err_data['Explanation'] = err_data['rule_id'].map(get_err)
err_data['Context'] = err_data['rule_id'].map(get_err_context)
err_data.head()

err: Offset 49, length 5, Rule ID: MORFOLOGIK_RULE_EN_US
Message: Possible spelling mistake found.
Suggestion: lemme
... in the eastwards, the film commits, as demme experimentation he captures a few scene...
                                                 ^^^^^
err: Offset 0, length 5, Rule ID: UPPERCASE_SENTENCE_START
Message: This sentence does not start with an uppercase letter
Suggestion: Davis
davis is even enamored of her own creation th...
^^^^^
err: Offset 54, length 1, Rule ID: I_LOWERCASE
Message: Did you mean "I"?
Suggestion: I
...een the first two cinema in the series, i can't compares friday after next to the...
                                                      ^
err: Offset 10, length 2, Rule ID: COMMA_PARENTHESIS_WHITESPACE
Message: Don't put a space after the opening parenthesis
Suggestion: (
rosenthal ( halloween iii ) sounds to recieve ignore...
          ^^
err: Offset 220, length 1, Rule ID: EN_QUOTES
Message: Use a smart opening quote here: "“".
Suggestion: “

err: Offset 682, length 8, Rule ID: MANY_NN_U
Message: Possible agreement error. The noun may seems to be uncountable; consider using: "much may", "a good deal of may".
Suggestion: much may; a good deal of may
...gs ( flipping cars and head squashing ) many may think it sound like utter crap that onl...
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          ^^^^^^^^
err:

err: Offset 682, length 6, Rule ID: AFFECT_EFFECT
Message: Did you mean "effect"?
Suggestion: effect
...fferent goal. it has an almost dizzying affect on you but at the same time is exciting...
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          ^^^^^^
err: Offset 460, length 8, Rule ID: CA_BUG_EYED
Message: 'bug eyed' seems to be a compound adjective in front of a no

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc,Explanation,Context
0,1152,20147,21299,1.05718,MORFOLOGIK_RULE_EN_US,0.369823,MORFOLOGIK_RULE_EN_US: Possible spelling mistake found. || Replace exper with one of [lemme],"... in the eastwards, the film commits, as demme experimentation he captures a few scene..."
1,46,9368,9414,1.00491,UPPERCASE_SENTENCE_START,0.014767,UPPERCASE_SENTENCE_START: This sentence does not start with an uppercase letter || Replace davis with one of [Davis],davis is even enamored of her own creation th...
2,-307,9041,8734,0.966044,I_LOWERCASE,-0.098555,"I_LOWERCASE: Did you mean ""I""? || Replace p with one of [I]","...een the first two cinema in the series, i can't compares friday after next to the..."
3,3,3853,3856,1.000779,COMMA_PARENTHESIS_WHITESPACE,0.000963,COMMA_PARENTHESIS_WHITESPACE: Don't put a space after the opening parenthesis || Replace ( with one of [(],rosenthal ( halloween iii ) sounds to recieve ignore...
4,0,1818,1818,1.0,EN_QUOTES,0.0,"EN_QUOTES: Use a smart opening quote here: ""“"". || Replace with one of [“]","...anctimonious lloyd c douglas, (he wrote ""the robe""), and already filmed in 1935 w..."


In [36]:
err_data.sort_values(by=['freq_ratio','freq_diff'], ascending=False)

Unnamed: 0,freq_diff,orig_freq,pert_freq,freq_ratio,rule_id,freq_diff_perc,Explanation,Context
18,121,2,123,61.5,TO_NON_BASE,0.038844,"TO_NON_BASE: Did you mean ""know""? || Replace with one of [know]",...ees at person they don't really want to knew
21,109,3,112,37.333333,PRP_VBG,0.034992,"PRP_VBG: Did you mean ""we're wanting"", ""we are wanting"", or ""we were wanting""? || Replace we wanting with one of [we're wanting,we are wanting,we were wanting]",while we wanting macdowell's character to retrieve her h...
11,274,20,294,14.7,A_PLURAL,0.087961,"A_PLURAL: Don't use indefinite articles with plural words. Did you mean ""a grate"", ""a gratis"" or simply ""grates""? || Replace a grates with one of [a grate,a gratis,grates]","a grates, lanky flick"
10,303,25,328,13.12,DID_BASEFORM,0.097271,"DID_BASEFORM: The verb 'can't' requires base form of this verb: ""compare"" || Replace er next with one of [compare]","...first two cinema in the series, i can't compares friday after next to them, but nothing ..."
30,67,6,73,12.166667,PRP_VB,0.021509,"PRP_VB: Do not use a noun immediately after the pronoun 'it'. Use a verb or an adverb, or possibly some other part of speech. || Replace e a rea with one of []","...ble of being gravest, so thick with wry it game like a readings from bartlett's familia..."
57,22,2,24,12.0,PRP_PAST_PART,0.007063,"PRP_PAST_PART: Possible grammatical error. You used a past participle without using any required verb ('be' or 'have'). Did you mean ""underwent""? || Replace undergone with one of [underwent]",e ad i undergone and bruised on the hard territories of ...
39,42,4,46,11.5,PRP_MD_NN,0.013483,"PRP_MD_NN: It seems that a verb or adverb has been misspelled or is missing here. || Replace with one of [can be appreciative,can have appreciative]","...y bit as awful as borchardt's coven, we can appreciative it anyway"
27,71,7,78,11.142857,NON3PRS_VERB,0.022793,"NON3PRS_VERB: The pronoun 'they' must be used with a non-third-person form of a verb: ""do"" || Replace does with one of [do]",they does a ok operating of painting this family ...
110,8,1,9,9.0,COULDVE_IRREGULAR_VERB,0.002568,"COULDVE_IRREGULAR_VERB: Did you mean ""we've sought""? || Replace thing... with one of [we've sought]",...d here each times we going to los vegas we've seek other well known restaurant and nothing...
40,39,5,44,8.8,SUPERLATIVE_THAN,0.01252,"SUPERLATIVE_THAN: Use ""stronger"" to make a comparison with this adjective. || Replace with one of [stronger]",...e and action sequences and it's all the strongest than of it


In [49]:
min_pert_freq = 25
min_pert_ratio = 5.0


explanatory_df = err_data \
    [(err_data['pert_freq'] > min_pert_freq) & (err_data['freq_ratio'] > min_pert_ratio)] \
    .sort_values(by=['freq_ratio','freq_diff'], ascending=False) \
    .reset_index() \
    .drop(['freq_ratio', 'freq_diff', 'index', 'freq_diff_perc'], axis=1) 

explanatory_df = explanatory_df[['rule_id', 'orig_freq', 'pert_freq', 'Explanation', 'Context']]
explanatory_df.head()

Unnamed: 0,rule_id,orig_freq,pert_freq,Explanation,Context
0,TO_NON_BASE,2,123,"TO_NON_BASE: Did you mean ""know""? || Replace with one of [know]",...ees at person they don't really want to knew
1,PRP_VBG,3,112,"PRP_VBG: Did you mean ""we're wanting"", ""we are wanting"", or ""we were wanting""? || Replace we wanting with one of [we're wanting,we are wanting,we were wanting]",while we wanting macdowell's character to retrieve her h...
2,A_PLURAL,20,294,"A_PLURAL: Don't use indefinite articles with plural words. Did you mean ""a grate"", ""a gratis"" or simply ""grates""? || Replace a grates with one of [a grate,a gratis,grates]","a grates, lanky flick"
3,DID_BASEFORM,25,328,"DID_BASEFORM: The verb 'can't' requires base form of this verb: ""compare"" || Replace er next with one of [compare]","...first two cinema in the series, i can't compares friday after next to them, but nothing ..."
4,PRP_VB,6,73,"PRP_VB: Do not use a noun immediately after the pronoun 'it'. Use a verb or an adverb, or possibly some other part of speech. || Replace e a rea with one of []","...ble of being gravest, so thick with wry it game like a readings from bartlett's familia..."


In [50]:
latex = explanatory_df.to_latex()
for line in latex.split('\n'):
    print(line)

\begin{tabular}{llrrll}
\toprule
{} &               rule\_id &  orig\_freq &  pert\_freq &                                                                                                                                                                  Explanation &                                                                                         Context \\
\midrule
0  &           TO\_NON\_BASE &          2 &        123 &                                                                                                             TO\_NON\_BASE: Did you mean "know"? || Replace  with one of [know] &                                                 ...ees at person they don't really want to knew \\
1  &               PRP\_VBG &          3 &        112 &              PRP\_VBG: Did you mean "we're wanting", "we are wanting", or "we were wanting"? || Replace we wanting with one of [we're wanting,we are wanting,we were wanting] &                                     while we wanting macdowel

In [47]:
df['run'].unique()

array(['movie_review_jin', 'yelp_polarity_jin', 'imdb_jin',
       'imdb_alzantot'], dtype=object)

In [40]:
len(df)

3115

In [41]:
df['introduced_error'] = df['original_text_error_count'] < df['perturbed_text_error_count']
df.groupby('run')['introduced_error'].mean()

run
imdb_alzantot        0.319092
imdb_jin             0.528424
movie_review_jin     0.283430
yelp_polarity_jin    0.611726
Name: introduced_error, dtype: float64

In [42]:
df['introduced_error'].mean()

0.4481540930979133