In [203]:
import pandas as pd
import numpy as np
SEED=0

In [204]:
def process_df(file_name):
    df = pd.read_csv(file_name, sep="\t")
    # Filter failed perturbation strategies
    df = df[df['valid?']!='--']
    
    # Sample 50 unique indices
    np.random.seed(SEED)
    indices = np.random.choice(df['idx'].unique(), 50, replace=False)
    df = df[df['idx'].isin(indices)]
    yes_indices = set(df[df['valid?']=='Y']['idx'].unique())
    maybe_indices = set(df[df['valid?']=='M']['idx'].unique())
    num_successful = len(yes_indices.union(maybe_indices))
    print(f"num successful: {num_successful}")
    print(f"validity: {num_successful/50}")
    
    return df

In [205]:
import nltk
nltk.download('stopwords')

def get_valid_sample(df):
    np.random.seed(SEED)
    num_invalid=0
    new_df = pd.DataFrame(columns=df.columns)

    for didx in df['idx'].unique():
        temp_df = df[df['idx']==didx]
        valid_df = temp_df[temp_df['valid?'].isin(["Y", "M"])]
        if len(valid_df) == 0:
            num_invalid += 1
            continue
        row = valid_df.sample()
        new_df = new_df.append(row)
    print(f"{num_invalid}/{len(df['idx'].unique())} invalid")
    
    return new_df

from allennlp.data.tokenizers.spacy_tokenizer import SpacyTokenizer
import difflib
import more_itertools as mit
import numpy as np
from nltk.corpus import stopwords


PREPOSITIONS = ["at", "above", "across", "after", "against", "along", "among", "around", "at", "before",
               "behind", "below", "beneath", "beside", "between", "by", "down", "for","from", "in",
               "into", "near", "of", "off", "on", "to", "toward", "under", "upon", "with", "within"]

def get_edited_spans(orig, edited, tokenizer_wrapper=SpacyTokenizer(), return_type='new'):
    """ Given an orig and edited inputs, mark up differences in HTML. """

    orig_tok = tokenizer_wrapper.tokenize(orig)
    edited_tok = tokenizer_wrapper.tokenize(edited)

    orig_text_tok = [t.text for t in orig_tok]
    edited_text_tok = [t.text for t in edited_tok]

    edited_mark_indices, num_add, num_del = get_marked_indices(orig_text_tok, 
                                                        edited_text_tok, "+")
    orig_mark_indices, num_add_2, num_del_2 = get_marked_indices(orig_text_tok, 
                                                        edited_text_tok, "-")
    if return_type == "orig":
        indices = orig_mark_indices
        tok = orig_tok
        text = orig
    else:
        indices = edited_mark_indices
        tok = edited_tok
        text = edited
    
    grouped_indices = mit.consecutive_groups(indices)

    spans = []
    for group in grouped_indices:
        temp_span = []

        for tok_idx, idx in enumerate(group):
            token = tok[idx]
            start, end = token.idx, token.idx_end
            if start == None or end == None:
                logger.info(token, start, end)
            
            tok_text = text[start:end]
            if tok_idx == 0 and tok_text.lower() in PREPOSITIONS:
                continue
                
            if tok_text in stopwords.words('english'):
                continue

            if any([c.isalnum() for c in tok_text]) and len(tok_text) > 0:
                temp_span.append(tok_text)
        if temp_span == []:
            continue
        spans.append(" ".join(temp_span))

    return spans

def get_marked_indices(orig_tokinal, tokenized_contrast, symbol):
    """ Helper function for html_highlight_diffs. 
    Will only return indices of words deleted or replaced (not inserted). """

    index_offset = 0
    d = difflib.Differ()
    diff = d.compare(orig_tokinal, tokenized_contrast)
    list_diff = list(diff)
    tokens, modified_tokens, indices = [], [], []
    counter = 0
    additions, deletions = 0, 0

    for token_idx, token in enumerate(list_diff):
        marker = token[0]
        word = token[2:]
        if marker == symbol:        
            tokens.append(word)
            indices.append(counter)
            counter += 1
        elif marker == " ":
            modified_tokens.append(word)
            counter += 1

        if marker == "+":
            additions += 1
        if marker == "-":
            deletions += 1
            
    return indices, additions, deletions

def heuristically_pick_span(row, span_key):
    spans = row[span_key]
    if len(spans) == 1:
        return spans[0]

    temp_spans = [s for s in spans if s not in row['original']]
    if len(spans) == 1:
        return spans[0]
    if len(temp_spans) != 0:
        spans = temp_spans
    if spans == []:
        return ""
    num_toks = [len(" ".split(s)) for s in spans]
    idx = np.argmax(num_toks)
    return spans[idx]

import nltk
from nltk.util import ngrams

def measure_diversity(new_df):

    new_df['human_spans'] = new_df.apply(lambda row: get_edited_spans(row['original'], row['human_perturbed']), axis=1)
    new_df['tailor_spans'] = new_df.apply(lambda row: get_edited_spans(row['original'], row['tailor_perturbed']), axis=1)

    new_df['human_span'] = new_df.apply(lambda row: heuristically_pick_span(row, 'human_spans'), axis=1)
    new_df['tailor_span'] = new_df.apply(lambda row: heuristically_pick_span(row, 'tailor_spans'), axis=1)

    new_df['human_ngrams'] = new_df.apply(lambda row: list(ngrams(nltk.word_tokenize(row['human_span']), 1)), axis=1)
    new_df['tailor_ngrams'] = new_df.apply(lambda row: list(ngrams(nltk.word_tokenize(row['tailor_span']), 1)), axis=1)

    ngrams_dict = {}
    for key in ['tailor_ngrams', 'human_ngrams']:
        fdist = nltk.FreqDist(new_df[key].sum())
        print("\nKEY:", key)
        ngrams_dict[key] = set(fdist.keys())
        num_unique = (len(fdist.keys()))
        print(f"num unique: {num_unique}")
        num_total = len(new_df[key].sum())
        print(f"num total: {num_total}")
        print(f"ratio: {num_unique/num_total}")
        
    overlap_tokens = len(ngrams_dict['tailor_ngrams'] & ngrams_dict['human_ngrams'])
    print(f"num overlap tokens: {overlap_tokens}")
    print(f"num tailor tokens: {len(ngrams_dict['tailor_ngrams'])}")
    print(f"num human tokens: {len(ngrams_dict['human_ngrams'])}")
    return ngrams_dict

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alexisr/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [206]:
noun2verb = process_df('noun2verb.csv')
verb2noun = process_df('verb2noun.csv')

num successful: 43
validity: 0.86
num successful: 20
validity: 0.4


In [207]:
new_df = get_valid_sample(noun2verb)
measure_diversity(new_df)

7/50 invalid

KEY: tailor_ngrams
num unique: 60
num total: 77
ratio: 0.7792207792207793

KEY: human_ngrams
num unique: 54
num total: 62
ratio: 0.8709677419354839
num overlap tokens: 9
num tailor tokens: 60
num human tokens: 54


{'tailor_ngrams': {("'m",),
  ('Canada',),
  ('Comic',),
  ('Corps',),
  ('Dvoák',),
  ('Have',),
  ('I',),
  ('March',),
  ('Monday',),
  ('act',),
  ('bringing',),
  ('cases',),
  ('collects',),
  ('comprehensive',),
  ('course',),
  ('day',),
  ('different',),
  ('e-',),
  ('fair',),
  ('family',),
  ('first',),
  ('free',),
  ('get',),
  ('grounds',),
  ('justification',),
  ('kind',),
  ('know',),
  ('language',),
  ('last',),
  ('least',),
  ('life',),
  ('link',),
  ('living',),
  ('long',),
  ('make',),
  ('many',),
  ('minute',),
  ('moral',),
  ('one',),
  ('order',),
  ('orders',),
  ('papers',),
  ('past',),
  ('reasonable',),
  ('respect',),
  ('rest',),
  ('series',),
  ('sessions',),
  ('spite',),
  ('technique',),
  ('terrorist',),
  ('three',),
  ('time',),
  ('two',),
  ('version',),
  ('view',),
  ('way',),
  ('whole',),
  ('year',),
  ('years',)},
 'human_ngrams': {('2002',),
  ('November',),
  ('basis',),
  ('beginning',),
  ('benefit',),
  ('caution',),
  ('change

In [208]:
verb2noun = process_df('verb2noun.csv')
new_df = get_valid_sample(verb2noun)
measure_diversity(new_df)

num successful: 20
validity: 0.4
30/50 invalid

KEY: tailor_ngrams
num unique: 29
num total: 29
ratio: 1.0

KEY: human_ngrams
num unique: 29
num total: 29
ratio: 1.0
num overlap tokens: 2
num tailor tokens: 29
num human tokens: 29


{'tailor_ngrams': {('.',),
  ("It's'gimmick",),
  ('New',),
  ('President',),
  ('Russian',),
  ('U.S',),
  ('York',),
  ('accident',),
  ('around',),
  ('audience',),
  ('bacon',),
  ('corner',),
  ('economy',),
  ('father',),
  ('guys',),
  ('inside',),
  ('issues',),
  ('matter',),
  ('mission',),
  ('needing',),
  ('ocean',),
  ('pigeons',),
  ('profits',),
  ('run',),
  ('side',),
  ('speeding',),
  ('studying',),
  ('swans',),
  ('wide',)},
 'human_ngrams': {('2',),
  ('Europe',),
  ('Russian',),
  ('adjusted',),
  ('beef',),
  ('beginning',),
  ('bracket',),
  ('city',),
  ('complicated',),
  ('dollars',),
  ('government',),
  ('highest',),
  ('inflation',),
  ('inside',),
  ('optimization',),
  ('origin',),
  ('popular',),
  ('project',),
  ('robbery',),
  ('roundabout',),
  ('saying',),
  ('scholar',),
  ('sign',),
  ('subject',),
  ('tables',),
  ('tax',),
  ('team',),
  ('teams',),
  ('tower',)}}