In [1]:
from sibyl import init_transforms
from datasets import load_dataset
import numpy as np
from tqdm.notebook import tqdm

2022-08-03 20:49:20.568200: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


In [20]:
# Work on the first `num_samples` training rows of GLUE SST-2 only, and
# expose the sentence column under the name 'text' that later cells read.
num_samples = 1000
dataset = load_dataset(
    "glue",
    "sst2",
    split=f"train[:{num_samples}]",
).rename_column("sentence", "text")

Reusing dataset glue (/home/coraline/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [3]:
def all_equal(iterator):
    """
    Return True when every item of an indexable sequence equals the first.

    The comparison strategy is picked from the type of the first item:

    - ``np.ndarray``: element-wise comparison via ``np.array_equal``
    - ``list``: plain ``!=`` comparison against the first item
    - anything else: the items are collapsed into a ``set``; if they are
      not hashable (e.g. dicts) they are compared pairwise instead

    An empty sequence is considered all-equal (vacuous truth) instead of
    raising ``IndexError``.

    Parameters
    ----------
    iterator : sequence
        Indexable, sized collection of items (despite the name, a bare
        iterator/generator is not supported because it is indexed).

    Returns
    -------
    bool
    """
    if len(iterator) == 0:
        return True

    first = iterator[0]
    if isinstance(first, np.ndarray):
        return all(np.array_equal(first, item) for item in iterator)
    if isinstance(first, list):
        return not any(first != item for item in iterator)
    try:
        return len(set(iterator)) <= 1
    except TypeError:
        # Unhashable items cannot go into a set; compare them one by one.
        return not any(first != item for item in iterator)
    
def batched_equal(iterator):
    """
    Intakes a list of lists where each sublist is a batch
    and each batch[i] are checked for equivalence

    Position ``i`` of every sublist is gathered into one list and handed
    to ``all_equal``; the result is True only if every position agrees
    across all sublists.

    Edge cases:

    - an empty outer list is trivially equal (returns True)
    - sublists of different lengths cannot be equivalent, so False is
      returned instead of raising ``IndexError`` or silently ignoring
      the surplus items

    Parameters
    ----------
    iterator : list of sequences
        One sequence per trial / batch.

    Returns
    -------
    bool
    """
    if len(iterator) == 0:
        return True

    num_samples = len(iterator[0])
    if any(len(batch) != num_samples for batch in iterator):
        return False

    # Generator (not a list) so the scan stops at the first mismatch.
    return all(
        all_equal([batch[i] for batch in iterator])
        for i in range(num_samples)
    )

# Individual Study

In [4]:
# Table of candidate transformations for the SST-2 sentiment task. Later
# cells read two of its columns: 'transformation' (used as the name of the
# transform) and 'tran_fn' (the object whose `transform_batch` is called).
df = init_transforms(task_name="sentiment", dataset="sst2")

### Deterministic Single Text

In [21]:
# Deterministic check, one text at a time: every (text, target) pair is
# pushed through a transform `num_trials` times without touching its random
# state. A transform is recorded as stochastic as soon as one pair yields
# differing outputs across trials.
num_trials = 100
batch_size = 1
texts, targets = dataset['text'], dataset['label']

stochastic_transforms = set()
for i, row in tqdm(df.iterrows(), total=len(df)):
    t = row['tran_fn']
    for text, target in zip(texts, targets):
        candidate_texts, candidate_targets = [], []
        for trial in range(num_trials):
            try:
                batch = t.transform_batch(([text], [target]))
            except Exception as e:
                print(row['transformation'])
                print(e)
                # Record the failure itself as this trial's outcome. Falling
                # through with `batch` untouched would either raise NameError
                # (first call) or silently reuse the output of an earlier
                # call and corrupt the comparison.
                batch = ([None], [None])
            candidate_texts.extend(batch[0])
            candidate_targets.extend(batch[1])
        text_results = all_equal(candidate_texts)
        # Targets are only compared once the texts agree: a `None`
        # placeholder mixed with real targets is already caught by the text
        # check, so the target comparison never sees that mixture.
        target_results = text_results and all_equal(candidate_targets)
        if not text_results or not target_results:
            stochastic_transforms.add(row['transformation'])
            # One differing pair is enough; move on to the next transform.
            break
          

  0%|          | 0/39 [00:00<?, ?it/s]

In [74]:
# Every transform name that the single-text loop did not flag.
# NOTE: `stochastic_transforms` is reassigned by the batched study further
# down, so this cell only gives the single-text result when it is run
# directly after the single-text loop.
deterministic_transforms = set(df['transformation'].tolist()) - stochastic_transforms
deterministic_transforms

{'AddNegation',
 'AddNegativeEmoji',
 'AddNegativeLink',
 'AddNeutralEmoji',
 'AddPositiveEmoji',
 'AddPositiveLink',
 'ChangeAntonym',
 'ChangeHypernym',
 'ChangeHyponym',
 'ChangeLocation',
 'ChangeName',
 'ChangeNumber',
 'ChangeSynonym',
 'ContractContractions',
 'Demojify',
 'ExpandContractions',
 'HomoglyphSwap',
 'ImportLinkText',
 'InsertNegativePhrase',
 'InsertPositivePhrase',
 'InsertPunctuationMarks',
 'RandomCharDel',
 'RandomCharInsert',
 'RandomCharSubst',
 'RandomCharSwap',
 'RandomInsertion',
 'RandomSwap',
 'RandomSwapQwerty',
 'RemoveNegation',
 'RemoveNegativeEmoji',
 'RemoveNeutralEmoji',
 'RemovePositiveEmoji',
 'SentMix',
 'TextMix',
 'WordDeletion',
 'WordMix'}

### Reproducible Single Text

In [86]:
# Reproducibility check, one text at a time: before every trial the
# transform's random generator is rewound to the same state, then the whole
# dataset is transformed sample by sample. A transform is recorded in
# `outside_stochastic_transforms` when any sample differs between trials.
num_trials = 100
texts, targets = dataset['text'], dataset['label']

# State of a freshly seeded generator, read through the public
# `bit_generator.state` property rather than the pickling hook
# `__getstate__`, whose return value is an implementation detail.
rng_state = np.random.default_rng(41).bit_generator.state

outside_stochastic_transforms = set()
for i, row in tqdm(df.iterrows(), total=len(df)):
    t = row['tran_fn']

    trial_texts, trial_targets = [], []
    for trial in range(num_trials):
        # Rewind the transform's generator so each trial starts identically.
        # Assumes `t.np_random` is a numpy.random.Generator - TODO confirm.
        t.np_random.bit_generator.state = rng_state
        new_texts, new_targets = [], []
        for text, target in zip(texts, targets):
            try:
                batch = t.transform_batch(([text], [target]))
            except Exception as e:
                print(row['transformation'])
                print(e)
                # Keep one placeholder per failed sample so positions stay
                # aligned across trials. Falling through with `batch`
                # untouched would raise NameError (first call) or reuse the
                # previous sample's output.
                batch = ([None], [None])
            new_texts.extend(batch[0])
            new_targets.extend(batch[1])

        trial_texts.append(new_texts)
        trial_targets.append(new_targets)

    is_text_equal = batched_equal(trial_texts)
    # Targets are only compared once the texts agree, so a `None`
    # placeholder is never compared against a real target value.
    is_target_equal = is_text_equal and batched_equal(trial_targets)

    if not is_text_equal or not is_target_equal:
        outside_stochastic_transforms.add(row['transformation'])

  0%|          | 0/39 [00:00<?, ?it/s]

In [87]:
# Transform names that gave identical single-text outputs in every trial
# once their random state was reset, i.e. everything the loop above did not
# put into `outside_stochastic_transforms`.
reproducible_transforms = set(df['transformation'].tolist()) - outside_stochastic_transforms
reproducible_transforms

{'AddNegation',
 'AddNegativeEmoji',
 'AddNegativeLink',
 'AddNeutralEmoji',
 'AddPositiveEmoji',
 'AddPositiveLink',
 'ChangeAntonym',
 'ChangeHypernym',
 'ChangeHyponym',
 'ChangeLocation',
 'ChangeName',
 'ChangeNumber',
 'ChangeSynonym',
 'ContractContractions',
 'Demojify',
 'ExpandContractions',
 'HomoglyphSwap',
 'ImportLinkText',
 'InsertNegativePhrase',
 'InsertPositivePhrase',
 'InsertPunctuationMarks',
 'RandomCharDel',
 'RandomCharInsert',
 'RandomCharSubst',
 'RandomCharSwap',
 'RandomInsertion',
 'RandomSwap',
 'RandomSwapQwerty',
 'RemoveNegation',
 'RemoveNegativeEmoji',
 'RemoveNeutralEmoji',
 'RemovePositiveEmoji',
 'SentMix',
 'TextMix',
 'WordDeletion',
 'WordMix'}

In [89]:
# Transforms whose outputs still differed between trials despite the
# random-state reset before each trial.
outside_stochastic_transforms

{'Concept2Sentence', 'ConceptMix', 'Emojify'}

# Batched Study

### Deterministic Batch

In [78]:
# Restrict the batched study to the transforms listed in
# `deterministic_transforms` (computed in the single-text section).
batched_df = df[df['transformation'].isin(deterministic_transforms)]

In [24]:
# Deterministic check in batches: the dataset is cut into consecutive
# slices of `batch_size`, each slice goes through `transform_batch`, and
# the concatenated outputs of one pass form a trial. No random state is
# reset between trials. A transform whose trials disagree at any position
# is recorded in `stochastic_transforms`.
text, label = dataset['text'], dataset['label']

num_trials = 100
batch_size= 10

stochastic_transforms = set()
for row_idx, row in tqdm(batched_df.iterrows(), total=len(batched_df)):
    transform = row['tran_fn']

    trial_texts = []
    trial_targets = []
    for trial in range(num_trials):
        new_texts = []
        new_targets = []
        for start in range(0, len(label), batch_size):
            stop = start + batch_size
            transformed = transform.transform_batch((text[start:stop], label[start:stop]))
            new_texts += list(transformed[0])
            new_targets += list(transformed[1])

        trial_texts.append(new_texts)
        trial_targets.append(new_targets)

    is_text_equal = batched_equal(trial_texts)
    is_target_equal = batched_equal(trial_targets)

    if not (is_text_equal and is_target_equal):
        stochastic_transforms.add(row['transformation'])

  0%|          | 0/11 [00:00<?, ?it/s]

In [25]:
# Transforms in `batched_df` that the batched loop above did not flag.
set(batched_df['transformation'].tolist()) - stochastic_transforms

{'AddNegation',
 'ChangeNumber',
 'ContractContractions',
 'Demojify',
 'ExpandContractions',
 'ImportLinkText',
 'RemoveNegation',
 'RemoveNegativeEmoji',
 'RemoveNeutralEmoji',
 'RemovePositiveEmoji'}

### Reproducible Batch

In [90]:
# Restrict the batched reproducibility study to the names currently held in
# `reproducible_transforms`.
# NOTE: that name is reassigned again by a later cell of this section, so
# re-running this cell afterwards filters on the later value.
batched_df = df[df['transformation'].isin(reproducible_transforms)]

In [None]:
# Reproducibility check in batches: before every trial the transform's
# random generator is rewound to the same state, then the dataset is
# transformed in consecutive slices of `batch_size`. A transform whose
# trials disagree at any position is recorded in
# `batched_stochastic_transforms`.
text, label = dataset['text'], dataset['label']

# State of a freshly seeded generator, read through the public
# `bit_generator.state` property rather than the pickling hook
# `__getstate__`, whose return value is an implementation detail.
rng_state = np.random.default_rng(41).bit_generator.state

num_trials = 100
batch_size= 10

batched_stochastic_transforms = set()
for _, row in tqdm(batched_df.iterrows(), total=len(batched_df)):
    transform = row['tran_fn']

    trial_texts, trial_targets = [], []
    for trial in range(num_trials):
        # Rewind the transform's generator so each trial starts identically.
        # Assumes `transform.np_random` is a numpy.random.Generator - TODO confirm.
        transform.np_random.bit_generator.state = rng_state
        new_texts, new_targets = [], []
        for i in range(0, len(label), batch_size):
            text_batch = text[i:i+batch_size]
            label_batch = label[i:i+batch_size]
            batch = (text_batch, label_batch)
            batch = transform.transform_batch(batch)
            new_texts.extend(batch[0])
            new_targets.extend(batch[1])

        trial_texts.append(new_texts)
        trial_targets.append(new_targets)

    is_text_equal = batched_equal(trial_texts)
    is_target_equal = batched_equal(trial_targets)

    if not is_text_equal or not is_target_equal:
        batched_stochastic_transforms.add(row['transformation'])

  0%|          | 0/36 [00:00<?, ?it/s]

In [None]:
# Transforms whose batched outputs differed between trials despite the
# random-state reset before each trial.
batched_stochastic_transforms

In [82]:
# Narrow the set to the transforms in `batched_df` that the batched loop did
# not flag.
# NOTE: this overwrites the `reproducible_transforms` computed in the
# single-text section.
reproducible_transforms = set(batched_df['transformation'].tolist()) - batched_stochastic_transforms
reproducible_transforms

{'AddNegation',
 'AddNegativeEmoji',
 'AddNegativeLink',
 'AddNeutralEmoji',
 'AddPositiveEmoji',
 'AddPositiveLink',
 'ChangeAntonym',
 'ChangeHypernym',
 'ChangeHyponym',
 'ChangeLocation',
 'ChangeName',
 'ChangeNumber',
 'ChangeSynonym',
 'ContractContractions',
 'Demojify',
 'ExpandContractions',
 'HomoglyphSwap',
 'ImportLinkText',
 'InsertNegativePhrase',
 'InsertPositivePhrase',
 'InsertPunctuationMarks',
 'RandomCharDel',
 'RandomCharInsert',
 'RandomCharSubst',
 'RandomCharSwap',
 'RandomInsertion',
 'RandomSwap',
 'RandomSwapQwerty',
 'RemoveNegation',
 'RemoveNegativeEmoji',
 'RemoveNeutralEmoji',
 'RemovePositiveEmoji',
 'SentMix',
 'TextMix',
 'WordDeletion',
 'WordMix'}

In [83]:
# Number of transform names left in `reproducible_transforms`.
len(reproducible_transforms)

36

In [85]:
# NOTE(review): on a top-to-bottom run this name holds the set built by the
# "Deterministic Batch" loop, which reassigns it. The two reproducibility
# studies store their results in `outside_stochastic_transforms` and
# `batched_stochastic_transforms` - confirm which set is meant here.
stochastic_transforms

{'Concept2Sentence', 'ConceptMix', 'Emojify'}