In [None]:
from sibyl import init_transforms
from datasets import load_dataset
import numpy as np
from tqdm.notebook import tqdm

In [None]:
# Work with only the first `num_samples` rows of the GLUE SST-2 training split.
num_samples = 1000
train_slice = f"train[:{num_samples}]"

# Load and rename in one chain: the 'sentence' column is exposed as 'text',
# which is the column name the later cells read via dataset['text'].
dataset = load_dataset("glue", "sst2", split=train_slice).rename_column('sentence', 'text')

In [None]:
def all_equal(iterator):
    """
    Report whether every element of an indexable collection is equal.

    The comparison strategy is chosen from the type of the first element:
    NumPy arrays are compared with ``np.array_equal``, lists with ``!=``,
    and anything else by collapsing the collection into a ``set`` (so those
    elements must be hashable).

    Parameters
    ----------
    iterator : sequence
        Sized, indexable collection. Despite the name, a one-shot iterator
        will not work because the first element is looked up by index.

    Returns
    -------
    bool
        True when all elements are equal or the collection is empty,
        False otherwise.
    """
    # An empty collection has no differing elements; answering True here
    # also avoids an IndexError from the `iterator[0]` lookup below.
    if len(iterator) == 0:
        return True

    first = iterator[0]
    if isinstance(first, np.ndarray):
        # Element-wise comparison; `==` on arrays would not yield a single bool.
        return all(np.array_equal(first, item) for item in iterator)
    if isinstance(first, list):
        # Lists are unhashable, so they cannot take the set-based path below.
        return not any(first != item for item in iterator)
    # Hashable values: equal elements collapse to a single set member.
    return len(set(iterator)) <= 1
    
def batched_equal(iterator):
    """
    Check position-wise equality across several batches.

    Intakes a list of lists where each sublist is a batch; for every
    position ``i`` the elements ``batch[i]`` of all batches are compared
    with ``all_equal``.

    Parameters
    ----------
    iterator : list of lists
        The batches to compare. The number of positions checked is taken
        from the first batch, so every batch must be at least that long.

    Returns
    -------
    bool
        True when every position holds equal values across all batches,
        or when there are no batches / no positions to compare;
        False as soon as one position differs.
    """
    # Nothing to compare: avoid the IndexError from `iterator[0]`.
    if len(iterator) == 0:
        return True

    num_samples = len(iterator[0])
    # Generator (not a list) so the scan stops at the first mismatch.
    return all(
        all_equal([batch[i] for batch in iterator])
        for i in range(num_samples)
    )

# Individual Study

In [None]:
# Presumably one row per available transformation for the sentiment task on
# SST-2 (confirm against sibyl's `init_transforms`). The cells below iterate
# it with `df.iterrows()` and read the 'tran_fn' and 'transformation' columns.
df = init_transforms(
    task_name="sentiment",
    dataset="sst2",
)

In [None]:
num_trials = 100
batch_size = 1  # not read in this cell: each call below transforms a single (text, target) pair
texts, targets = dataset['text'], dataset['label']

# For every transform, apply it `num_trials` times to the same single sample.
# A transform is recorded as stochastic as soon as one sample yields differing
# texts or targets across trials.
stochastic_transforms = set()
for i, row in tqdm(df.iterrows(), total=len(df)):
    t = row['tran_fn']
    error_reported = False  # report a failing transform once, not once per trial
    for text, target in zip(texts, targets):
        candidate_texts, candidate_targets = [], []
        for trial in range(num_trials):
            try:
                batch = t.transform_batch(([text], [target]))
            except Exception as e:
                if not error_reported:
                    print(row['transformation'])
                    print(e)
                    error_reported = True
                # Skip the failed trial. Falling through would read `batch`
                # while it is undefined (NameError) or still holds the output
                # of an earlier trial, which would hide real differences.
                continue
            candidate_texts.extend(batch[0])
            candidate_targets.extend(batch[1])
        if not candidate_texts or not candidate_targets:
            # Every trial failed for this sample: nothing to compare.
            continue
        text_results = all_equal(candidate_texts)
        target_results = all_equal(candidate_targets)
        if not text_results or not target_results:
            stochastic_transforms.add(row['transformation'])
            # One differing sample is enough; move on to the next transform.
            break
          

In [None]:
# Every transformation name in `df` that the individual study did not flag.
deterministic_transforms = set(df['transformation'].tolist()).difference(stochastic_transforms)

# Batched Study

In [None]:
# Keep only the rows whose transformation is in `deterministic_transforms`.
batched_df = df.loc[df['transformation'].isin(deterministic_transforms)]

In [None]:
text, label = dataset['text'], dataset['label']

num_trials = 100
batch_size = 10

# Run each transform in `batched_df` over the whole dataset in batches of
# `batch_size`, `num_trials` times, and flag it as stochastic when any trial
# differs from the first one at any sample position.
stochastic_transforms = set()
# `total` must match the frame being iterated, otherwise the bar never completes.
for i, row in tqdm(batched_df.iterrows(), total=len(batched_df)):
    transform = row['tran_fn']

    # Output of the first trial; every later trial is compared against it.
    reference_texts, reference_targets = None, None
    for trial in range(num_trials):

        new_texts, new_targets = [], []
        # `start` (not `i`) so the outer loop's row index is not overwritten.
        for start in range(0, len(label), batch_size):
            text_batch = text[start:start + batch_size]
            label_batch = label[start:start + batch_size]
            batch = transform.transform_batch((text_batch, label_batch))
            new_texts.extend(batch[0])
            new_targets.extend(batch[1])

        if trial == 0:
            reference_texts, reference_targets = new_texts, new_targets
            continue

        is_text_equal = batched_equal([reference_texts, new_texts])
        is_target_equal = batched_equal([reference_targets, new_targets])

        if not is_text_equal or not is_target_equal:
            stochastic_transforms.add(row['transformation'])
            # A single differing trial settles it; skip the remaining trials
            # instead of computing and storing all of them.
            break

In [None]:
# Transforms that stayed deterministic in both studies. At this point
# `stochastic_transforms` only holds what the batched study flagged (it is
# re-created in that cell), so subtract it from `deterministic_transforms`
# rather than from every name in `df`; otherwise transforms flagged by the
# individual study would reappear in the result.
deterministic_transforms - stochastic_transforms

In [None]:
# NOTE(review): the commented-out names look deliberately excluded — confirm why.
{'AddNegation',
 'RemoveNegation',
 'ChangeNumber',
 'ContractContractions',
 'ExpandContractions',
 # 'Demojify',
 # 'RemoveNegativeEmoji',
 # 'RemoveNeutralEmoji',
 # 'RemovePositiveEmoji'
 # 'ImportLinkText',
}