### Jupyter Notebook to Create Data Augmentations for Adversarial Training 

We Plan to create 4 Different Augmented Datasets based on different recipes of making 

1) EmbeddingAugmenter

2) SynonymInsertionAugmenter

3) WordNetAugmenter 

4) BackTranslationAugmenter

In [3]:
import numpy as np
import pandas as pd
import torch
import transformers
import warnings
import time
import csv        


def get_sst_examples(input_file, test=False, discard_values = 0.5):

    train_examples = []
    test_examples = []

    with open(input_file, 'r') as f:

        contents = f.read()
        file_as_list = contents.splitlines()
        for line in file_as_list[1:]:
            
            is_dropped = np.random.binomial(1, discard_values, 1)[0]
            
            if not test and is_dropped == 1:
                continue
                
                
            text, label = line.split("\t") 
            if test:
                test_examples.append((text, label))
            else : 
                train_examples.append((text, label))
        f.close()

    return train_examples, test_examples


In [4]:
labeled_examples, _ = get_sst_examples('./../../data/SST-2/train.tsv',test=False,discard_values = 0)

len(labeled_examples)

67349

#### Let the new Data Augmentation have the original 60K .
#### New 20K data will be created by augmenter in random.
#### 2 Variations for 10K Examples 

In [30]:
def generate_augmented_examples(input_examples,output_tsv,Augmenter , pct_words_to_swap=0.25 , transformations_per_example = 2):
    print(f"Length of Original Document - {len(input_examples)} \n")
    
    augmented_examples = [] 
    
    print(f"Initiating Creation of Data Augmentation\n")
    
    rng = np.random.default_rng() 
    
    augmented_indexes = rng.choice(len(input_examples), 20_000, replace=False)
    
    augmenter = Augmenter(pct_words_to_swap = pct_words_to_swap, transformations_per_example = transformations_per_example)
    
    for index in augmented_indexes : 
        
        augmented_strings = augmenter.augment(input_examples[index][0])
        
        augmented_examples += [(x,input_examples[index][1]) for x in augmented_strings]
        
        if len(augmented_examples) % 10 == 0 : 
            print(f"Generated {len(augmented_examples)} out of 20_000 Examples ", end = "\r")
    
    print(f"Data Generated , Writing it to Augmented Tab Separated Format {output_tsv} : ")
    
    with open(output_tsv, 'w', newline='') as f:
        writer = csv.writer(f, delimiter='\t')
        
        for examples in augmented_examples:
            writer.writerow(examples)
    
    print(f"All Output Written to {output_tsv}")


In [33]:
from textattack.augmentation.recipes import EmbeddingAugmenter,SynonymInsertionAugmenter

generate_augmented_examples(labeled_examples,"easydataaugmented.tsv",SynonymInsertionAugmenter, pct_words_to_swap=0.2 , transformations_per_example = 2)

Length of Original Document - 67349 

Initiating Creation of Data Augmentation

Data Generated , Writing it to Augmented Tab Separated Format easydataaugmented.tsv : 


In [16]:
%%timeit -n 1 -r 1 
# EmbeddingAugmenter 

# Alter default values if desired

augmenter = EmbeddingAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
s = labeled_examples[2][0]# Augment


print(augmenter.augment(s))

['that adore its characters and communicates something rather glamorous about human nature ', 'that loves its characters and communicates something fairly leggy about human nature ', 'that loves its characters and communicates something rather exquisite about human personages ', 'that loves its characters and communicates something rather resplendent about human personage ', 'that loves its features and communicates something rather fantastic about human nature ']
431 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [17]:
%%timeit -n 1 -r 1 
# SynonymInsertionAugmenter 


from textattack.augmentation.recipes import SynonymInsertionAugmenter
# Alter default values if desired

augmenter = SynonymInsertionAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
s = labeled_examples[i][0]# Augment


print(augmenter.augment(s))

NameError: name 'i' is not defined

In [31]:
%%timeit -n 1 -r 1 
# WordNetAugmenter 

from textattack.augmentation.recipes import WordNetAugmenter
# Alter default values if desired

augmenter = WordNetAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
s = labeled_examples[i][0]# Augment


print(augmenter.augment(s))

['equal the original and in some room even betters it ', 'equals the original and in some path fifty-fifty betters it ', 'equals the original and in some ways eventide amend it ', 'equals the pilot and in some ways flush betters it ', 'match the original and in some ways even meliorate it ']
44.1 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [29]:
%%timeit -n 1 -r 1 
# BackTranslationAugmenter


from textattack.augmentation.recipes import BackTranslationAugmenter
# Alter default values if desired

augmenter = BackTranslationAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)


s = labeled_examples[i][0]# Augment


print(augmenter.augment(s))

['And the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same, and the same,', 'equal to the original and somehow amplifies even more', 'equal to the original and, in a similar way,', 'equal to the original and, no doubt, the equivalent', 'equal to the original and, to a certain extent, to the width of the year']
1min 19s ± 0 ns per loop (mean ± std. dev. of 

In [22]:
# %%timeit -n 1 -r 1 
# # CheckListAugmenter

# from textattack.augmentation.recipes import EasyDataAugmenter
# # Alter default values if desired

# augmenter = EasyDataAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
# s = labeled_examples[i][0]# Augment


# print(augmenter.augment(s))

['equals the and in some ways betters it ', 'equate the original and in some ways even bettor it ', 'even the original and in some ways equals betters it ', 'equals the original and in some ways even betters path it ']
1.05 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


https://towardsdatascience.com/what-are-adversarial-examples-in-nlp-f928c574478e

In [18]:
# %%timeit -n 1 -r 1 


# from textattack.augmentation.recipes import CharSwapAugmenter
# # Alter default values if desired

# augmenter = CharSwapAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
# s = labeled_examples[i][0]# Augment


# print(augmenter.augment(s))

['equalQ the original and in some ways even bteters it ', 'equals the original and in some ways evne bAtters it ', 'equals the original and in some ways evne bRtters it ', 'equals the ozriginal and in some ways even better it ', 'equals the riginal and in some ways ven betters it ']
23.7 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [21]:
# %%timeit -n 1 -r 1 
# # CheckListAugmenter

# from textattack.augmentation.recipes import DeletionAugmenter
# # Alter default values if desired

# augmenter = DeletionAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
# s = labeled_examples[i][0]# Augment


# print(augmenter.augment(s))

['equals original and in some ways even it ', 'equals original and in ways even betters it ', 'equals the original and in some even it ', 'the original and in some ways even betters ', 'the original and in some ways even it ']
14.7 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [33]:
# SwapAugmenter

from textattack.augmentation.recipes import SwapAugmenter
# Alter default values if desired

augmenter = SwapAugmenter(pct_words_to_swap=0.2, transformations_per_example=5)
s = labeled_examples[i][0]# Augment


print(augmenter.augment(s))

['equals the and original in some ways even betters it ', 'equals the original and in some ways betters even it ', 'equals the original in and some ways even betters it ', 'it the original and in some ways even betters equals ', 'the equals original and in some ways even betters it ']
