# DPML | Augmentation

This notebook explores how to 1) load text classification datasets and 2) augment those datasets.

## Datasets

In [1]:
import os
from datasets import load_dataset, load_from_disk

In [2]:
# Dataset identifiers handed to `load_dataset` by the loading loop; each name is
# also used as the cache sub-directory under ./datasets/.
DATASET_CONFIGS = [
    "imdb",
    "yelp_polarity",
    "amazon_polarity",
    "ag_news",
    "yahoo_answers_topics",
    "dbpedia_14"
]

In [3]:
# Build `all_datasets`: a mapping of dataset name -> loaded dataset (one entry
# per successfully loaded name). Each dataset is read from the local cache when
# a cache directory exists; otherwise it is downloaded, normalised to the
# columns ('text', 'label') where needed, and written to the cache.
all_datasets = {}
for d_name in DATASET_CONFIGS:
    print(d_name)
    d_path = os.path.join("./datasets", d_name)

    # check if datasets are already created
    if os.path.exists(d_path):
        print("found existing dataset. loading from disk...")
        try:
            all_datasets[d_name] = load_from_disk(d_path)
        except Exception as e:
            # An unreadable cache is reported and the dataset is skipped; it is
            # not re-downloaded because the stale directory still occupies d_path.
            print(e)
        continue

    # does not exist, preprocess
    print("no existing dataset found. loading from server...")
    try:
        all_datasets[d_name] = load_dataset(d_name)
    except Exception as e:
        print(e)
        # Nothing was stored under d_name, so the preprocessing and save steps
        # below would raise KeyError; move on to the next dataset instead.
        continue

    if d_name in ["amazon_polarity", "dbpedia_14"]:
        # these datasets split the document into title + content
        for d_split_name, d_split in all_datasets[d_name].items():
            d_split = d_split.map(lambda example:
                                  {'text': example['title'] + " " + example['content'],
                                   'label': example['label']})
            all_datasets[d_name][d_split_name] = d_split.remove_columns(["title", "content"])

    if d_name == "yahoo_answers_topics":
        # question title, question body and best answer are merged into one text;
        # the 'topic' column becomes the label
        for d_split_name, d_split in all_datasets[d_name].items():
            d_split = d_split.map(lambda example:
                                  {'text': example['question_title'] + " " +
                                           example['question_content'] + " " +
                                           example['best_answer'],
                                   'label': example['topic']})
            all_datasets[d_name][d_split_name] = d_split.remove_columns(["id",
                                                                         "question_title",
                                                                         "question_content",
                                                                         "best_answer",
                                                                         "topic"])

    all_datasets[d_name].save_to_disk(d_path)

imdb
found existing dataset. loading from disk...
yelp_polarity
found existing dataset. loading from disk...
amazon_polarity
found existing dataset. loading from disk...
__init__() got an unexpected keyword argument 'shard_lengths'
ag_news
found existing dataset. loading from disk...
__init__() missing 1 required positional argument: 'labels'
yahoo_answers_topics
found existing dataset. loading from disk...
__init__() got an unexpected keyword argument 'shard_lengths'
dbpedia_14
found existing dataset. loading from disk...


In [4]:
# Show what was loaded; a dataset whose load raised in the loading loop has no entry here.
all_datasets

{'imdb': DatasetDict({
     train: Dataset({
         features: ['label', 'text'],
         num_rows: 25000
     })
     test: Dataset({
         features: ['label', 'text'],
         num_rows: 25000
     })
     unsupervised: Dataset({
         features: ['label', 'text'],
         num_rows: 50000
     })
 }),
 'yelp_polarity': DatasetDict({
     train: Dataset({
         features: ['label', 'text'],
         num_rows: 560000
     })
     test: Dataset({
         features: ['label', 'text'],
         num_rows: 38000
     })
 }),
 'dbpedia_14': DatasetDict({
     train: Dataset({
         features: ['label', 'text'],
         num_rows: 560000
     })
     test: Dataset({
         features: ['label', 'text'],
         num_rows: 70000
     })
 })}

## Augmentations

### NL-Augmenter

In [5]:
# !git clone https://www.github.com/GEM-benchmark/NL-Augmenter
# %cd NL-Augmenter
# # may encounter a codec error on windows, add encoding='utf-8' to open() in setup.py
# !python setup.py sdist
# !pip install -e .
# !pip install -r requirements.txt --quiet
# !pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.0.0/en_core_web_sm-3.0.0.tar.gz
# !pip install cucco fastpunct pronouncing piglatin

In [6]:
# import os
# import sys
# import types
# import importlib
# import inspect
# import glob

# from nlaugmenter.interfaces.SentenceOperation import SentenceOperation
# from nlaugmenter.tasks.TaskTypes import TaskType

# sys.path.insert(1, './nlaugmenter/transformations/')

In [7]:
# transform_paths = glob.glob("./nlaugmenter/transformations/*/transformation.py")

# black_list = [
#     './nlaugmenter/transformations/formality_change/transformation.py',
#     './nlaugmenter/transformations/back_translation_ner/transformation.py',
#     './nlaugmenter/transformations/concept2sentence/transformation.py',
#     './nlaugmenter/transformations/mixed_language_perturbation/transformation.py',
#     './nlaugmenter/transformations/syntactically_diverse_paraphrase/transformation.py',
#     './nlaugmenter/transformations/ocr_perturbation/transformation.py',
#     './nlaugmenter/transformations/chinese_butter_fingers_perturbation/transformation.py',
#     './nlaugmenter/transformations/french_conjugation_transformation/transformation.py',
#     './nlaugmenter/transformations/number-to-word/transformation.py',
# ]

# transform_paths = [p for p in transform_paths if p not in black_list]

# acceptable_languages = ['en', 'all']

In [8]:
# target_classes = []
# for transform_path in transform_paths:
#     print(transform_path)
#     path, _ = os.path.splitext(transform_path)
#     file_name = path.split('/')[-2]
#     module = importlib.import_module(file_name, path)
#     classes = inspect.getmembers(module, inspect.isclass)
#     print(classes)
#     for cls in classes:
#         transform_class = getattr(module, cls[0])
#         if issubclass(transform_class, SentenceOperation):
#             if transform_class == SentenceOperation:
#                 continue
#             if hasattr(transform_class, 'languages'):
#                 print(transform_class.languages)
#                 if transform_class.languages is None or any(l.lower() in transform_class.languages for l in acceptable_languages) :
#                     if hasattr(transform_class, 'tasks'):
#                         print(transform_class.tasks)
#                         if TaskType.TEXT_CLASSIFICATION in transform_class.tasks:
#                             target_classes.append(transform_class)

In [9]:
# target_imports = [('nlaugmenter.transformations.' + c.__module__.split('.')[-2], c.__name__) for c in target_classes]

# for (target_module, target_class) in target_imports:
#     print(f"from {target_module} import {target_class}")

In [10]:
from nlaugmenter.transformations.emojify                                   import EmojifyTransformation
from nlaugmenter.transformations.synonym_insertion                         import SynonymInsertion
from nlaugmenter.transformations.back_translation                          import BackTranslation
from nlaugmenter.transformations.noun_compound_paraphraser                 import NounCompoundParaphraser
from nlaugmenter.transformations.diacritic_removal                         import DiacriticRemoval
from nlaugmenter.transformations.azerty_qwerty_chars_swap                  import AzertyQwertyCharsSwap
from nlaugmenter.transformations.add_hashtags                              import HashtagGeneration
from nlaugmenter.transformations.replace_spelling                          import SpellingTransformation
from nlaugmenter.transformations.token_replacement                         import TokenReplacement
from nlaugmenter.transformations.auxiliary_negation_removal                import SentenceAuxiliaryNegationRemoval
from nlaugmenter.transformations.use_acronyms                              import UseAcronyms
from nlaugmenter.transformations.yoda_transform                            import YodaPerturbation
from nlaugmenter.transformations.factive_verb_transformation               import FactiveVerbTransformation
from nlaugmenter.transformations.protaugment_diverse_paraphrase            import ProtaugmentDiverseParaphrase
from nlaugmenter.transformations.replace_financial_amounts                 import ReplaceFinancialAmount
from nlaugmenter.transformations.americanize_britishize_english            import AmericanizeBritishizeEnglish
from nlaugmenter.transformations.punctuation                               import PunctuationWithRules
from nlaugmenter.transformations.urban_dict_swap                           import UrbanThesaurusSwap
from nlaugmenter.transformations.filler_word_augmentation                  import FillerWordAugmentation
from nlaugmenter.transformations.synonym_substitution                      import SynonymSubstitution
from nlaugmenter.transformations.style_paraphraser                         import StyleTransferParaphraser
from nlaugmenter.transformations.random_upper_transformation               import RandomUpperPerturbation
from nlaugmenter.transformations.weekday_month_abbreviation                import WeekdayMonthAbbreviation
from nlaugmenter.transformations.city_names_transformation                 import CityNamesTransformation
from nlaugmenter.transformations.sentence_additions                        import SentenceAdditions
from nlaugmenter.transformations.change_date_format                        import ChangeDateFormat
from nlaugmenter.transformations.tense                                     import TenseTransformation
from nlaugmenter.transformations.dyslexia_words_swap                       import DyslexiaWordsSwap
from nlaugmenter.transformations.gender_culture_diverse_name               import GenderCultureDiverseName
from nlaugmenter.transformations.gender_neutral_rewrite                    import GenderNeutralRewrite
from nlaugmenter.transformations.lost_in_translation                       import LostInTranslation
from nlaugmenter.transformations.insert_abbreviation                       import AbbreviationInsertionEN
from nlaugmenter.transformations.random_deletion                           import RandomDeletion
from nlaugmenter.transformations.transformer_fill                          import TransformerFill
from nlaugmenter.transformations.concat_monolingual                        import ConcatMonolingual
from nlaugmenter.transformations.country_state_abbreviation_transformation import CountryStateAbbreviation
from nlaugmenter.transformations.butter_fingers_perturbation               import ButterFingersPerturbation
from nlaugmenter.transformations.replace_abbreviation_and_acronyms         import ReplaceAbbreviations
from nlaugmenter.transformations.slangificator                             import Slangificator
from nlaugmenter.transformations.yes_no_question                           import YesNoQuestionPerturbation
from nlaugmenter.transformations.gender_swap                               import GenderSwap
from nlaugmenter.transformations.close_homophones_swap                     import CloseHomophonesSwap
from nlaugmenter.transformations.simple_ciphers                            import SimpleCiphers
from nlaugmenter.transformations.change_person_named_entities              import ChangePersonNamedEntities
from nlaugmenter.transformations.greetings_and_farewells                   import GreetingsAndFarewells
from nlaugmenter.transformations.discourse_marker_substitution             import DiscourseMarkerSubstitution
from nlaugmenter.transformations.summarization_transformation              import Summarization
from nlaugmenter.transformations.visual_attack_letters                     import VisualAttackLetters
from nlaugmenter.transformations.sentence_reordering                       import SentenceReordering
from nlaugmenter.transformations.change_char_case                          import ChangeCharCase
from nlaugmenter.transformations.antonyms_substitute                       import AntonymsSubstitute
from nlaugmenter.transformations.mix_transliteration                       import MixTransliteration
from nlaugmenter.transformations.disability_transformation                 import DifferentAbilityTransformation
from nlaugmenter.transformations.abbreviation_transformation               import Abbreviate
from nlaugmenter.transformations.replace_with_hyponyms_hypernyms           import ReplaceHypernyms
from nlaugmenter.transformations.replace_with_hyponyms_hypernyms           import ReplaceHyponyms
from nlaugmenter.transformations.grapheme_to_phoneme_transformation        import PhonemeSubstitution
from nlaugmenter.transformations.multilingual_lexicon_perturbation         import MultilingualLexiconPerturbation
from nlaugmenter.transformations.multilingual_back_translation             import MultilingualBackTranslation
from nlaugmenter.transformations.unit_converter                            import UnitConverter
from nlaugmenter.transformations.adjectives_antonyms_switch                import SentenceAdjectivesAntonymsSwitch
from nlaugmenter.transformations.correct_common_misspellings               import CorrectCommonMisspellings
from nlaugmenter.transformations.neural_question_paraphraser               import NeuralParaphaserPerturbation
from nlaugmenter.transformations.subject_object_switch                     import SentenceSubjectObjectSwitch
from nlaugmenter.transformations.numeric_to_word                           import NumericToWord
from nlaugmenter.transformations.multilingual_dictionary_based_code_switch import MultilingualDictionaryBasedCodeSwitch
from nlaugmenter.transformations.diverse_paraphrase                        import DiverseParaphrase
from nlaugmenter.transformations.english_inflectional_variation            import EnglishInflectionalVariation
from nlaugmenter.transformations.replace_numerical_values                  import ReplaceNumericalValues
from nlaugmenter.transformations.speech_disfluency_perturbation            import SpeechDisfluencyPerturbation
from nlaugmenter.transformations.whitespace_perturbation                   import WhitespacePerturbation
from nlaugmenter.transformations.contraction_expansions                    import ContractionExpansions
from nlaugmenter.transformations.color_transformation                      import ColorTransformation
from nlaugmenter.transformations.contextual_meaning_perturbation           import ContextualMeaningPerturbation
from nlaugmenter.transformations.pig_latin                                 import PigLatin
from nlaugmenter.transformations.leet_letters                              import LeetLetters
from nlaugmenter.transformations.hashtagify                                import HashtagifyTransformation
from nlaugmenter.transformations.propbank_srl_roles                        import CheckSrl
from nlaugmenter.transformations.geo_names_transformation                  import GeoNamesTransformation



In [11]:
# NL-Augmenter transformation classes that the augmentation loop applies to each dataset.
# Commented-out entries are disabled; the trailing note records why (error or runtime).
# Timings are written as "s/ba" -- presumably seconds per batch; TODO confirm.
nlaug_transforms = [
    EmojifyTransformation,
    SynonymInsertion,
    BackTranslation, # runtime too long (does not attempt to use gpu) (50.15s/ba w gpu)
    # NounCompoundParaphraser, # does not return anything
    DiacriticRemoval,
    AzertyQwertyCharsSwap,
    HashtagGeneration,
    SpellingTransformation,
    TokenReplacement,
    SentenceAuxiliaryNegationRemoval,
    UseAcronyms,
    YodaPerturbation,
    # FactiveVerbTransformation, # cannot find gender.male.names
    ProtaugmentDiverseParaphrase, # somewhat long runtime (29.63s/ba)
    ReplaceFinancialAmount,
    AmericanizeBritishizeEnglish,
    # PunctuationWithRules,  # runtime too long (does not appear to be gpu compatible)
    # UrbanThesaurusSwap, # runtime too long (195.74s/ba)
    FillerWordAugmentation,
    SynonymSubstitution,
    # StyleTransferParaphraser, # runtime too long (192.32s/ba)
    RandomUpperPerturbation,
    WeekdayMonthAbbreviation,
    CityNamesTransformation,
    # SentenceAdditions, # runtime too long (does not attempt to use gpu) # RuntimeError: CUDA out of memory. even at batch_size=1
    ChangeDateFormat,
    TenseTransformation, # index out of range errors / NoneType errors if not using to_tense='past'
    DyslexiaWordsSwap,
    # GenderCultureDiverseName, # does not return anything
    # GenderNeutralRewrite, # index out of range errors
    # LostInTranslation, # IndexError: index out of range in self
    AbbreviationInsertionEN,
    RandomDeletion,
    TransformerFill, # (3.20s/ba)
    ConcatMonolingual,
    CountryStateAbbreviation,
    ButterFingersPerturbation,
    ReplaceAbbreviations, # (4.52s/ba)
    Slangificator,
    YesNoQuestionPerturbation,
    GenderSwap,
    # CloseHomophonesSwap, # runs slow, no gpu (221.04s/ba)
    SimpleCiphers,
    # ChangePersonNamedEntities, # does not return anything
    GreetingsAndFarewells,
    DiscourseMarkerSubstitution,
    Summarization,
    VisualAttackLetters,
    SentenceReordering, # somewhat slow (41.14s/ba)
    ChangeCharCase,
    AntonymsSubstitute,
    # MixTransliteration, # IndexError: list index out of range
    # DifferentAbilityTransformation, # UnboundLocalError: local variable 'text' referenced before assignment
    Abbreviate,
    # ReplaceHypernyms, # RuntimeError: CUDA out of memory. even at batch_size=1
    # ReplaceHyponyms, # RuntimeError: CUDA out of memory. even at batch_size=1
    # PhonemeSubstitution, # IndexError: list index out of range
    # MultilingualLexiconPerturbation, # FileNotFoundError: [Errno 2] No such file or directory: '/multilingual_lexicon_uncased.xz'
    MultilingualBackTranslation, # runtime too long (does not attempt to use gpu) (98.04s/ba w gpu)
    UnitConverter,
    SentenceAdjectivesAntonymsSwitch,
    CorrectCommonMisspellings,
    NeuralParaphaserPerturbation, # somewhat slow (4.03s/ba)
    SentenceSubjectObjectSwitch,
    # NumericToWord, # InvalidOperation: [<class 'decimal.ConversionSyntax'>]
    MultilingualDictionaryBasedCodeSwitch,
    # DiverseParaphrase, # runtime too long (does not attempt to use gpu)
    EnglishInflectionalVariation,
    ReplaceNumericalValues,
    SpeechDisfluencyPerturbation,
    WhitespacePerturbation,
    ContractionExpansions,
    ColorTransformation,
    # ContextualMeaningPerturbation, # runtime too long (not gpu compatible)
    PigLatin,
    LeetLetters,
    # HashtagifyTransformation, # error: nothing to repeat at position 0
    # CheckSrl, # RuntimeError: The size of tensor a (1127) must match the size of tensor b (512) at non-singleton dimension 1
    # GeoNamesTransformation, # does not return anything
]

In [12]:
import os
import random
import inspect
from functools import partial
import itertools
import torch

In [13]:
class Transform:
    """Uniform wrapper around an augmentation class.

    The wrapped class is instantiated once, a few optional knobs are set on the
    instance when it exposes them (``max_outputs``, ``max_paraphrases``,
    ``device``), and one of its ``generate`` / ``augment`` / ``transform_batch``
    methods is selected as the transformation function used by :meth:`apply`.

    Args:
        transform_class: The augmentation class to wrap (not an instance).
        num_outputs: Number of augmented texts wanted per input text.

    Raises:
        AttributeError: If ``transform_class`` has none of the supported
            transformation methods.
    """

    def __init__(self, transform_class, num_outputs=1):
        self.transform_class = transform_class
        self.num_outputs = num_outputs
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.intakes_target = False
        self.is_batched = False

        # setting class attributes
        init_params = inspect.signature(self.transform_class).parameters
        if 'to_tense' in init_params:
            print("initializing class with to_tense='past'") # future & random don't work
            self.transform_instance = self.transform_class(to_tense="past")
        elif 'source_lang' in init_params:
            print("initializing class with source_lang='es'")
            self.transform_instance = self.transform_class(source_lang="es")
        elif inspect.isclass(self.transform_class) and issubclass(self.transform_class, LostInTranslation):
            # transform_class is a class object, so the check must be issubclass;
            # isinstance() against the class is always False here.
            self.transform_instance = self.transform_class(device=0)
        else:
            self.transform_instance = self.transform_class()

        # setting instance attributes
        if hasattr(self.transform_instance, "max_outputs"):
            print(f"setting max_outputs={self.num_outputs}")
            self.transform_instance.max_outputs = self.num_outputs
        if hasattr(self.transform_instance, "max_paraphrases"):
            print(f"setting max_paraphrases={self.num_outputs}")
            self.transform_instance.max_paraphrases = self.num_outputs
        if hasattr(self.transform_instance, "device"):
            # only override a device that is unset or explicitly on the CPU
            if self.transform_instance.device is None or self.transform_instance.device == 'cpu':
                print(f"setting device={self.device}")
                self.transform_instance.device = self.device

        # selecting the transformation function; when several methods exist the
        # last match wins (generate < augment < transform_batch)
        self.transform_fn = None
        if hasattr(self.transform_class, "generate"):
            self.transform_fn = self.transform_instance.generate
        if hasattr(self.transform_class, "augment"):
            self.transform_fn = self.transform_instance.augment
        if hasattr(self.transform_class, "transform_batch"):
            self.transform_fn = self.transform_instance.transform_batch
            self.intakes_target = True
            self.is_batched = True
        if self.transform_fn is None:
            raise AttributeError(
                f"{getattr(self.transform_class, '__name__', self.transform_class)} has none of "
                "'generate', 'augment' or 'transform_batch'")

    def synced_shuffle(self, list1, list2):
        """Shuffle two parallel sequences with the same random permutation.

        Returns:
            Two new lists. Pairs are formed with ``zip``, so the result is as
            long as the shorter input; empty input yields two empty lists.
        """
        pairs = list(zip(list1, list2))
        if not pairs:
            # zip(*[]) yields nothing, so the two-way unpack below would fail
            return [], []
        random.shuffle(pairs)
        res1, res2 = zip(*pairs)
        # res1 and res2 come out as tuples, and so must be converted to lists.
        return list(res1), list(res2)

    def apply(self, texts, labels=None):
        """Augment ``texts`` and return the new texts with matching labels.

        Args:
            texts: A list of input strings. A single string is treated as a
                one-element list.
            labels: Labels parallel to ``texts``. When omitted, ``None`` is used
                as the label of every text.

        Returns:
            ``(new_texts, new_labels)``: two parallel lists, shuffled together
            and truncated to at most ``len(texts) * num_outputs`` items. If the
            transformation produced nothing, the original inputs are used.
        """
        if isinstance(texts, str):
            texts = [texts]
            if labels is not None and not isinstance(labels, (list, tuple)):
                labels = [labels]
        if labels is None:
            # placeholder labels keep texts and labels zippable
            labels = [None] * len(texts)

        if self.intakes_target:
            if self.is_batched:
                new_texts, new_labels = self.transform_fn((texts, labels))
            else:
                new_texts, new_labels = [], []
                for t, l in zip(texts, labels):
                    new_t, new_l = self.transform_fn(t, l)
                    # new_t is treated as a list of outputs (its length sizes the
                    # labels), so it is flattened into new_texts
                    new_texts.extend(new_t)
                    new_labels.extend([new_l] * len(new_t))
        else:
            if self.is_batched:
                new_texts = self.transform_fn((texts))
                # labels are not transformed on this path; carry them through
                new_labels = labels
            else:
                new_texts, new_labels = [], []
                for t, l in zip(texts, labels):
                    new_t = self.transform_fn(t)
                    if len(new_t) > self.num_outputs:
                        new_t = new_t[:self.num_outputs]
                    new_texts.extend(new_t)
                    new_labels.extend([l] * len(new_t))

        # post processing since some transformations add/remove more new outputs than expected
        if len(new_texts) == 0:
            print("no new_texts, substituting original texts...")
            new_texts = texts
        if len(new_labels) == 0:
            print("no new_labels, substituting original labels...")
            new_labels = labels
        new_texts, new_labels = self.synced_shuffle(new_texts, new_labels)

        expected_len = len(texts) * self.num_outputs
        new_texts = new_texts[:expected_len]
        new_labels = new_labels[:expected_len]

        return new_texts, new_labels

In [14]:
def augment_data(batch, transform, keep_originals=True):
    """Augment one batch of examples with ``transform``.

    Args:
        batch: Mapping with parallel ``'text'`` and ``'label'`` sequences.
        transform: Object whose ``apply(texts, labels)`` returns a pair of
            parallel lists ``(new_texts, new_labels)``.
        keep_originals: When true, the original examples are kept in front of
            the augmented ones; otherwise only augmented examples are returned.

    Returns:
        A dict with flat, equally long ``'text'`` and ``'label'`` lists.
    """
    new_texts, new_labels = [], []
    for text, label in zip(batch['text'], batch['label']):
        aug_texts, aug_labels = transform.apply([text], [label])
        new_texts.extend(aug_texts)
        # apply() returns a list of labels; extend keeps the label column flat
        # and aligned with the texts (append would nest one list per example)
        new_labels.extend(aug_labels)
    if keep_originals:
        return {"text": list(batch['text']) + new_texts,
                "label": list(batch['label']) + new_labels}
    return {"text": new_texts, "label": new_labels}

In [15]:
# t = Transform(DifferentAbilityTransformation)
# t.apply(["testing is important", "having fun is important too"], [0, 1])

In [16]:
# control variables
save_dir       = "./datasets/"
dataset_size   = 1000   # number of shuffled training rows to keep; <= 0 keeps all rows
batch_size     = 10
keep_originals = False
seed           = 130

# Apply every NL-Augmenter transform to the (sub-sampled) train split of every
# loaded dataset, writing each result to its own directory under save_dir.
for dataset_name, splits in all_datasets.items():

    # shuffled training subset shared by all transforms of this dataset
    train_subset = splits['train'].shuffle(seed=seed)
    if dataset_size > 0:
        train_subset = train_subset.select(range(dataset_size))

    for transform_cls in nlaug_transforms:
        t_name = transform_cls.__name__
        save_path = os.path.join(save_dir, 
            f"{dataset_name}_{t_name}{'_' + str(dataset_size) if dataset_size > 0 else ''}_KO{'T' if keep_originals else 'F'}")

        # results already on disk are never recomputed
        if os.path.exists(save_path):
            print(f"Existing dataset found save at {save_path}")
            continue

        try:
            print(f"Processing {t_name} to save at {save_path}")
            aug = partial(augment_data,
                          transform=Transform(transform_cls),
                          keep_originals=keep_originals)
            train_subset.map(aug, batched=True, batch_size=batch_size).save_to_disk(save_path)
        except Exception as e:
            # a failing transform is reported and skipped so the run continues
            print(e)



Existing dataset found save at ./datasets/imdb_EmojifyTransformation_1000_KOF
Existing dataset found save at ./datasets/imdb_SynonymInsertion_1000_KOF
Existing dataset found save at ./datasets/imdb_BackTranslation_1000_KOF
Existing dataset found save at ./datasets/imdb_DiacriticRemoval_1000_KOF
Existing dataset found save at ./datasets/imdb_AzertyQwertyCharsSwap_1000_KOF
Existing dataset found save at ./datasets/imdb_HashtagGeneration_1000_KOF
Existing dataset found save at ./datasets/imdb_SpellingTransformation_1000_KOF
Existing dataset found save at ./datasets/imdb_TokenReplacement_1000_KOF
Existing dataset found save at ./datasets/imdb_SentenceAuxiliaryNegationRemoval_1000_KOF
Existing dataset found save at ./datasets/imdb_UseAcronyms_1000_KOF
Existing dataset found save at ./datasets/imdb_YodaPerturbation_1000_KOF
Existing dataset found save at ./datasets/imdb_ProtaugmentDiverseParaphrase_1000_KOF
Existing dataset found save at ./datasets/imdb_ReplaceFinancialAmount_1000_KOF
Existi

HBox(children=(FloatProgress(value=0.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (1213 > 1024). Running this sequence through the model will result in indexing errors



Existing dataset found save at ./datasets/yelp_polarity_DiacriticRemoval_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_AzertyQwertyCharsSwap_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_HashtagGeneration_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_SpellingTransformation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_TokenReplacement_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_SentenceAuxiliaryNegationRemoval_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_UseAcronyms_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_YodaPerturbation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_ProtaugmentDiverseParaphrase_1000_KOF
Processing ReplaceFinancialAmount to save at ./datasets/yelp_polarity_ReplaceFinancialAmount_1000_KOF
setting max_outputs=1


HBox(children=(FloatProgress(value=0.0), HTML(value='')))


'NoneType' object has no attribute 'keys'
Existing dataset found save at ./datasets/yelp_polarity_AmericanizeBritishizeEnglish_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_FillerWordAugmentation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_SynonymSubstitution_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_RandomUpperPerturbation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_WeekdayMonthAbbreviation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_CityNamesTransformation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_ChangeDateFormat_1000_KOF
Processing TenseTransformation to save at ./datasets/yelp_polarity_TenseTransformation_1000_KOF
initializing class with to_tense='past'
__init__() missing 1 required positional argument: 'to_tense'
Existing dataset found save at ./datasets/yelp_polarity_DyslexiaWordsSwap_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_Abb



setting max_outputs=1
Existing dataset found save at ./datasets/yelp_polarity_UnitConverter_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_SentenceAdjectivesAntonymsSwitch_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_CorrectCommonMisspellings_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_NeuralParaphaserPerturbation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_SentenceSubjectObjectSwitch_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_MultilingualDictionaryBasedCodeSwitch_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_EnglishInflectionalVariation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_ReplaceNumericalValues_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_SpeechDisfluencyPerturbation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_WhitespacePerturbation_1000_KOF
Existing dataset found save at ./datasets/yelp_polarity_

HBox(children=(FloatProgress(value=0.0), HTML(value='')))


Existing dataset found save at ./datasets/dbpedia_14_DiacriticRemoval_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_AzertyQwertyCharsSwap_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_HashtagGeneration_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_SpellingTransformation_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_TokenReplacement_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_SentenceAuxiliaryNegationRemoval_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_UseAcronyms_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_YodaPerturbation_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_ProtaugmentDiverseParaphrase_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_ReplaceFinancialAmount_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_AmericanizeBritishizeEnglish_1000_KOF
Existing dataset found save at ./datasets/dbpedia_14_FillerWordAugmentation_1

### TextAttack

In [None]:
# !pip install textattack --quiet

In [1]:
import textattack

ModuleNotFoundError: No module named 'tensorflow_text'

In [2]:
from textattack.augmentation import (
    WordNetAugmenter,
    EmbeddingAugmenter,
    CharSwapAugmenter,
    EasyDataAugmenter,
    CheckListAugmenter,
    DeletionAugmenter,
    CLAREAugmenter,
    BackTranslationAugmenter,
)

ModuleNotFoundError: No module named 'tensorflow_text'

In [18]:
# TextAttack augmenter classes that the TextAttack augmentation loop wraps in
# `Transform` and applies to the selected dataset.
textattack_transforms = [
    WordNetAugmenter,
    EmbeddingAugmenter,
    CharSwapAugmenter,
    EasyDataAugmenter,
    CheckListAugmenter,
    DeletionAugmenter,
    CLAREAugmenter,
    BackTranslationAugmenter,
]

NameError: name 'WordNetAugmenter' is not defined

In [None]:
# Smoke test of a single augmenter. Transform.apply zips `texts` with `labels`,
# so it needs two parallel lists; a bare string with the default labels=None
# fails inside zip().
t = Transform(WordNetAugmenter)
t.apply(["test"], [0])

In [None]:
# control variables
dataset_name   = 'imdb'
save_dir       = "./datasets/"
dataset_size   = 1000   # number of shuffled training rows to keep; <= 0 keeps all rows
batch_size     = 10
keep_originals = False
seed           = 130

# shuffled training subset of the selected dataset
dataset = all_datasets[dataset_name]['train'].shuffle(seed=seed)
if dataset_size > 0:
    dataset = dataset.select(range(dataset_size))

# Apply each TextAttack augmenter in turn, one output directory per augmenter.
for augmenter_cls in textattack_transforms:

    t_name = augmenter_cls.__name__
    save_path = os.path.join(save_dir, 
        f"{dataset_name}_{t_name}{'_' + str(dataset_size) if dataset_size > 0 else ''}_KO{'T' if keep_originals else 'F'}")

    # results already on disk are never recomputed
    if os.path.exists(save_path):
        print(f"Existing dataset found save at {save_path}")
        continue

    print(f"Processing {t_name} to save at {save_path}")
    aug = partial(augment_data,
                  transform=Transform(augmenter_cls),
                  keep_originals=keep_originals)
    dataset.map(aug, batched=True, batch_size=batch_size).save_to_disk(save_path)

### Sibyl

In [19]:
from sibyl import TRANSFORMATIONS

FileNotFoundError: Could not find module 'C:\Users\Fabrice\anaconda3\envs\dpml\Lib\site-packages\torchaudio\lib\libtorchaudio.pyd' (or one of its dependencies). Try using the full path with constructor syntax.