In [8]:
import pandas as pd
import nlpaug.augmenter.word as naw

In [9]:
# Load the command sentences, one per line of the text file.
# The lines are read directly instead of via `pd.read_csv(..., sep='\n')`:
# current pandas releases reject the line terminator as a separator and raise
# a ValueError. Blank lines are skipped.
with open('commands_text.txt', encoding='utf-8') as commands_file:
    raw_lines = [line.rstrip('\n') for line in commands_file if line.strip()]

# NOTE(review): the original `read_csv` call consumed the first line of the
# file as the header row, so that line never reached the data. That behaviour
# is kept here; if the file has no header line, pass `raw_lines` in full.
text_df = pd.DataFrame({"Original Text": raw_lines[1:]})

# Independent copy: columns added to `table_df` later must not leak back into
# `text_df` (plain assignment would only create a second name for one frame).
table_df = text_df.copy()

# Convert df to list
orig_text_list = text_df['Original Text'].tolist()


The following models are used to generate variants of the same set of sentences:

* BERT (word substitute)
* BERT (word insertion)
* word2vec (word substitute)
* word2vec (word insert)
* Wordnet synonym (word substitute)

In [10]:
# Model identifiers shared by the augmenters below.
BERT_MODEL_NAME = 'bert-base-uncased'
WORD2VEC_MODEL_PATH = './GoogleNews-vectors-negative300.bin'

# Each augmenter is built and then applied to the full list of original
# sentences; the results are kept under separate names for the output table.

# BERT (word substitute)
bert_subs_aug = naw.ContextualWordEmbsAug(
    model_path=BERT_MODEL_NAME,
    action="substitute",
)
bert_subs_augmented_text = bert_subs_aug.augment(orig_text_list)

# BERT (word insertion)
bert_insert_aug = naw.ContextualWordEmbsAug(
    model_path=BERT_MODEL_NAME,
    action="insert",
)
bert_insert_augmented_text = bert_insert_aug.augment(orig_text_list)

# word2vec (word substitute)
word2vec_subs_aug = naw.WordEmbsAug(
    model_type='word2vec',
    model_path=WORD2VEC_MODEL_PATH,
    action="substitute",
)
word2vec_subs_augmented_text = word2vec_subs_aug.augment(orig_text_list)

# word2vec (word insert)
word2vec_insert_aug = naw.WordEmbsAug(
    model_type='word2vec',
    model_path=WORD2VEC_MODEL_PATH,
    action="insert",
)
word2vec_insert_augmented_text = word2vec_insert_aug.augment(orig_text_list)

# Wordnet synonym augmenter
wordnet_subs_aug = naw.SynonymAug(aug_src='wordnet')
wordnet_augmented_text = wordnet_subs_aug.augment(orig_text_list)

In [None]:
# Back translation: English -> French -> English.
# NOTE(review): this cell has no execution count, so it has not been run in
# the saved notebook; confirm that both model identifiers below are actually
# available before relying on it.
fr_round_trip_models = {
    'from_model_name': 'transformer.wmt19.en-fr',
    'to_model_name': 'transformer.wmt19.fr-en',
}
back_translation_fr_aug = naw.BackTranslationAug(**fr_round_trip_models)

# Last expression of the cell: the augmented sentences are shown as output.
back_translation_fr_aug.augment(orig_text_list)

In [11]:
# Attach one column per augmenter next to the original sentences.
# The wordnet column was previously labelled 'wordnet (insert)', but it holds
# the output of the synonym augmenter, which the notebook's model list
# describes as a word substitute -- the label now says so.
# NOTE(review): 'substitue' is misspelled in two headers; it is kept as-is
# because these strings are part of the CSV output.
augmented_columns = {
    'BERT (substitue)': bert_subs_augmented_text,
    'BERT (insert)': bert_insert_augmented_text,
    'word2vec (substitue)': word2vec_subs_augmented_text,
    'word2vec (insert)': word2vec_insert_augmented_text,
    'wordnet (substitute)': wordnet_augmented_text,
}
for column_name, augmented_text in augmented_columns.items():
    table_df[column_name] = augmented_text

# Save the table to CSV file
table_df.to_csv('outputs.csv')

Models NOT explored:

* TF-IDF similarity (insert/substitute)
* PPDB synonym (substitute)

Model worth exploring:

* Back translation
    * Ex: English -> French -> English 