In [20]:
from textattack.transformations import WordSwapRandomCharacterDeletion
from textattack.transformations import CompositeTransformation
from textattack.augmentation import Augmenter
transformation = CompositeTransformation([WordSwapRandomCharacterDeletion()])
augmenter = Augmenter(transformation=transformation, transformations_per_example=5)
s = 'What is asthma?'
augmenter.augment(s)

['Wha is asthma?',
 'What i asthma?',
 'What is astha?',
 'What s asthma?',
 'Wht is asthma?']

In [21]:
from textattack.augmentation import EasyDataAugmenter
augmenter = EasyDataAugmenter()
q = 'What is asthma.'
iteration = [1,2,3,4,5,6,7,8,9,10]
for x in iteration:
    print(augmenter.augment(q))

[nltk_data] Downloading package omw-1.4 to /Users/yangl/nltk_data...


['What is comprise asthma.', 'asthma is What.', 'is asthma.', 'What is asthma.']
['follow What is asthma.', 'What is asthma.', 'asthma is What.', 'What asthma.']
['is What asthma.', 'What is exist asthma.', 'What is asthma.', 'What is.']
['is asthma.', 'What asthma is.', 'exist What is asthma.', 'What is asthma.']
['comprise What is asthma.', 'What is asthma.', 'What is.', 'is What asthma.']
['What is be asthma.', 'asthma is What.', 'What is.', 'What is asthma.']
['What is live asthma.', 'asthma is What.', 'What is asthma.', 'is asthma.']
['embody What is asthma.', 'What is asthma.', 'is What asthma.', 'What asthma.']
['What is asthma.', 'What asthma is.', 'What is cost asthma.', 'is asthma.']
['is What asthma.', 'What is.', 'What is asthma.', 'What be is asthma.']


In [104]:
import json
from textattack.augmentation import CheckListAugmenter
from textattack.augmentation import WordNetAugmenter

# Read the input data from a JSON file
with open('/Users/yangl/Downloads/answers.json', 'r') as file:
    data = json.load(file)

# Initialize the ChecklistAugmenter
augmenter = CheckListAugmenter()

augmenter.add_transformation(WordNetAugmenter())


# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    paragraphs = entry['paragraphs']
    augmented_paragraphs = []
    for paragraph in paragraphs:
        qas = paragraph['qas']
        augmented_qas = []
        for qa in qas:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            augmented_questions = augmenter.augment(original_question)

            augmented_qa = {
                'question': original_question,
                'augmented_questions': augmented_questions,
                'answers': [{'text': answer}]
            }
            augmented_qas.append(augmented_qa)

        augmented_paragraph = {
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        }
        augmented_paragraphs.append(augmented_paragraph)

    augmented_entry = {'paragraphs': augmented_paragraphs}
    augmented_data.append(augmented_entry)

# Save the augmented data to a new JSON file
with open('/Users/yangl/Downloads/augmented_data_3.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)

AttributeError: 'CheckListAugmenter' object has no attribute 'add_transformation'

In [107]:
# Synonym Replacement


import json
import nlpaug.augmenter.word as naw

# Read the input data from a JSON file
with open('/Users/yangl/Downloads/answers.json', 'r') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    paragraphs = entry['paragraphs']
    augmented_paragraphs = []
    for paragraph in paragraphs:
        qas = paragraph['qas']
        augmented_qas = []
        for qa in qas:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Apply Synonym Replacement using nlpaug
            augmenter = naw.SynonymAug(aug_src='wordnet')
            augmented_questions = augmenter.augment(original_question, n=5)

            augmented_qa = {
                'question': original_question,
                'augmented_questions': augmented_questions,
                'answers': [{'text': answer}]
            }
            augmented_qas.append(augmented_qa)

        augmented_paragraph = {
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        }
        augmented_paragraphs.append(augmented_paragraph)

    augmented_entry = {'paragraphs': augmented_paragraphs}
    augmented_data.append(augmented_entry)
    
    with open('/Users/yangl/Downloads/augmented_data_3.json', 'w') as file:
        json.dump(augmented_data, file, indent=4)




In [125]:
# random deletion 

import json
import nltk
import random

# Read the input data from a JSON file
with open('/Users/yangl/Downloads/answers.json', 'r') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    paragraphs = entry['paragraphs']
    augmented_paragraphs = []
    for paragraph in paragraphs:
        qas = paragraph['qas']
        augmented_qas = []
        augmented_questions_set = set()  # To store unique augmented questions
        for qa in qas:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Tokenize the original question
            tokens = nltk.word_tokenize(original_question)

            # Augment the question by randomly deleting a word
            augmented_tokens = tokens.copy()
            if len(tokens) > 1:
                word_index = random.randint(0, len(tokens) - 1)
                original_word = augmented_tokens[word_index]

                # Get the part of speech of the word
                pos = nltk.pos_tag(tokens)[word_index][1]

                # Get synsets (related words) for the selected word
                synsets = wordnet.synsets(original_word, pos=pos)

                if synsets:
                    # Get the first synset (related word)
                    related_word = synsets[0].lemmas()[0].name()

                    # Replace the selected word with the related word
                    augmented_tokens[word_index] = related_word

            # Reconstruct the augmented question
            augmented_question = ' '.join(augmented_tokens)

            if augmented_question and augmented_question not in augmented_questions_set:
                augmented_questions_set.add(augmented_question)

                augmented_qa = {
                    'question': original_question,
                    'augmented_questions': [augmented_question],
                    'answers': [{'text': answer}]
                }
                augmented_qas.append(augmented_qa)

        augmented_paragraph = {
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        }
        augmented_paragraphs.append(augmented_paragraph)

    augmented_entry = {'paragraphs': augmented_paragraphs}
    augmented_data.append(augmented_entry)

# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_5.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)



KeyError: '.'

In [None]:
#final random word insertion

import json
import nltk
from nltk.corpus import wordnet
import random

# Read the input data from a JSON file
with open('/Users/yangl/Downloads/answers.json', 'r') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    paragraphs = entry['paragraphs']
    augmented_paragraphs = []
    for paragraph in paragraphs:
        qas = paragraph['qas']
        augmented_qas = []
        for qa in qas:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Tokenize the original question
            tokens = nltk.word_tokenize(original_question)

            # Augment the question by replacing a random word with a synonym
            augmented_tokens = tokens.copy()
            replaced_words = set()  # Set to store replaced words
            for i, word in enumerate(tokens):
                synsets = wordnet.synsets(word)
                if synsets:
                    synonyms = []
                    for synset in synsets:
                        for lemma in synset.lemmas():
                            synonyms.append(lemma.name())
                    if synonyms:
                        new_word = random.choice(synonyms)
                        if new_word not in replaced_words:  # Avoid duplicates
                            augmented_tokens[i] = new_word
                            replaced_words.add(new_word)

            # Reconstruct the augmented question
            augmented_question = ' '.join(augmented_tokens)

            augmented_qa = {
                'question': original_question,
                'augmented_questions': [augmented_question],
                'answers': [{'text': answer}]
            }
            augmented_qas.append(augmented_qa)

        augmented_paragraph = {
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        }
        augmented_paragraphs.append(augmented_paragraph)

    augmented_entry = {'paragraphs': augmented_paragraphs}
    augmented_data.append(augmented_entry)
# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_4.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)


In [1]:
import json
import random
import nltk
from transformers import MarianMTModel, MarianTokenizer


# Read the input data from a JSON file
with open('/Users/yangl/Downloads/answers.json', 'r') as file:
    data = json.load(file)


# Initialize the back translation models and tokenizers
src_lang = 'en'
target_lang = 'fr'  # Specify the target language for back translation
model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{target_lang}'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    paragraphs = entry['paragraphs']
    augmented_paragraphs = []
    for paragraph in paragraphs:
        qas = paragraph['qas']
        augmented_qas = []
        for qa in qas:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Perform back translation
            inputs = tokenizer(original_question, truncation=True, padding='longest', return_tensors='pt')
            translated = model.generate(**inputs, max_length=128, num_beams=5, num_return_sequences=1)
            back_translated = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
            augmented_questions = [original_question] + back_translated

            augmented_qa = {
                'question': original_question,
                'augmented_questions': augmented_questions,
                'answers': [{'text': answer}]
            }
            augmented_qas.append(augmented_qa)

        augmented_paragraph = {
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        }
        augmented_paragraphs.append(augmented_paragraph)

    augmented_entry = {'paragraphs': augmented_paragraphs}
    augmented_data.append(augmented_entry)


# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_6.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)


  from .autonotebook import tqdm as notebook_tqdm
