In [None]:
# Synonym Replacement

import json
import nlpaug.augmenter.word as naw

# Read the input data from a JSON file. The encoding is given explicitly so the
# read does not depend on the platform's default text encoding.
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Build the WordNet synonym augmenter a single time: its constructor arguments
# never vary, so there is no reason to construct it again for every question.
augmenter = naw.SynonymAug(aug_src='wordnet')

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        for qa in paragraph['qas']:
            original_question = qa['question']
            # Only the first listed answer is carried into the output
            answer = qa['answers'][0]['text']

            # Request five synonym-substituted variants of the question
            augmented_questions = augmenter.augment(original_question, n=5)

            augmented_qas.append({
                'question': original_question,
                'augmented_questions': augmented_questions,
                'answers': [{'text': answer}]
            })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file. This runs after the entry loop so the
# file is written exactly once, and is still produced when there are no entries.
with open('/Users/yangl/Downloads/augmented_data_3.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)


In [None]:
# Synonym replacement with sentence shuffling
import json
from transformers import BertTokenizer, BertForSequenceClassification
import nlpaug.augmenter.word as naw

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)


def check_answer_coherence(answer, context):
    """Return the arg-max class index the classifier assigns to "<answer> <context>".

    The answer and the context are joined with one space, tokenized (truncated
    to the tokenizer's limit) and passed through ``model``; the index of the
    largest logit is returned as a plain int.

    NOTE(review): 'bert-base-uncased' is a base checkpoint, so the
    sequence-classification head loaded on top of it is presumably untrained
    (transformers warns about newly initialised weights when loading it).
    Confirm that a fine-tuned checkpoint is not intended here before trusting
    the returned label as a coherence judgement.
    """
    text = f"{answer} {context}"
    inputs = tokenizer(text, truncation=True, padding=True, return_tensors='pt')
    outputs = model(**inputs)
    return outputs.logits.argmax().item()


# Read the input data from a JSON file (explicit encoding, independent of the
# platform default)
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Construct both augmenters once, outside the loops. Their arguments are
# constant, and building the contextual augmenter for every QA pair would
# repeat its model set-up each time.
augmenter_question = naw.SynonymAug(aug_src='wordnet')
augmenter_answer = naw.ContextualWordEmbsAug(model_path=model_name, action='substitute')

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        for qa in paragraph['qas']:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Apply Synonym Replacement using nlpaug (five variants per question)
            augmented_questions = augmenter_question.augment(original_question, n=5)

            # Augment the answer by contextual word substitution (BERT-based).
            # Depending on the nlpaug version, augment() hands back either a
            # string or a list of strings; normalise to one string so that the
            # coherence input and the stored 'text' keep the same type as the
            # source answers.
            answer_result = augmenter_answer.augment(answer)
            if isinstance(answer_result, (list, tuple)):
                augmented_answer = answer_result[0] if answer_result else answer
            else:
                augmented_answer = answer_result

            # Keep the pair only when the classifier returns label 1
            coherence_label = check_answer_coherence(augmented_answer, paragraph['context'])
            if coherence_label == 1:
                augmented_qas.append({
                    'question': original_question,
                    'augmented_questions': augmented_questions,
                    'answers': [{'text': augmented_answer}]
                })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_3.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)

In [None]:
# random deletion 

import json
import nltk
import random

# Load the question/answer corpus that is going to be augmented
with open('/Users/yangl/Downloads/answers.json', 'r') as file:
    data = json.load(file)

# Walk entry -> paragraph -> question and build one shortened question per QA pair
augmented_data = []
for entry in data['data']:
    paragraph_records = []
    for paragraph in entry['paragraphs']:
        qa_records = []
        seen_questions = set()  # shortened questions already emitted for this paragraph
        for qa in paragraph['qas']:
            question_text = qa['question']
            answer_text = qa['answers'][0]['text']

            # Split the question into NLTK tokens and edit a private copy
            source_tokens = nltk.word_tokenize(question_text)
            kept_tokens = list(source_tokens)

            if len(source_tokens) > 1:
                # Remove between 1 and min(half of the tokens, 5) tokens,
                # choosing a fresh random position for every removal
                deletion_cap = min(len(source_tokens) // 2, 5)
                deletions_left = random.randint(1, deletion_cap)
                while deletions_left > 0:
                    if kept_tokens:
                        del kept_tokens[random.randint(0, len(kept_tokens) - 1)]
                    deletions_left -= 1

            # Glue the surviving tokens back together with single spaces
            shortened_question = ' '.join(kept_tokens)

            # Ignore empty results and anything already produced for this paragraph
            if not shortened_question or shortened_question in seen_questions:
                continue
            seen_questions.add(shortened_question)

            qa_records.append({
                'question': question_text,
                'augmented_questions': [shortened_question],
                'answers': [{'text': answer_text}]
            })

        paragraph_records.append({
            'qas': qa_records,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': paragraph_records})

# Write the collected records out as indented JSON
with open('/Users/yangl/Downloads/augmented_data_5.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)

In [None]:
#Random deletion with sentence shuffling
import json
import nltk
import random

# Read the input data from a JSON file (explicit encoding, independent of the
# platform default)
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        augmented_questions_set = set()  # Augmented questions actually emitted for this paragraph
        for qa in paragraph['qas']:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Tokenize the original question
            tokens = nltk.word_tokenize(original_question)

            # Augment the question by randomly deleting words
            augmented_tokens = tokens.copy()
            if len(tokens) > 1:
                # Delete between 1 and min(half the tokens, 5) tokens. The cap is
                # always smaller than len(tokens), so at least one token survives.
                num_deletions = random.randint(1, min(len(tokens) // 2, 5))
                for _ in range(num_deletions):
                    augmented_tokens.pop(random.randrange(len(augmented_tokens)))

            # Reconstruct the augmented question
            augmented_question = ' '.join(augmented_tokens)

            # Skip empty results and questions already emitted for this paragraph
            if not augmented_question or augmented_question in augmented_questions_set:
                continue

            # Augment the answer by shuffling its tokens
            answer_tokens = nltk.word_tokenize(answer)
            random.shuffle(answer_tokens)
            augmented_answer = ' '.join(answer_tokens)

            # Tokenisation round-trip check: drop the pair when re-tokenizing the
            # joined answer does not give back the shuffled token list. This only
            # verifies that the text splits the same way again; it does not judge
            # whether the shuffled answer is meaningful.
            if nltk.word_tokenize(augmented_answer) != answer_tokens:
                continue

            # Register the question only now that the pair is really kept, so a
            # pair dropped above cannot block a later identical question.
            augmented_questions_set.add(augmented_question)

            augmented_qas.append({
                'question': original_question,
                'augmented_questions': [augmented_question],
                'answers': [{'text': augmented_answer}]
            })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_5.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)

In [None]:
#random word insertion

import json
import nltk
from nltk.corpus import wordnet
import random

def _synonym_candidates(word):
    """Return WordNet lemma names that can stand in for ``word``.

    Every lemma of every synset found for ``word`` is collected. Lemma names
    join multi-word expressions with underscores (e.g. "ice_cream"); those are
    turned into spaces so the name reads as normal text. Names equal to
    ``word`` (ignoring case) are left out, because picking one of them would
    leave the question unchanged. Repeated names are kept, so a lemma that
    appears under several synsets is proportionally more likely to be drawn.
    """
    lowered = word.lower()
    candidates = []
    for synset in wordnet.synsets(word):
        for lemma in synset.lemmas():
            name = lemma.name().replace('_', ' ')
            if name.lower() != lowered:
                candidates.append(name)
    return candidates


# Read the input data from a JSON file (explicit encoding, independent of the
# platform default)
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        for qa in paragraph['qas']:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Tokenize the original question
            tokens = nltk.word_tokenize(original_question)

            # Augment the question by substituting tokens with WordNet synonyms.
            # Every token that has at least one candidate is considered, not
            # just a single random one.
            # NOTE(review): the cell heading calls this "random word insertion",
            # but the code replaces tokens in place - confirm which is intended.
            augmented_tokens = tokens.copy()
            replaced_words = set()  # Synonyms already used in this question
            for i, word in enumerate(tokens):
                synonyms = _synonym_candidates(word)
                if not synonyms:
                    continue
                new_word = random.choice(synonyms)
                if new_word not in replaced_words:  # Use each synonym at most once
                    augmented_tokens[i] = new_word
                    replaced_words.add(new_word)

            # Reconstruct the augmented question
            augmented_question = ' '.join(augmented_tokens)

            augmented_qas.append({
                'question': original_question,
                'augmented_questions': [augmented_question],
                'answers': [{'text': answer}]
            })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_4.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)


In [None]:
#Random word insertion and sentence shuffling
import json
import nltk
from nltk.corpus import wordnet
import random

# Read the input data from a JSON file (explicit encoding, independent of the
# platform default)
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        for qa in paragraph['qas']:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Tokenize the original question
            question_tokens = nltk.word_tokenize(original_question)

            # Augment the question by inserting a WordNet synonym in front of
            # the token it was drawn for. The output list is built left to
            # right instead of calling insert(i, ...) on a growing copy: with
            # insert, every earlier insertion shifts the later tokens, so an
            # index taken from the original list points at the wrong place.
            augmented_question_tokens = []
            replaced_words = set()  # Synonyms already inserted into this question
            for word in question_tokens:
                synonyms = [
                    lemma.name()
                    for synset in wordnet.synsets(word)
                    for lemma in synset.lemmas()
                ]
                if synonyms:
                    new_word = random.choice(synonyms)
                    if new_word not in replaced_words:  # Insert each synonym at most once
                        replaced_words.add(new_word)
                        # Lemma names join multi-word expressions with '_';
                        # add them as separate words rather than "ice_cream".
                        augmented_question_tokens.extend(new_word.split('_'))
                augmented_question_tokens.append(word)

            # Reconstruct the augmented question
            augmented_question = ' '.join(augmented_question_tokens)

            # Shuffle the words in the answer sentence
            answer_tokens = nltk.word_tokenize(answer)
            random.shuffle(answer_tokens)
            augmented_answer = ' '.join(answer_tokens)

            # Tokenisation round-trip check: drop the pair when re-tokenizing
            # the joined question or answer does not reproduce the token list
            # it was built from. This only verifies that the text splits the
            # same way again; it does not judge whether the text is meaningful.
            if nltk.word_tokenize(augmented_question) != augmented_question_tokens or \
                    nltk.word_tokenize(augmented_answer) != answer_tokens:
                continue

            augmented_qas.append({
                'question': original_question,
                'augmented_questions': [augmented_question],
                'answers': [{'text': augmented_answer}]
            })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_4.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)

In [None]:
#random swap
import json
import random

# Chance that any given adjacent pair of words is swapped during one pass
SWAP_PROBABILITY = 0.2
# Upper bound on swap passes per question. Without a bound the search never
# ends for a question with fewer than two words, or when every ordering it can
# reach has already been emitted for the paragraph.
MAX_SWAP_ATTEMPTS = 100

# Read the input data from a JSON file (explicit encoding, independent of the
# platform default)
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        augmented_questions_set = set()  # To store unique augmented questions
        for qa in paragraph['qas']:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Split the original question on whitespace
            tokens = original_question.split()

            # Augment the question by randomly swapping adjacent words; at most
            # one augmented question is produced per original question.
            augmented_question = None
            if len(tokens) > 1:
                for _ in range(MAX_SWAP_ATTEMPTS):
                    augmented_tokens = tokens.copy()
                    for i in range(len(augmented_tokens) - 1):
                        if random.random() < SWAP_PROBABILITY:
                            augmented_tokens[i], augmented_tokens[i + 1] = \
                                augmented_tokens[i + 1], augmented_tokens[i]

                    # Compare token lists rather than strings: an unswapped
                    # question whose original text has irregular spacing would
                    # otherwise look "different" after being re-joined.
                    if augmented_tokens == tokens:
                        continue

                    candidate = ' '.join(augmented_tokens)
                    if candidate not in augmented_questions_set:
                        augmented_question = candidate
                        break

            # No new word order found within the attempt budget: skip this pair
            if augmented_question is None:
                continue

            augmented_questions_set.add(augmented_question)
            augmented_qas.append({
                'question': original_question,
                'augmented_questions': [augmented_question],
                'answers': [{'text': answer}]
            })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_8.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)

In [None]:
#Random swap and sentence shuffling for answer
import json
import random

# Chance that any given adjacent pair of words is swapped during one pass
SWAP_PROBABILITY = 0.2
# Upper bound on swap passes per question. Without a bound the search never
# ends for a question with fewer than two words, or when every ordering it can
# reach has already been emitted for the paragraph.
MAX_SWAP_ATTEMPTS = 100

# Read the input data from a JSON file (explicit encoding, independent of the
# platform default)
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        augmented_questions_set = set()  # Augmented questions actually emitted for this paragraph
        for qa in paragraph['qas']:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Split the original question on whitespace
            tokens = original_question.split()

            # Augment the question by randomly swapping adjacent words; at most
            # one augmented question is produced per original question.
            augmented_question = None
            if len(tokens) > 1:
                for _ in range(MAX_SWAP_ATTEMPTS):
                    augmented_tokens = tokens.copy()
                    for i in range(len(augmented_tokens) - 1):
                        if random.random() < SWAP_PROBABILITY:
                            augmented_tokens[i], augmented_tokens[i + 1] = \
                                augmented_tokens[i + 1], augmented_tokens[i]

                    # Compare token lists rather than strings: an unswapped
                    # question whose original text has irregular spacing would
                    # otherwise look "different" after being re-joined.
                    if augmented_tokens == tokens:
                        continue

                    candidate = ' '.join(augmented_tokens)
                    if candidate not in augmented_questions_set:
                        augmented_question = candidate
                        break

            # No new word order found within the attempt budget: skip this pair
            if augmented_question is None:
                continue

            # Shuffle the words in the answer (one shuffle, no retry)
            answer_tokens = answer.split()
            shuffled_answer_tokens = answer_tokens.copy()
            random.shuffle(shuffled_answer_tokens)

            # Keep the pair only when the shuffle really changed the word
            # order. A one-word answer can never change, and a longer answer
            # can come back in its original order by chance; both are dropped.
            if shuffled_answer_tokens == answer_tokens:
                continue
            augmented_answer = ' '.join(shuffled_answer_tokens)

            # Register the question only now that the pair is really kept, so a
            # pair dropped above cannot block a later identical question.
            augmented_questions_set.add(augmented_question)
            augmented_qas.append({
                'question': original_question,
                'augmented_questions': [augmented_question],
                'answers': [{'text': augmented_answer}]
            })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file
with open('/Users/yangl/Downloads/augmented_data_8.json', 'w') as file:
    json.dump(augmented_data, file, indent=4)

In [None]:
# Back translation draft

import json
from transformers import MarianMTModel, MarianTokenizer

def _marian_translate(text, mt_model, mt_tokenizer):
    """Translate ``text`` with a Marian model and return the best beam as a string.

    The tokenizer is called directly (not via ``encode``) so that it returns an
    ``attention_mask`` alongside ``input_ids``; both are moved to the model's
    device and passed to ``generate``. Input longer than the tokenizer's limit
    is truncated.
    """
    inputs = mt_tokenizer(text, return_tensors='pt', truncation=True).to(mt_model.device)
    generated = mt_model.generate(**inputs, max_length=128, num_beams=5, num_return_sequences=1)
    return mt_tokenizer.decode(generated[0], skip_special_tokens=True)


# Read the input data from a JSON file (explicit encoding, matching the UTF-8
# output written at the end of this cell)
with open('/Users/yangl/Downloads/answers.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Initialize the back translation models and tokenizers. Both directions are
# loaded once here; loading the reverse model inside the question loop would
# repeat that work for every single question.
src_lang = 'en'
target_lang = 'fr'  # Specify the target language for back translation
model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{target_lang}'
model = MarianMTModel.from_pretrained(model_name)
tokenizer = MarianTokenizer.from_pretrained(model_name)

back_model_name = f'Helsinki-NLP/opus-mt-{target_lang}-{src_lang}'
back_model = MarianMTModel.from_pretrained(back_model_name)
back_tokenizer = MarianTokenizer.from_pretrained(back_model_name)

# Perform data augmentation for each entry
augmented_data = []
for entry in data['data']:
    augmented_paragraphs = []
    for paragraph in entry['paragraphs']:
        augmented_qas = []
        for qa in paragraph['qas']:
            original_question = qa['question']
            answer = qa['answers'][0]['text']

            # Forward translation: English to Target Language
            translated_question = _marian_translate(original_question, model, tokenizer)

            # Back translation: Target Language to English
            back_translated_question = _marian_translate(translated_question, back_model, back_tokenizer)

            # The original question is listed together with its back-translation
            augmented_questions = [original_question, back_translated_question]

            augmented_qas.append({
                'question': original_question,
                'augmented_questions': augmented_questions,
                'answers': [{'text': answer}]
            })

        augmented_paragraphs.append({
            'qas': augmented_qas,
            'context': paragraph['context'],
            'document_id': paragraph['document_id']
        })

    augmented_data.append({'paragraphs': augmented_paragraphs})

# Save the augmented data to a JSON file (non-ASCII characters are written as-is)
with open('/Users/yangl/Downloads/augmented_data_6.json', 'w', encoding='utf-8') as file:
    json.dump(augmented_data, file, indent=4, ensure_ascii=False)