In [1]:
#Synonym Replacement
import nltk
from nltk.corpus import wordnet
import random

nltk.download("wordnet")

def synonym_replacement(sentence, n=2):
    words = sentence.split()
    new_words = words.copy()
    random.shuffle(new_words)
    for word in new_words:
        synonyms = wordnet.synsets(word)
        if synonyms:
            synonym = synonyms[0].lemmas()[0].name()
            sentence = sentence.replace(word, synonym, 1)
            n -= 1
        if n <= 0:
            break
    return sentence

# Example sentence
sentence = "The girl is walking on the street."
print("Original:", sentence)
print("Augmented:", synonym_replacement(sentence))


[nltk_data] Downloading package wordnet to /root/nltk_data...


Original: The girl is walking on the street.
Augmented: The girl is walk on the street.


In [2]:
#Random Insertion and Deletion
import random

def random_insertion(sentence, word_list, n=1):
    words = sentence.split()
    for _ in range(n):
        random_word = random.choice(word_list)
        random_position = random.randint(0, len(words))
        words.insert(random_position, random_word)
    return " ".join(words)

def random_deletion(sentence, p=0.2):
    words = sentence.split()
    if len(words) == 1:
        return sentence
    return " ".join([word for word in words if random.random() > p])

# Example usage
sentence = "The cat sat on the mat."
print("Insertion:", random_insertion(sentence, ["fluffy", "tiny"]))
print("Deletion:", random_deletion(sentence))


Insertion: The cat sat on the mat. fluffy
Deletion: The cat sat on the mat.


In [3]:
#Back translation
#This technique involves translating a sentence to another language and then translating it back to the original language

from transformers import MarianMTModel, MarianTokenizer

# Load translation model
model_name = "Helsinki-NLP/opus-mt-en-de"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def back_translate(sentence):
    # English to German
    tokens = tokenizer(sentence, return_tensors="pt", padding=True)
    translated = model.generate(**tokens)
    german = tokenizer.decode(translated[0], skip_special_tokens=True)

    # German back to English
    model_name_back = "Helsinki-NLP/opus-mt-de-en"
    tokenizer_back = MarianTokenizer.from_pretrained(model_name_back)
    model_back = MarianMTModel.from_pretrained(model_name_back)
    tokens_back = tokenizer_back(german, return_tensors="pt", padding=True)
    back_translated = model_back.generate(**tokens_back)
    return tokenizer_back.decode(back_translated[0], skip_special_tokens=True)

# Example sentence
sentence = "The boy came from the school."
print("Original:", sentence)
print("Back-translated:", back_translate(sentence))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Original: The boy came from the school.


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Back-translated: The boy came from school.


In [5]:
#Noise injection - This involves adding small errors, such as typos or misspellings"
import random

def noise_injection(sentence, p=0.2):
    words = list(sentence)
    noisy_sentence = []
    for word in words:
        if random.random() < p:
            noisy_sentence.append(word + random.choice("abcdefghijklmnopqrstuvwxyz"))
        else:
            noisy_sentence.append(word)
    return "".join(noisy_sentence)

# Example usage
sentence = "The cat sat on the mat."
print("Original:", sentence)
print("Noisy version:", noise_injection(sentence))


Original: The cat sat on the mat.
Noisy version: The ctat sath on the mat.r


In [1]:
#Contextual Embedding-Based Augmentation (EDA)
import random
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForMaskedLM.from_pretrained("bert-base-uncased")

def contextual_embedding_augmentation(sentence, num_replacements=1):
    words = sentence.split()
    # Select random words for replacement
    random_word_indices = random.sample(range(len(words)), num_replacements)

    for index in random_word_indices:
        original_word = words[index]

        # Mask the word to be replaced
        tokens = tokenizer(sentence, return_tensors="pt")
        tokens["input_ids"][0, index + 1] = tokenizer.mask_token_id  # Mask the selected word

        # Predict replacements for the masked word
        with torch.no_grad():
            outputs = model(**tokens)
            predictions = outputs.logits

        # Get the top prediction for the masked token
        mask_token_index = torch.where(tokens["input_ids"] == tokenizer.mask_token_id)[1]
        predicted_token_id = predictions[0, mask_token_index].topk(1).indices[0].item()
        predicted_word = tokenizer.decode([predicted_token_id])

        # Replace the original word with the predicted word
        words[index] = predicted_word

    return " ".join(words)

# Example sentence
sentence = "The cat sat on the mat."
print("Original:", sentence)
print("Augmented:", contextual_embedding_augmentation(sentence))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original: The cat sat on the mat.
Augmented: The cat sat on the floor
