## Natural Language Toolkit (NLTK) 



### Synonym Replacement:
 Swap out words with their synonyms. 

Original: "The movie was fantastic and very enjoyable."

Augmented: "The film was terrific and very pleasant."

In [1]:
import nltk
from nltk.corpus import wordnet
import random

In [3]:
#!pip install nltk


# need to download these resources first
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')

In [7]:
def synonym_replacement(sentence, n=1):
    """Replace up to ``n`` distinct words in ``sentence`` with WordNet synonyms.

    The sentence is split on whitespace. Distinct words are visited in random
    order; for each one that has at least one usable synonym, its first
    occurrence is swapped for a randomly chosen synonym.

    Args:
        sentence: Text to augment.
        n: Maximum number of words to replace. Values <= 0 replace nothing.

    Returns:
        The augmented sentence with tokens joined by single spaces. Fewer than
        ``n`` words are replaced when not enough words have a usable synonym.
    """
    words = sentence.split()
    new_words = words.copy()
    if n <= 0:
        return " ".join(new_words)

    # De-duplicate while keeping sentence order (instead of going through a
    # set) so that the shuffle below is repeatable under a seeded RNG.
    candidates = list(dict.fromkeys(words))
    random.shuffle(candidates)

    num_replaced = 0
    for candidate in candidates:
        if num_replaced >= n:
            break

        lowered = candidate.lower()
        synonyms = set()
        for syn in wordnet.synsets(candidate):
            for lemma in syn.lemmas():
                name = lemma.name()
                # Skip multi-word lemmas (WordNet joins them with underscores,
                # e.g. 'big_cat') and the word itself, which would make the
                # "replacement" a no-op.
                if "_" not in name and name.lower() != lowered:
                    synonyms.add(name)

        if not synonyms:
            continue

        # Locate the word in the ORIGINAL token list: positions line up with
        # new_words, and this cannot hit a slot an earlier replacement filled.
        position = words.index(candidate)
        # Sort before choosing so the pick does not depend on set ordering.
        new_words[position] = random.choice(sorted(synonyms))
        num_replaced += 1

    return " ".join(new_words)

In [8]:
# Demo: ask synonym_replacement to swap up to two words of a sample sentence.
# The function draws from the `random` module, so the augmented sentence
# changes from run to run unless the RNG is seeded beforehand.
original_text = "The quick brown fox jumps over the lazy dog"
augmented_text = synonym_replacement(original_text, n=2)

# Print both versions so the substitutions are easy to spot.
print(f"Original: {original_text}")
print(f"Augmented: {augmented_text}")

Original: The quick brown fox jumps over the lazy dog
Augmented: The quick brown fob jumps ended the lazy dog


### Back Translation
Translate the text into another language and then translate it back into the original language.

Original (English): "I need to catch my flight soon."

Translate to French: "Je dois prendre mon vol bientôt."

Translate back to English: "I have to take my flight soon."

Install the dependencies first: `pip install transformers torch sentencepiece`

In [1]:
from transformers import pipeline

# Translation pipelines already built in this session, keyed by
# (source, target) language codes, so repeated calls reuse them instead of
# constructing the same translator again.
_TRANSLATION_PIPELINES = {}


def _get_translator(source_lang, target_lang):
    """Return the translation pipeline for one direction, building it on first use."""
    key = (source_lang, target_lang)
    if key not in _TRANSLATION_PIPELINES:
        _TRANSLATION_PIPELINES[key] = pipeline(
            "translation",
            model=f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}",
        )
    return _TRANSLATION_PIPELINES[key]


def back_translate(text, source_lang='en', intermediate_lang='fr', max_length=512):
    """
    Translates text to an intermediate language and back to the source language.

    Args:
        text: Text to augment.
        source_lang: Language code of ``text``; used to build the
            ``Helsinki-NLP/opus-mt-{source}-{intermediate}`` model name.
        intermediate_lang: Language code of the pivot language.
        max_length: ``max_length`` value forwarded to both translation calls.

    Returns:
        The round-tripped text. Empty input is returned unchanged without
        loading any model.
    """
    if not text:
        return text

    forward_translator = _get_translator(source_lang, intermediate_lang)
    backward_translator = _get_translator(intermediate_lang, source_lang)

    # Translate to the intermediate language
    intermediate_translation = forward_translator(text, max_length=max_length)
    intermediate_text = intermediate_translation[0]['translation_text']

    # Translate back to the source language
    back_translation = backward_translator(intermediate_text, max_length=max_length)
    augmented_text = back_translation[0]['translation_text']

    return augmented_text



In [2]:
# Demo: round-trip a sample sentence through the default en -> fr -> en path.
# back_translate builds two translation pipelines, so the first run may need
# to download the model files (see the progress bars in the output below).
original_text = "The new regulations will have a significant impact on the industry."

augmented_text = back_translate(original_text)

# Print both versions to compare the paraphrase with the input.
print(f"Original: {original_text}")
print(f"Augmented: {augmented_text}")

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

Device set to use mps:0


Original: The new regulations will have a significant impact on the industry.
Augmented: The new regulations will have a significant impact on industry.


### Random Insertion
Insert a random synonym of a word at a random position in the sentence.

Original: "The cat sat on the mat."

Augmented: "The furry cat sat on the mat."

In [5]:
import nltk
from nltk.corpus import wordnet
import random

In [3]:
def get_synonyms(word):
    """Gets single-word synonyms for a word using WordNet.

    Args:
        word: The word to look up.

    Returns:
        A sorted list of distinct lemma names taken from every synset of
        ``word``. Multi-word lemmas and the word itself (compared
        case-insensitively) are left out, so the list is empty when WordNet
        offers no real alternative.
    """
    lowered = word.lower()
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name()
            # Skip multi-word lemmas (WordNet joins them with underscores,
            # e.g. 'big_cat') and the query word, which is not a synonym of
            # itself.
            if "_" not in name and name.lower() != lowered:
                synonyms.add(name)
    # Sorted so the result order does not depend on set iteration order.
    return sorted(synonyms)


def random_insertion(sentence, n=1):
    """
    Randomly inserts n synonyms of existing words into the sentence.

    Args:
        sentence: Text to augment; it is split on whitespace.
        n: Number of synonyms to insert. Values <= 0 insert nothing.

    Returns:
        The sentence with the inserted words, tokens joined by single spaces.
        When no word has a usable synonym the tokens are returned unchanged.
    """
    words = sentence.split()
    new_words = words.copy()

    # Look up each distinct word once: the lookups do not depend on the
    # insertion loop, so repeating them for every insertion is wasted work.
    synonyms_by_word = {}
    for word in dict.fromkeys(words):
        lowered = word.lower()
        # Drop the word itself (inserting it would only duplicate a token) and
        # sort so the random pick does not depend on the lookup's ordering.
        synonyms_by_word[word] = sorted(
            synonym for synonym in get_synonyms(word) if synonym.lower() != lowered
        )

    # Repeated words stay repeated here, so they remain proportionally more
    # likely to be picked as the source of an insertion.
    words_with_synonyms = [word for word in words if synonyms_by_word[word]]
    if not words_with_synonyms:
        return " ".join(new_words)  # No words to add synonyms for

    for _ in range(n):
        # Pick a random word and one of its synonyms
        random_word = random.choice(words_with_synonyms)
        random_synonym = random.choice(synonyms_by_word[random_word])

        # Insert at a random position; len(new_words) is a valid index for
        # insert() and appends at the end.
        random_idx = random.randint(0, len(new_words))
        new_words.insert(random_idx, random_synonym)

    return " ".join(new_words)




In [6]:
# Demo: insert two synonyms of existing words into a sample sentence.
# Source word, synonym and insertion position are all picked with the
# `random` module, so the output changes between runs unless the RNG is
# seeded beforehand.
original_text = "The quick brown fox jumps over the lazy dog"

augmented_text = random_insertion(original_text, n=2)

# Print both versions so the inserted words are easy to spot.
print(f"Original: {original_text}")
print(f"Augmented: {augmented_text}")

Original: The quick brown fox jumps over the lazy dog
Augmented: The quick spry brown brownness fox jumps over the lazy dog


### Random Deletion
Randomly remove words from the sentence. This forces the model to learn from incomplete sentences.

Original: "The cat sat on the brown mat."

Augmented: "The cat sat on the mat."

In [9]:
def random_deletion(sentence, p=0.1):
    """
    Randomly deletes words in a sentence with probability p.

    Args:
        sentence: Text to augment; it is split on whitespace.
        p: Probability of dropping each word, decided independently per word.
            ``p=0`` keeps every word and ``p=1`` drops every word.

    Returns:
        The sentence unchanged when it has two words or fewer; otherwise the
        surviving words joined by single spaces. If every word was dropped, a
        single randomly chosen original word is returned instead.
    """
    words = sentence.split()

    # Don't delete if the sentence is too short
    if len(words) <= 2:
        return sentence

    # random.random() lies in [0.0, 1.0), so `>= p` keeps a word with
    # probability exactly (1 - p); a strict `>` would also drop a word on a
    # draw of exactly 0.0, even when p is 0.
    remaining_words = [word for word in words if random.random() >= p]

    # If all words were deleted, return at least one random word
    if not remaining_words:
        return random.choice(words)

    return " ".join(remaining_words)


In [10]:
# Demo: drop each word of a sample sentence with probability 0.2.
# Which words disappear is decided with the `random` module, so the output
# changes between runs unless the RNG is seeded beforehand.
original_text = "The quick brown fox jumps over the lazy dog"

augmented_text = random_deletion(original_text, p=0.2)

# Print both versions so the removed words are easy to spot.
print(f"Original: {original_text}")
print(f"Augmented: {augmented_text}")

Original: The quick brown fox jumps over the lazy dog
Augmented: The quick brown fox jumps over lazy dog
