In [11]:
import random
import string
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from faker import Faker
from typing import List
from nltk.corpus import words

class Perturbations:
    """Text perturbations applied to the sentence columns of GLUE tasks.

    The class offers two layers:

    * sentence-level helpers (``removeNouns``, ``removeVerbs``,
      ``random_swap_words``, ``add_random_words``, ``replace_characters``,
      ``changeGender``) that take one string and return the perturbed string;
    * dataset-level methods (``noNouns``, ``noVerbs``, ``noFirst``, ``noLast``,
      ``swapText``, ``addText``, ``changeChar``, ``bias``) that take a task name
      and a dataset, apply one helper to every sentence of the task's text
      column(s) and return ``(sentences1, sentences2)``. ``sentences2`` is
      ``None`` for single-sentence tasks.

    The POS-based helpers rely on NLTK's tokenizer and tagger; their data
    packages must already be installed.
    """

    # Penn Treebank noun tags. 'NNPS' (plural proper noun) is included so that
    # removeNouns really drops every noun category.
    _NOUN_TAGS = frozenset({"NN", "NNS", "NNP", "NNPS"})
    # Penn Treebank verb tags.
    _VERB_TAGS = frozenset({"VB", "VBD", "VBG", "VBN", "VBP", "VBZ"})

    # Word -> gender-swapped word. Lookup is exact and case-sensitive.
    # NOTE: "her" is ambiguous (possessive vs. object pronoun); without POS
    # information it is mapped to "him", while capitalised "Her" maps to "His".
    _GENDER_SWAPS = {
        "batman": "batwoman", "batwoman": "batman",
        "boy": "girl", "girl": "boy",
        "boyfriend": "girlfriend", "girlfriend": "boyfriend",
        "father": "mother", "mother": "father",
        "husband": "wife", "wife": "husband",
        "he": "she", "she": "he", "He": "She", "She": "He",
        "his": "her", "His": "Her", "Her": "His",
        "him": "her", "her": "him",
        "male": "female", "female": "male",
        "man": "woman", "woman": "man",
        "Mr": "Ms", "Ms": "Mr",
        "sir": "madam", "madam": "sir",
        "son": "daughter", "daughter": "son",
        "uncle": "aunt", "aunt": "uncle",
        "brother": "sister", "sister": "brother",
        "king": "queen", "queen": "king",
        "prince": "princess", "princess": "prince",
        "grandfather": "grandmother", "grandmother": "grandfather",
        "nephew": "niece", "niece": "nephew",
        "father-in-law": "mother-in-law", "mother-in-law": "father-in-law",
        "son-in-law": "daughter-in-law", "daughter-in-law": "son-in-law",
        "stepfather": "stepmother", "stepmother": "stepfather",
        "godfather": "godmother", "godmother": "godfather",
    }

    # Source of randomness. Defaults to the global ``random`` module so that
    # ``random.seed(...)`` keeps working; replaced by a private generator when
    # a seed is passed to ``__init__``.
    _rng = random

    def __init__(self, seed=None):
        """Build the task -> column-name table and load the random-word list.

        Args:
            seed: optional seed for the stochastic perturbations. When given,
                this instance uses its own ``random.Random(seed)``; when
                ``None`` (default) the global ``random`` module is used.
        """
        # Task name -> (first text column, second text column or None).
        self.task_to_keys = {
            "cola": ("sentence", None),
            "mnli": ("premise", "hypothesis"),
            "mnli-mm": ("premise", "hypothesis"),
            "ax": ("premise", "hypothesis"),
            "mrpc": ("sentence1", "sentence2"),
            "qnli": ("question", "sentence"),
            "qqp": ("question1", "question2"),
            "rte": ("sentence1", "sentence2"),
            "sst2": ("sentence", None),
            "stsb": ("sentence1", "sentence2"),
            "wnli": ("sentence1", "sentence2"),
        }
        # Vocabulary that add_random_words draws from.
        self.word_list = words.words()
        if seed is not None:
            self._rng = random.Random(seed)

    # ------------------------------------------------------------------
    # Sentence-level helpers
    # ------------------------------------------------------------------

    def _remove_tagged(self, sentence, banned_tags):
        """Tokenize and POS-tag ``sentence``; drop tokens tagged in ``banned_tags``."""
        tagged_tokens = pos_tag(word_tokenize(sentence))
        return ' '.join(token for token, tag in tagged_tokens if tag not in banned_tags)

    def removeNouns(self, sentence):
        """Return ``sentence`` without tokens tagged as nouns.

        The result is the remaining tokens joined by single spaces.
        """
        return self._remove_tagged(sentence, self._NOUN_TAGS)

    def removeVerbs(self, sentence):
        """Return ``sentence`` without tokens tagged as verbs.

        The result is the remaining tokens joined by single spaces.
        """
        return self._remove_tagged(sentence, self._VERB_TAGS)

    def random_swap_words(self, sentence):
        """Swap two randomly chosen tokens of ``sentence``.

        Sentences with fewer than two tokens are returned unchanged; otherwise
        the tokens are re-joined with single spaces.
        """
        tokens = word_tokenize(sentence)
        if len(tokens) < 2:
            return sentence
        i, j = self._rng.sample(range(len(tokens)), 2)
        tokens[i], tokens[j] = tokens[j], tokens[i]
        return " ".join(tokens)

    def add_random_words(self, sentence):
        """Insert random words from ``self.word_list`` into ``sentence``.

        One word is inserted per ten whitespace-separated words, so sentences
        shorter than ten words come back with only their whitespace
        normalised. Insertion positions never fall after the last word.
        """
        tokens = sentence.split()
        for _ in range(len(tokens) // 10):
            position = self._rng.randint(0, len(tokens) - 1)
            tokens.insert(position, self._rng.choice(self.word_list))
        return " ".join(tokens)

    def replace_characters(self, sentence, probability=0.10):
        """Replace each character with a random ASCII letter with ``probability``.

        Every character is a candidate, including spaces and punctuation.
        """
        # The probability draw happens before the replacement draw for each
        # character, so the consumption order of the generator is stable.
        return "".join(
            self._rng.choice(string.ascii_letters)
            if self._rng.random() < probability
            else char
            for char in sentence
        )

    def changeGender(self, str):
        """Swap gendered words in ``str`` using ``_GENDER_SWAPS``.

        The text is split on single spaces only, so spacing is preserved
        exactly and no trailing space is added. Words attached to punctuation
        (e.g. ``"he,"``) are not matched.

        The parameter keeps its historical name ``str`` for compatibility; the
        builtin is not needed inside this method.
        """
        swaps = self._GENDER_SWAPS
        return ' '.join(swaps.get(token, token) for token in str.split(' '))

    @staticmethod
    def _drop_first_word(sentence):
        """Remove everything up to and including the first space, if any."""
        return sentence.split(" ", 1)[1] if " " in sentence else sentence

    @staticmethod
    def _drop_last_word(sentence):
        """Remove the last space-separated word, keeping terminal punctuation.

        Sentences without a space (including the empty string) are returned
        unchanged. The final character is re-attached only when it is a
        punctuation mark, so ``"The cat sat."`` becomes ``"The cat."`` and
        ``"I like cats"`` becomes ``"I like"``.
        """
        if " " not in sentence:
            return sentence
        head = sentence.rsplit(" ", 1)[0]
        final_char = sentence[-1]
        return head + final_char if final_char in string.punctuation else head

    # ------------------------------------------------------------------
    # Dataset-level perturbations
    # ------------------------------------------------------------------

    def _apply(self, transform, task, dataset):
        """Apply ``transform`` to every sentence of the text column(s) of ``task``.

        Args:
            transform: callable mapping one sentence to its perturbed form.
            task: key of ``self.task_to_keys``.
            dataset: mapping-like object where ``dataset[column]`` yields the
                sentences of that column.

        Returns:
            ``(sentences1, sentences2)``; ``sentences2`` is ``None`` when the
            task has a single text column.

        Raises:
            KeyError: if ``task`` is unknown.
        """
        first_key, second_key = self.task_to_keys[task]
        # The first column is fully processed before the second one.
        first = [transform(sentence) for sentence in dataset[first_key]]
        if second_key is None:
            return first, None
        second = [transform(sentence) for sentence in dataset[second_key]]
        return first, second

    def noNouns(self, task, dataset):
        """Remove nouns from every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self.removeNouns, task, dataset)

    def noVerbs(self, task, dataset):
        """Remove verbs from every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self.removeVerbs, task, dataset)

    def noFirst(self, task, dataset):
        """Drop the first word of every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self._drop_first_word, task, dataset)

    def noLast(self, task, dataset):
        """Drop the last word of every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self._drop_last_word, task, dataset)

    def swapText(self, task, dataset):
        """Swap two random tokens in every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self.random_swap_words, task, dataset)

    def addText(self, task, dataset):
        """Insert random words into every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self.add_random_words, task, dataset)

    def changeChar(self, task, dataset):
        """Randomly replace characters in every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self.replace_characters, task, dataset)

    def bias(self, task, dataset):
        """Swap gendered words in every sentence of ``dataset`` (see ``_apply``)."""
        return self._apply(self.changeGender, task, dataset)


In [12]:
from datasets import Dataset

class DatasetClass:
    """Rebuilds a ``Dataset`` from perturbed sentence lists and their labels."""

    def __init__ (self):
        """Nothing to initialise; the class only groups ``getDataset``."""

    def getDataset(self,task,  sentences1, sentences2, labels):
        """Assemble a ``Dataset`` whose columns follow the naming of ``task``.

        Args:
            task: task name; must be one of the keys of the table below.
            sentences1: values for the task's first text column.
            sentences2: values for the second text column; ignored when the
                task has only one text column.
            labels: values stored under the ``'label'`` column.

        Returns:
            The result of ``Dataset.from_dict`` on the assembled columns.

        Raises:
            KeyError: if ``task`` is not a known task name.
        """
        # Task name -> (first text column, second text column or None).
        column_names = dict(
            [
                ("cola", ("sentence", None)),
                ("mnli", ("premise", "hypothesis")),
                ("mnli-mm", ("premise", "hypothesis")),
                ("ax", ("premise", "hypothesis")),
                ("mrpc", ("sentence1", "sentence2")),
                ("qnli", ("question", "sentence")),
                ("qqp", ("question1", "question2")),
                ("rte", ("sentence1", "sentence2")),
                ("sst2", ("sentence", None)),
                ("stsb", ("sentence1", "sentence2")),
                ("wnli", ("sentence1", "sentence2")),
            ]
        )
        first_column, second_column = column_names[task]

        # Insertion order fixes the column order: text column(s), then label.
        columns = {first_column: sentences1}
        if second_column is not None:
            columns[second_column] = sentences2
        columns['label'] = labels
        return Dataset.from_dict(columns)



In [None]:
from datasets import load_dataset
# Perturb every available split of each GLUE task with every perturbation and
# save each result to disk as "<task><split tag><perturbation>".
tasks = ['cola', 'sst2', 'mrpc', 'stsb', 'qqp', 'mnli', 'qnli', 'rte', 'wnli']
pertNames = ["noNouns", "noVerbs", "noFirst", "noLast", "swapText", "addText", "changeChar", "bias"]
pertClass = Perturbations()
datasetClass = DatasetClass()

# Split name -> tag used in the saved directory name. One table replaces the
# former copy-pasted 'mnli' / non-'mnli' branches; the dict order is also the
# processing order (train, validation split(s), test split(s)).
splitTags = {
    'train': 'train',
    'validation': 'validation',
    'validation_matched': 'validationm',
    'validation_mismatched': 'validationmm',
    'test': 'test',
    'test_matched': 'testm',
    'test_mismatched': 'testmm',
}

for task in tasks:
    dataset = load_dataset('glue', task)
    # Only the splits this task actually provides are processed.
    availableSplits = [split for split in splitTags if split in dataset]

    for pert in pertNames:
        perturb = getattr(pertClass, pert)

        # 1) Perturb every split first, so nothing is written to disk if a
        #    perturbation fails part-way through.
        perturbed = {split: perturb(task, dataset[split]) for split in availableSplits}

        # 2) Re-attach the original labels.
        rebuilt = {
            split: datasetClass.getDataset(task, sentences1, sentences2, dataset[split]['label'])
            for split, (sentences1, sentences2) in perturbed.items()
        }

        # 3) Persist, e.g. "mnlivalidationmnoNouns" or "colatrainbias".
        for split, perturbedDs in rebuilt.items():
            perturbedDs.save_to_disk(task + splitTags[split] + pert)
    

In [13]:
from datasets import load_dataset
# Apply every perturbation to the 'test' split of the GLUE 'ax' task and save
# each perturbed copy to disk as "<task>test<perturbation>".
tasks = ['ax']
pertNames = [
    "noNouns", "noVerbs", "noFirst", "noLast",
    "swapText", "addText", "changeChar", "bias",
]
datasetClass = DatasetClass()
pertClass = Perturbations()

for task in tasks:
    dataset = load_dataset('glue', task)
    for pert in pertNames:
        # Look the perturbation method up by name, then run it on the test split.
        perturb = getattr(pertClass, pert)
        testS1, testS2 = perturb(task, dataset['test'])
        # Labels are copied unchanged from the original split.
        testLabels = dataset['test']['label']
        testDs = datasetClass.getDataset(task, testS1, testS2, testLabels)
        testDs.save_to_disk(f"{task}test{pert}")
        

Found cached dataset glue (C:/Users/NIT/.cache/huggingface/datasets/glue/ax/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/1 [00:00<?, ?it/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1104 [00:00<?, ? examples/s]