In [None]:
!pip3 install nltk==3.6.5
!pip install transformers
!pip install sentence_transformers
!pip install datasets
!pip install happytransformer

In [None]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, util
from happytransformer import HappyTextToText, TTSettings

import re
import random
from collections import Counter
import spacy
import json

In [None]:
# NLTK resources needed by the cells below: WordNet for the `wn` synset lookups,
# Punkt for `nltk.word_tokenize`, and the perceptron tagger for `nltk.pos_tag`.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# T5 grammar-correction model; the rewriting functions below pass their generated
# sentences through it (prefixed with "grammar: ") before returning them.
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")
# spaCy pipeline; its dependency labels are used to look for a direct object ("dobj").
spacy_model = spacy.load("en_core_web_sm")

In [None]:
# Dataset whose 'train' split is iterated by the test_* helpers below. The code in this
# notebook reads the fields 'hypothesis', 'label', 'wsd' and 'srl' from each example.
dataset = load_dataset("tommasobonomo/sem_augmented_fever_nli")

In [None]:
def get_synonym(synset):
    '''
    Return the name of the second lemma of `synset`, or None.

    The first lemma is skipped because it is usually the original word itself.
    None is returned when the synset has fewer than two lemmas or when the
    candidate is a multi-word expression (its name contains '_').
    '''
    lemmas = synset.lemmas()
    if len(lemmas) < 2:
        return None
    candidate = lemmas[1].name()
    if '_' in candidate:
        return None
    return candidate

def get_hypernym(synset):
    '''
    Return the first lemma name of the first hypernym of `synset`, or None.

    None is returned when the synset has no hypernyms or when that name is a
    multi-word expression (contains '_').
    '''
    parents = synset.hypernyms()
    if not parents:
        return None
    candidate = parents[0].lemmas()[0].name()
    if '_' in candidate:
        return None
    return candidate

def get_hyponym(synset):
    '''
    Return the first lemma name of the first hyponym of `synset`, or None.

    None is returned when the synset has no hyponyms or when that name is a
    multi-word expression (contains '_').
    '''
    children = synset.hyponyms()
    if not children:
        return None
    candidate = children[0].lemmas()[0].name()
    if '_' in candidate:
        return None
    return candidate

def get_meronym(synset):
    '''
    Return the first lemma name of the first part-meronym of `synset`, or None.

    None is returned when the synset has no part meronyms or when that name is a
    multi-word expression (contains '_').
    '''
    parts = synset.part_meronyms()
    if not parts:
        return None
    candidate = parts[0].lemmas()[0].name()
    if '_' in candidate:
        return None
    return candidate

def get_antonym(synset):
    '''
    Return the name of the first antonym found among the lemmas of `synset`.

    The lemmas are scanned in order and the first lemma that has at least one
    antonym decides the result. None is returned when no lemma has an antonym.
    '''
    for lemma in synset.lemmas():
        antonyms = lemma.antonyms()
        # Keep scanning: returning here unconditionally would only ever
        # inspect the first lemma of the synset.
        if antonyms:
            return antonyms[0].name()
    return None

def augment_sentence(sentence_unit, sentence, typeOfAug, POS):
    '''
        Substitute one word of `sentence` with a WordNet-derived replacement.

        sentence_unit: dataset example; its ['wsd']['hypothesis'] annotations are scanned in order.
        typeOfAug: callable taking a synset and returning the replacement word or None
                   (get_synonym, get_hypernym, get_hyponym, get_meronym).
        POS: part of speech of the word to augment ('NOUN', 'VERB', 'ADJ', ...).

        The first annotation that yields a replacement wins and every occurrence of its
        text in `sentence` is replaced. If none does, `sentence` is returned unchanged.
    '''
    for token in sentence_unit['wsd']['hypothesis']:
        if token['pos'] != POS:
            continue
        offset = token['wnSynsetOffset']
        if offset == 'O':  # no WordNet sense attached to this token
            continue
        surface = token['text']
        if surface not in sentence or surface.istitle():  # title-cased words are skipped to avoid proper nouns
            continue
        # the offset string is "<digits><pos letter>"
        synset = wn.synset_from_pos_and_offset(offset[-1], int(offset[:-1]))
        replacement = typeOfAug(synset)
        if replacement:
            return sentence.replace(surface, replacement)
    return sentence

In [None]:
def test_augment():
    '''
    Manual smoke test: for every training example apply one randomly chosen
    noun augmentation and print the pairs where the hypothesis changed.
    '''
    labels = ('syn', 'hyper', 'hypo', 'mer')
    augmenters = (get_synonym, get_hypernym, get_hyponym, get_meronym)
    for example in dataset['train']:
        choice = random.randint(0, 3)
        original = example['hypothesis']
        augmented = augment_sentence(example, original, augmenters[choice], 'NOUN')
        if augmented == original:
            continue
        print(labels[choice])
        print(example['hypothesis'])
        print(augmented)
        print('#####')
# test_augment()

In [None]:
def isVerb(example, idx, pos_tag):
    '''
    Tell whether the SRL token at position `idx` of the hypothesis is a verb.

    example: dataset sample (its ['srl']['hypothesis']['tokens'] list is read)
    idx: position of the token to check; positions past the end give False
    pos_tag: list of (word, tag) pairs for the sentence

    Returns True when some pair in `pos_tag` has the same text as the token and
    one of the verb tags VB, VBD, VBG, VBN, VBP, VBZ.
    '''
    tokens = example['srl']['hypothesis']['tokens']
    if idx >= len(tokens):
        return False
    word = tokens[idx]['rawText']
    verb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    return any(pair[0] == word and pair[1] in verb_tags for pair in pos_tag)

def get_verb_index(example):
    '''
    Return, for every SRL annotation of the hypothesis, the token positions that
    are tagged as verbs and lie outside the annotation's role spans.

    The positions examined are the gaps before each role span (starting from the
    end of the previous one) plus everything after the last span, up to the number
    of POS-tagged words. An annotation without roles yields an empty list.
    '''
    tagged = nltk.pos_tag(nltk.word_tokenize(example['hypothesis']))

    result = []
    for annotation in example['srl']['hypothesis']['annotations']:
        spans = [role['span'] for role in annotation['verbatlas']['roles']]
        gaps = []
        cursor = 0
        for span in spans:
            gaps.append(range(cursor, span[0]))
            cursor = span[1]
        if spans:
            # tail of the sentence after the last role
            gaps.append(range(cursor, len(tagged)))
        result.append([pos for gap in gaps for pos in gap if isVerb(example, pos, tagged)])
    return result

def get_roles_indices(example):
    '''
    Return the spans of the roles in the hypothesis.
    The result is a list of lists: one inner list per SRL annotation (verb),
    holding the 'span' of each role attached to that verb.
    '''
    return [
        [role['span'] for role in annotation['verbatlas']['roles']]
        for annotation in example['srl']['hypothesis']['annotations']
    ]

def get_roles(example):
    '''
    Return the role names in the hypothesis.
    The result is a list of lists: one inner list per SRL annotation (verb),
    holding the name (as a string) of each role attached to that verb.
    '''
    return [
        [str(role['role']) for role in annotation['verbatlas']['roles']]
        for annotation in example['srl']['hypothesis']['annotations']
    ]

def get_index(example, word):
    '''
    Return the position of the first SRL token of the hypothesis whose raw text
    equals `word`, or -1 when there is no such token.
    '''
    tokens = example['srl']['hypothesis']['tokens']
    return next((pos for pos, token in enumerate(tokens) if token['rawText'] == word), -1)

In [None]:
def flip_verb(example):
    '''
    Try to invert the meaning of the hypothesis, either by replacing its single verb
    with a WordNet antonym or by toggling the negation of an auxiliary
    (is/are/was/were/has/have/had, including their "n't" contractions).

    The caller is expected to flip the label as well (CONTRADICTION <-> ENTAILMENT);
    examples labelled NEUTRAL are returned unchanged, as are sentences with more
    than one verb group or without a supported auxiliary.

    Returns the new sentence, or the original hypothesis when nothing was changed.
    '''
    new_sentence = example['hypothesis']

    # A NEUTRAL label cannot be flipped, so skip before doing any SRL/POS work.
    if example['label'] == 'NEUTRAL':
        return new_sentence

    l_roles = get_roles(example)
    verbs_indices = get_verb_index(example)

    # Single verb and no negation: try a direct antonym substitution first.
    if len(verbs_indices) == 1 and len(verbs_indices[0]) == 1 and not any('Negation' in roles for roles in l_roles):
        wsd_annotations = example['wsd']['hypothesis']
        verb_position = verbs_indices[0][0]
        # NOTE(review): the verb position comes from the SRL tokens and is used to index the WSD
        # annotations; this presumes both lists are aligned. Out-of-range positions fall
        # through to the auxiliary toggle instead of raising.
        if verb_position < len(wsd_annotations):
            ann = wsd_annotations[verb_position]
            if ann['wnSynsetOffset'] != 'O':
                synset = wn.synset_from_pos_and_offset(ann['wnSynsetOffset'][-1], int(ann['wnSynsetOffset'][:-1]))
                antonym = get_antonym(synset)
                if antonym:
                    return new_sentence.replace(ann['text'], antonym)

    # Avoid sentences with several verb groups: they are hard cases that could lead to wrong generations.
    verbs_count = sum(1 for verb_indices in verbs_indices if len(verb_indices) > 0)
    if verbs_count > 1:
        return new_sentence

    # Ordered (pattern, replacement) pairs; only the first pattern found in the sentence is applied.
    # The negated form is listed before the plain one so that ' is not ' is not turned into ' is not not '.
    negation_toggles = (
        (' is not ', ' is '), (' is ', ' is not '), (" isn't ", ' is '),
        (' are not ', ' are '), (' are ', ' are not '), (" aren't ", ' are '),
        (' was not ', ' was '), (' was ', ' was not '), (" wasn't ", ' was '),
        (' were not ', ' were '), (' were ', ' were not '), (" weren't ", ' were '),
        (' has not ', ' has '), (' has ', ' has not '), (" hasn't ", ' has '),
        (' have not ', ' have '), (' have ', ' have not '), (" haven't ", ' have '),
        (' had not ', ' had '), (' had ', ' had not '), (" hadn't ", ' had '),
    )
    for pattern, replacement in negation_toggles:
        if pattern in new_sentence:
            return new_sentence.replace(pattern, replacement)

    return new_sentence

def flip_with_anty(example):
    '''
    Apply flip_verb and then also replace, with its antonym, a noun inside the part of the
    sentence that receives the action (the 'Attribute' role or, failing that, the 'Theme' role).
    Because both the verb and the receiving element are flipped, the label is not flipped here.

    Returns the doubly flipped sentence, or the original hypothesis when either flip
    could not be performed.
    '''
    hypothesis = example['hypothesis']
    l_roles = get_roles(example)
    flipped = flip_verb(example)
    if flipped == hypothesis:
        return hypothesis

    # span of the first 'Attribute' (preferred) or 'Theme' role found
    span = []
    for group, roles in enumerate(l_roles):
        if 'Attribute' in roles:
            target = 'Attribute'
        elif 'Theme' in roles:
            target = 'Theme'
        else:
            continue
        span = get_roles_indices(example)[group][roles.index(target)]
        break

    if len(span) == 0:
        return hypothesis

    for position, annotation in enumerate(example['wsd']['hypothesis']):
        if annotation['pos'] != 'NOUN':
            continue
        if not (span[0] <= position < span[1]):
            continue
        if annotation['wnSynsetOffset'] == 'O' or annotation['text'].istitle():
            continue
        synset = wn.synset_from_pos_and_offset(annotation['wnSynsetOffset'][-1], int(annotation['wnSynsetOffset'][:-1]))
        antonym = get_antonym(synset)
        if antonym:
            candidate = flipped.replace(annotation['text'], antonym)
            # only the first noun with an antonym is tried
            return candidate if candidate != flipped else hypothesis
    return hypothesis

In [None]:
def test_flip():
    '''
    Manual smoke test: for every training example apply flip_with_anty (two chances out
    of three) or flip_verb (one out of three) and print the pairs that changed.
    '''
    names = {0: 'flip_with_anty', 1: 'flip_with_anty', 2: 'flip_verb'}
    for example in dataset['train']:
        pick = random.randint(0, 2)
        if pick == 2:
            result = flip_verb(example)
        else:
            result = flip_with_anty(example)  # more difficult to do
        if result == example['hypothesis']:
            continue
        print(names[pick] + ':')
        print('   ' + example['hypothesis'])
        print('   ' + result)
        if pick == 2:
            print('**FLIP LABEL**')
        print('#####')
#test_flip()

In [None]:
def get_tag(text, pos_tag):
    '''
    Return the tag of the first (word, tag) pair in `pos_tag` whose word equals `text`,
    or '' when the word is not present.
    '''
    return next((pair[1] for pair in pos_tag if pair[0] == text), '')

def _third_person_to_base(verb):
    '''Strip the 3rd-person-singular ending of a present-tense verb ("watches" -> "watch", "makes" -> "make").'''
    if verb == 'has':
        return 'have'
    # "-es" is only a suffix after sibilants and "o"; elsewhere the "e" belongs to the stem.
    if verb.endswith(('ches', 'shes', 'sses', 'xes', 'zes', 'oes')):
        return verb[:-2]
    if verb.endswith('s'):
        return verb[:-1]
    return verb


def get_verb_infos(example, direct_object):
    '''
    Return the tense of the verb, the grammatical number of the direct object and the
    passive verb phrase to use when turning the hypothesis into the passive voice.
    The number is that of the direct object because it becomes the subject of the passive sentence.

    example: dataset sample
    direct_object: text of the direct-object token

    Returns a dict {'tense': 'Past'|'Present', 'person': 'Singular'|'Plural', 'Passive': '<aux> <verb>'}.
    For a gerund (VBG) 'tense' and 'Passive' are empty strings. An empty dict is returned when
    no annotation has 'Agent' together with 'Theme' or 'Patient', when the direct object
    is not found in that role or has no noun tag, or when the verb tag is not supported.

    The passive form is built naively (base + "ed"/"d" for the present, the verb as-is for
    the past), so irregular verbs come out wrong and need later correction.

    VB Verb, base form
    VBD Verb, past tense
    VBG Verb, gerund or present participle
    VBN Verb, past participle
    VBP Verb, non-3rd person singular present
    VBZ Verb, 3rd person singular present
    '''
    verbs_indices = get_verb_index(example)
    pos_tag = nltk.pos_tag(nltk.word_tokenize(example['hypothesis']))
    l_roles = get_roles(example)
    l_roles_indices = get_roles_indices(example)

    # first annotation that has an Agent together with a Theme or a Patient
    target_roles_idx = -1
    for i, roles in enumerate(l_roles):
        if 'Agent' in roles and ('Theme' in roles or 'Patient' in roles):
            target_roles_idx = i
            break
    if target_roles_idx == -1:
        return {}

    sentence_list = [token['rawText'] for token in example['srl']['hypothesis']['tokens']]

    # grammatical number of the direct object, needed to pick the auxiliary
    roles = l_roles[target_roles_idx]
    object_role = 'Theme' if 'Theme' in roles else 'Patient'
    theme_idx = l_roles_indices[target_roles_idx][roles.index(object_role)]
    person = ''
    for j in range(theme_idx[0], theme_idx[1]):
        if sentence_list[j] == direct_object:
            tag = get_tag(sentence_list[j], pos_tag)
            if tag == '':
                return {}
            if tag in ['NNPS', 'NNS']:  # plural
                person = 'Plural'
                break
            elif tag in ['NNP', 'NN']:  # singular
                person = 'Singular'

    if person == '':
        return {}

    # tense of the verb (the first verb of the first annotation is used)
    if not verbs_indices or not verbs_indices[0]:
        return {}
    verb_text = sentence_list[verbs_indices[0][0]]
    tag = get_tag(verb_text, pos_tag)
    if tag in ['VBG']:
        return {'tense': '', 'person': person, 'Passive': ''}
    if tag in ['VBD', 'VBN']:
        tense = 'Past'
    elif tag in ['VBP', 'VBZ', 'VB']:
        tense = 'Present'
        if tag == 'VBZ':
            verb_text = _third_person_to_base(verb_text)
    else:
        return {}

    # passive verb phrase
    if tense == 'Present':
        auxiliary = 'are' if person == 'Plural' else 'is'
        # avoid a doubled "e" ("use" -> "used", not "useed")
        participle = verb_text + ('d' if verb_text.endswith('e') else 'ed')
    else:
        auxiliary = 'were' if person == 'Plural' else 'was'
        participle = verb_text
    passive = auxiliary + ' ' + participle

    return {'tense': tense, 'person': person, 'Passive': passive}

def isProperNoun(X, example):
    '''
    Tell whether the first word of the token list `X` is a proper noun, i.e. whether some
    WSD annotation of the hypothesis has that text and the 'PROPN' part of speech.
    '''
    return any(
        annotation['text'] == X[0] and annotation['pos'] in ['PROPN']
        for annotation in example['wsd']['hypothesis']
    )

def findDirectObject(example):
    '''
    Return the text of the first token of the hypothesis that spaCy labels with the
    "dobj" dependency, or None when there is no such token.
    '''
    parsed = spacy_model(example['hypothesis'])
    return next((token.text for token in parsed if token.dep_ == "dobj"), None)


def _passive_rewrite(example, generalize=False, version=None):
    '''
    Shared implementation of passive_form and passive_generic.

    Rewrites an "X verb Y" hypothesis as "Y <passive verb> by X". When `generalize` is True,
    X is replaced by "someone" if `version == 'Agent'`, otherwise Y is replaced by
    "something" / "some things". Returns the original hypothesis whenever the sentence
    does not fit the supported pattern.
    '''
    hypothesis = example['hypothesis']
    l_roles = get_roles(example)
    l_roles_indices = get_roles_indices(example)
    verbs_indices = get_verb_index(example)

    # find the right set of roles in the sentence ('Agent' together with 'Theme' or 'Patient')
    target_roles_idx = -1
    for idx, candidate_roles in enumerate(l_roles):
        if 'Agent' in candidate_roles and ('Theme' in candidate_roles or 'Patient' in candidate_roles):
            target_roles_idx = idx
            break
    if target_roles_idx == -1:
        return hypothesis

    roles_indices = l_roles_indices[target_roles_idx]
    roles = l_roles[target_roles_idx]

    # avoid verbs made of several words and sentences with several verbs: they can't be modified easily
    if len(verbs_indices) != 1 or len(verbs_indices[0]) != 1:
        return hypothesis

    if 'Negation' in roles:  # composed verb not detected by the check above
        return hypothesis

    verb_index = verbs_indices[0][0]

    # `first` is whichever of the two role spans starts earlier in the sentence
    first = roles_indices[roles.index('Agent')]
    second = roles_indices[roles.index('Theme' if 'Theme' in roles else 'Patient')]
    if first[0] > second[0]:
        first, second = second, first

    if not (first[1] <= verb_index < second[0]):  # the verb must sit between the two: 'X does Y' scheme
        return hypothesis

    sl = [token['rawText'] for token in example['srl']['hypothesis']['tokens']]

    direct_object = findDirectObject(example)
    if direct_object is None:  # only sentences with a direct object are handled
        return hypothesis

    p_t = get_verb_infos(example, direct_object)
    if p_t == {} or p_t['Passive'] == '':
        return hypothesis

    start = sl[:first[0]]
    X = sl[first[0]:verb_index]
    Y = sl[verb_index + 1:second[1]]
    end = sl[second[1]:]

    if generalize:
        if version == 'Agent':
            X = ['someone']
        elif p_t['person'] == 'Plural':
            Y = ['some things']
        else:
            Y = ['something']

    if not X or not Y:  # degenerate spans: nothing to swap
        return hypothesis

    V = [p_t['Passive'] + ' by']

    if len(start) == 0:
        # Upper-case only the first character: str.capitalize() would also lower-case
        # the rest of the token (e.g. "NBA" -> "Nba").
        Y[0] = Y[0][:1].upper() + Y[0][1:]
        if not isProperNoun(X, example):
            X[0] = X[0].lower()
    ret = ' '.join(start + Y + V + X + end)

    # joining the tokens puts a space before the final punctuation mark; remove it
    # unless the original sentence was written that way
    for mark in ('.', '?', '!'):
        if ret.endswith(' ' + mark):
            if not hypothesis.endswith(' ' + mark):
                ret = ret[:-2] + mark
            break

    if " 's" not in hypothesis and " 's" in ret:
        ret = ret.replace(" 's", "'s")

    # corrects the phrases that are wrong (irregular verb built in a bad way)
    return happy_tt.generate_text("grammar: " + ret, args=TTSettings(num_beams=5, min_length=1)).text


def passive_form(example):
    '''
    Turn the hypothesis from the active to the passive form.
    Only verbs made of a single word are handled (Present Simple, Past Simple, ...)
    and spaCy must find a direct object in the sentence.

    At the end a grammar model is used to correct errors made while generating the
    passive form, e.g. for irregular verbs:
    - "Tennis was avoided by Roger Federer entirely." -> "Tennis was avoided entirely by Roger Federer."
    - "An NBA record is holded by the Los Angeles Lakers." -> "An NBA record is held by the Los Angeles Lakers."

    Returns the passive sentence, or the original hypothesis when the transformation does not apply.
    '''
    return _passive_rewrite(example)


def passive_generic(example, version):
    '''
    Same as passive_form, but the Agent or the direct object is 'generalized'.
    `version` selects what is generalized: 'Agent' replaces the agent with "someone",
    any other value replaces the direct object with "something" / "some things".
    The label is set to 'Neutral' externally because the sentence is generalized.

    - "Tennis was avoided by Roger Federer entirely." -> "Tennis was avoided entirely by someone."
    - "An NBA record is holded by the Los Angeles Lakers." -> "Something is held by the Los Angeles Lakers."

    Returns the generalized passive sentence, or the original hypothesis when the
    transformation does not apply.
    '''
    return _passive_rewrite(example, generalize=True, version=version)

In [None]:
def test_passive():
    '''
    Manual smoke test: for every training example apply one randomly chosen passive
    transformation and print the pairs where the hypothesis changed.
    '''
    names = {0: 'passive_form', 1: 'generalization-Agent', 2: 'generalization-directObject'}
    for example in dataset['train']:
        pick = random.randint(0, 2)
        if pick == 0:
            result = passive_form(example)
        else:
            result = passive_generic(example, 'Agent' if pick == 1 else 'Direct object')
        if result == example['hypothesis']:
            continue
        print(names[pick] + ':')
        print('   ' + example['hypothesis'])
        print('   ' + result)
        print('#####')
#test_passive()

In [None]:
def check_for_roles_shuffle(s1, s2):
    '''
    Check that the two strings contain the same words with the same multiplicity
    (case-insensitive, commas ignored), i.e. that the shuffle did not drop or add any word.
    '''
    def bag_of_words(text):
        return Counter(word.lower() for word in nltk.word_tokenize(text) if word != ',')

    return bag_of_words(s1) == bag_of_words(s2)

def shuffle_roles(example):
    '''
    Move one role of the sentence to a different position.
    Only roles that do not affect the meaning of the sentence when moved are considered:
    the first 'Location', 'Time' or 'Cause' role of the first non-empty role group.

    The new position is drawn from a generator seeded with 42, so the result is
    deterministic and the global `random` state is left untouched.

    Returns the rearranged sentence, or the original hypothesis when the sentence is
    not suitable or the rearrangement would lose/add words.
    '''
    s = example['hypothesis']
    l_roles = get_roles(example)
    l_roles_indices = get_roles_indices(example)

    # the target is the first non-empty group of roles
    target_roles_idx = -1
    for j, roles in enumerate(l_roles):
        if len(roles) > 0:
            target_roles_idx = j
            break
    if target_roles_idx == -1:
        return s

    roles = l_roles[target_roles_idx]
    roles_indices = l_roles_indices[target_roles_idx]

    if len(roles_indices) < 3:  # roles not in the database or too few roles for a good result
        return s

    if not ('Location' in roles or 'Time' in roles or 'Cause' in roles):  # only these parts can be moved
        return s

    if ' - ' in s:  # only 3 sentences in all the db have ' - ' and they are not good for this transformation
        return s

    # (role, span) pairs in sentence-annotation order.
    # NOTE(review): an earlier version tried to interleave 'Verb' entries here, but its loop guard
    # (`len(verb_indices) < j`) could never be true, so verbs were never added. Enabling it would
    # also require verb entries to carry a span, since the code below reads entry[1][0].
    r_i = list(zip(roles, roles_indices))

    # pick the part to be shifted and take it out of the candidate positions
    to_move = next(entry for entry in r_i if entry[0] in ('Location', 'Time', 'Cause'))
    old_position = r_i.index(to_move)
    copy_r_i = r_i[:old_position] + r_i[old_position + 1:]

    # choose a new position for the sentence part (local generator: no global reseeding)
    rng = random.Random(42)
    x = rng.randint(0, len(copy_r_i))
    while x == old_position:
        x = rng.randint(0, len(copy_r_i))

    # tokens of the sentence
    sl = [word['rawText'] for word in example['srl']['hypothesis']['tokens']]

    # the first word needs to be lower-cased if it gets moved away from the start
    if sl and not isProperNoun(sl, example):
        sl[0] = sl[0].lower()

    # new starting position of the element to move: the start of the role currently at
    # slot x, or the last token when it goes after every remaining role
    if x < len(copy_r_i):
        new_start = copy_r_i[x][1][0]
    else:
        new_start = len(sl) - 1

    # create the new sentence
    text_to_move = ' '.join(sl[to_move[1][0]:to_move[1][1]])
    ret = ''
    for i, word in enumerate(sl):
        if i >= to_move[1][0] and i < to_move[1][1]:  # skip the text-to-move part
            continue
        if i == new_start:  # insert the moved part, surrounded by commas (except special cases)
            if i > 0 and ret[-2:] != ', ' and ret.endswith(' '):
                ret = ret[:-1] + ', '
            ret += text_to_move
            if word not in [',', '.', '?', '!', '-', ':']:
                ret += ', '
        if i < len(sl) - 1 and (sl[i + 1] not in [',', '.', '?', '!', ':'] or i + 1 == new_start):
            ret += word + ' '
        else:
            ret += word

    ret = ret.replace(' - ', '-')  # the modification incorrectly adds spaces before and after '-'

    if " 's" not in s and " 's" in ret:  # the modification incorrectly adds a space before the 's
        ret = ret.replace(" 's", "'s")

    # the modification incorrectly adds a space before . ? !
    for mark in ('.', '?', '!'):
        if ret.endswith(' ' + mark):
            if not s.endswith(' ' + mark):
                ret = ret[:-2] + mark
            break

    # Drop a leading comma left behind by the moved part *before* restoring the capital
    # letter, otherwise the sentence would start with a lower-case word.
    if ret.startswith(', '):
        ret = ret[2:]
    if ret and s[:1].isupper():
        ret = ret[0].upper() + ret[1:]

    if ret == s:
        return s

    # the sentence may be the same and only look different because of added ',' or ' '
    if ret.replace(',', '').replace(' ', '') == s.replace(',', '').replace(' ', ''):
        return s

    if not check_for_roles_shuffle(ret, s):
        return s
    return ret

In [None]:
def test_shuffle_roles():
    '''
    Manual smoke test: run shuffle_roles on every training example and print the
    pairs where the hypothesis changed.
    '''
    for example in dataset['train']:
        shuffled = shuffle_roles(example)
        if shuffled == example['hypothesis']:
            continue
        print('   ' + example['hypothesis'])
        print('   ' + shuffled)
        print('#####')
#test_shuffle_roles()

In [None]:
def _question_base_form(verb):
    '''Strip the 3rd-person-singular ending so the verb can follow "Does" ("has" -> "have", "goes" -> "go").'''
    if verb == 'has':
        return 'have'
    # "-es" is only a suffix after sibilants and "o"; elsewhere the "e" belongs to the stem.
    if verb.endswith(('ches', 'shes', 'sses', 'xes', 'zes', 'oes')):
        return verb[:-2]
    if verb.endswith('s'):
        return verb[:-1]
    return verb


def make_as_question(example):
    '''
    Convert the hypothesis to a yes/no question.

    Forms of "to be" are moved to the front ("Is X ...?"); other verbs get a
    "Do"/"Does"/"Did" auxiliary in front of the subject. The result is passed through the
    grammar-correction model to repair bad transformations (e.g. irregular verbs).

    Returns the question, or the original hypothesis when the sentence is not suitable
    (several verbs, no 'Agent'/'Theme' before the verb, negation, ' - ' in the text,
    unsupported verb tag).
    '''
    hypothesis = example['hypothesis']
    l_roles = get_roles(example)
    l_roles_indices = get_roles_indices(example)
    verbs_indices = get_verb_index(example)

    # avoid verbs made of several words and sentences with several verbs: they can't be modified easily;
    # accept the case of auxiliary verbs (ex: "Elvis was co-produced by a man." verbs_indices = [[], [1]])
    if len(verbs_indices) != 1 or len(verbs_indices[0]) != 1:
        if not (len(verbs_indices) == 2 and len(verbs_indices[0]) == 0 and len(verbs_indices[1]) == 1):
            return hypothesis

    group = 1 if len(verbs_indices) == 2 else 0
    verb_index = verbs_indices[group][0]
    roles = l_roles[group]
    roles_indices = l_roles_indices[group]

    if len(roles) == 0:
        return hypothesis

    if 'Agent' not in roles and 'Theme' not in roles:  # otherwise the transformation gives a meaningless sentence
        return hypothesis

    if 'Negation' in roles:  # difficult to deal with
        return hypothesis

    idx = roles_indices[roles.index('Agent' if 'Agent' in roles else 'Theme')]

    if idx[1] > verb_index:  # the element doing the action needs to be before the verb
        return hypothesis

    if ' - ' in hypothesis:  # such sentences are not handled
        return hypothesis

    # pos tag to recognize the verb
    pos_tag = nltk.pos_tag(nltk.word_tokenize(hypothesis))

    sentence_list = [token['rawText'] for token in example['srl']['hypothesis']['tokens']]

    # choose how the question starts, depending on the verb
    verb_text = sentence_list[verb_index]
    verb_tag = get_tag(verb_text, pos_tag)
    if verb_tag in ['VBD']:
        start = 'Did ' if verb_text not in ['was', 'were'] else verb_text.capitalize() + ' '
    elif verb_tag in ['VBP', 'VB']:
        if verb_text not in ["'m", 'are', 'am']:
            start = 'Do '
        elif verb_text == "'m":
            start = 'Am '
        else:
            start = verb_text.capitalize() + ' '
    elif verb_tag in ['VBZ']:
        if verb_text not in ['is']:
            verb_text = _question_base_form(verb_text)
            start = 'Does '
        else:
            start = verb_text.capitalize() + ' '
    else:
        return hypothesis

    # get agent text
    agent = ' '.join(sentence_list[idx[0]:idx[1]])
    rest = ' '.join(sentence_list[verb_index + 1:])

    # create question: a form of "to be" has already been moved to the front, so it must
    # not be repeated after the subject (this includes 'am' and "'m")
    if verb_text in ['is', 'are', 'was', 'were', 'am', "'m"]:
        ret = start + agent + ' ' + rest
    else:
        ret = start + agent + ' ' + verb_text + ' ' + rest

    # replace the final punctuation mark (and the space the token join put before it) with '?'
    for mark in ('.', '?', '!'):
        if ret.endswith(' ' + mark) and not hypothesis.endswith(' ' + mark):
            ret = ret[:-2] + '?'
            break
    else:
        ret += '?'

    ret = ret.replace(' - ', '-')  # the token join incorrectly adds spaces before and after '-'

    if " 's" not in hypothesis and " 's" in ret:  # the token join incorrectly adds a space before the 's
        ret = ret.replace(" 's", "'s")

    # apply grammar correction for bad transformations (irregular verbs)
    result = happy_tt.generate_text("grammar: " + ret, args=TTSettings(num_beams=5, min_length=1)).text

    return result

def question_answer_modify(example, new_sentence, change_function, POS, answer):
    '''
    Optionally augment a sentence that was already turned into a question and
    append a short answer to it.

    example: dataset element; its 'label' is read and it is forwarded to augment_sentence
    new_sentence: the question-form sentence to work on
    change_function: augmentation passed to augment_sentence, or None to skip augmentation
    POS: part of speech forwarded to augment_sentence
    answer: 'Yes' or 'No' to append ' Yes.' / ' No.'; any other value appends nothing

    Returns new_sentence unchanged when nothing could be applied: either the
    answer is 'No' on a NEUTRAL example, or the augmentation left the sentence
    as it was (in that case no answer is appended either).
    '''
    # A 'No' answer requires the label to be flipped, which is not possible for NEUTRAL.
    if answer == 'No' and example['label'] == 'NEUTRAL':
        return new_sentence

    suffix = {'Yes': ' Yes.', 'No': ' No.'}.get(answer, '')

    if change_function is None:
        question = new_sentence
    else:
        question = augment_sentence(example, new_sentence, change_function, POS)
        if question == new_sentence:
            # augmentation failed: hand back the untouched question, without an answer
            return question

    return question + suffix if suffix else question

In [None]:
def test_question():
    '''
    Manual smoke test for the question transformations.

    For every training example a mode is drawn at random: plain question,
    or question with a noun synonym and no answer / 'Yes' / 'No'. The original
    hypothesis and the transformed one are printed whenever they differ.
    Examples whose augmentation did not change the question are skipped.
    '''
    mode_names = {0: 'question', 1: 'question_augmented', 2:'question_augmented-yes', 3:'question_augmented-no'}
    # answer handed to question_answer_modify per mode; mode 0 is the plain question
    mode_answers = {1: '', 2: 'Yes', 3: 'No'}

    for example in dataset['train']:
        mode = random.randint(0, 3)
        question = make_as_question(example)

        if mode in mode_answers:
            augmented = question_answer_modify(example, question, get_synonym, 'NOUN', mode_answers[mode]) #ADJ
            if augmented == question:
                continue
            question = augmented

        if question == example['hypothesis']:
            continue
        print(mode_names[mode] + ':')
        print('   ' + example['hypothesis'])
        print('   ' + question)
        print('#####')
#test_question()

In [None]:
def flip_label(example):
    '''
    Swap the label of `example` in place: CONTRADICTION <-> ENTAILMENT.
    Any other label (e.g. NEUTRAL) is left untouched. Returns nothing.
    '''
    opposite = {'CONTRADICTION': 'ENTAILMENT', 'ENTAILMENT': 'CONTRADICTION'}
    current = example['label']
    if current in opposite:
        example['label'] = opposite[current]

In [None]:
def sum_subgroup(results, name):
    '''
    Total number of transformations counted so far for one group.

    results: dict mapping a group name to a dict of per-transformation counters
    name: the group whose counters are added up
    '''
    return sum(results[name].values())

In [None]:
# Per-transformation counters of written examples, grouped by transformation family.
# Every counter starts at 0; each group gets its own independent dict.
results = {
    group: dict.fromkeys(counter_names, 0)
    for group, counter_names in (
        ('passive', (
            'passive_normal',
            'passive_syn_adj',
            'passive_syn_noun',
            'passive_hype_noun',
            'passive_hypo_noun',
            'passive_agent_gen',
            'passive_directObject_gen',
        )),
        ('flip', (
            'flip_normal',
            'flip_syn_adj',
            'flip_syn_noun',
            'flip_hype_adj',
            'flip_hype_noun',
            'flip_anty',
        )),
        ('shuffled_roles', (
            'shuffled_roles_normal',
            'shuffled_roles_syn_adj',
            'shuffled_roles_syn_noun',
            'shuffled_roles_hype_noun',
        )),
        ('question', (
            'question_normal',
            'question_syn_adj_yes',
            'question_syn_noun_yes',
            'question_syn_adj_no',
            'question_syn_noun_no',
            'question_syn_noun',
            'question_syn_adj',
            'question_no',
            'question_yes',
        )),
        ('augment', (
            'synonym_adj',
            'synonym_noun',
            'synonym_verb',
            'synonym_adv',
            'hyponym_adj',
            'hyperonym_noun',
            'meronym_noun',
        )),
    )
}

not_modified = 0

dataset = dataset.shuffle(seed=42)
train_dataset = dataset['train']

# This part iterates over the elements of the dataset and applies the
# transformations in a fixed order (from the most difficult transformation to
# apply to the easiest). Every transformation group and every single
# transformation has a cap on how many examples it may produce.
# It takes a while because some functions use a model; a GPU speeds it up.


def _write_json_line(example, out_file):
    '''Append `example` to `out_file` as one JSON document followed by a newline.'''
    json.dump(example, out_file)
    out_file.write('\n')


def _set_neutral(example):
    '''Overwrite the label of `example` with NEUTRAL (in place).'''
    example['label'] = 'NEUTRAL'


def _augment(change_function, POS):
    '''Candidate producer: augment_sentence applied to the group's base sentence.'''
    return lambda example, base: augment_sentence(example, base, change_function, POS)


def _generic(role):
    '''Candidate producer: passive_generic for the given role (ignores the base sentence).'''
    return lambda example, base: passive_generic(example, role)


def _question(change_function, POS, answer):
    '''Candidate producer: question_answer_modify applied to the group's base sentence.'''
    return lambda example, base: question_answer_modify(example, base, change_function, POS, answer)


def _antonym(example, base):
    '''Candidate producer: flip_with_anty (ignores the base sentence).'''
    return flip_with_anty(example)


# What a candidate is compared against to decide that a step changed nothing.
_BASE = 'base'          # the sentence produced by the group's base transformation
_ORIGINAL = 'original'  # the untouched elem['hypothesis']

# One entry per transformation group, tried in this order:
#   (group name, cap for the whole group, base transformation or None, steps, fallback)
# step     = (counter name, cap, candidate producer, values meaning "unchanged", relabel or None)
# fallback = (counter name, relabel or None): writes the bare base sentence when no step applied;
#            None means the element falls through to the next group.
_TRANSFORMATION_GROUPS = [
    ('passive', 5000, passive_form, [
        ('passive_syn_adj', 300, _augment(get_synonym, 'ADJ'), (_BASE,), None),
        ('passive_syn_noun', 1200, _augment(get_synonym, 'NOUN'), (_BASE,), None),
        ('passive_agent_gen', 1000, _generic('Agent'), (_ORIGINAL,), _set_neutral),
        ('passive_directObject_gen', 1000, _generic('DirectObject'), (_ORIGINAL,), _set_neutral),
        ('passive_hype_noun', 300, _augment(get_hypernym, 'NOUN'), (_BASE,), None),
        ('passive_hypo_noun', 300, _augment(get_hyponym, 'NOUN'), (_BASE,), None),
    ], ('passive_normal', None)),

    ('shuffled_roles', 5000, shuffle_roles, [
        ('shuffled_roles_syn_adj', 500, _augment(get_synonym, 'ADJ'), (_BASE,), None),
        ('shuffled_roles_syn_noun', 1500, _augment(get_synonym, 'NOUN'), (_BASE,), None),
        ('shuffled_roles_hype_noun', 1000, _augment(get_hypernym, 'NOUN'), (_BASE,), None),
    ], ('shuffled_roles_normal', None)),

    ('flip', 19000, flip_verb, [
        ('flip_syn_adj', 1500, _augment(get_synonym, 'ADJ'), (_BASE,), flip_label),
        ('flip_syn_noun', 4500, _augment(get_synonym, 'NOUN'), (_BASE,), flip_label),
        # The label is kept for flip_with_anty (as in the original code).
        # The candidate is rejected when it equals either the flipped sentence or the
        # untouched hypothesis, so a failed transformation is never written and
        # counted as 'flip_anty' whichever of the two the helper returns on failure.
        ('flip_anty', 5500, _antonym, (_BASE, _ORIGINAL), None),
        ('flip_hype_adj', 300, _augment(get_hypernym, 'ADJ'), (_BASE,), flip_label),
        ('flip_hype_noun', 2500, _augment(get_hypernym, 'NOUN'), (_BASE,), flip_label),
    ],
     # The bare flipped sentence needs its label flipped exactly like the augmented
     # flipped variants above; writing it with the original label mislabels it.
     ('flip_normal', flip_label)),

    ('question', 18000, make_as_question, [
        ('question_syn_adj_no', 400, _question(get_synonym, 'ADJ', 'No'), (_BASE,), flip_label),
        ('question_syn_adj_yes', 400, _question(get_synonym, 'ADJ', 'Yes'), (_BASE,), None),
        ('question_syn_adj', 400, _question(get_synonym, 'ADJ', ''), (_BASE,), _set_neutral),
        ('question_syn_noun_no', 1000, _question(get_synonym, 'NOUN', 'No'), (_BASE,), flip_label),
        ('question_syn_noun_yes', 1000, _question(get_synonym, 'NOUN', 'Yes'), (_BASE,), None),
        ('question_syn_noun', 1000, _question(get_synonym, 'NOUN', ''), (_BASE,), _set_neutral),
        ('question_no', 1500, _question(None, '', 'No'), (_BASE,), flip_label),
        ('question_yes', 1500, _question(None, '', 'Yes'), (_BASE,), None),
    ], ('question_normal', _set_neutral)),

    # Plain word-level augmentations work directly on the original hypothesis.
    ('augment', 20000, None, [
        ('synonym_adj', 4000, _augment(get_synonym, 'ADJ'), (_ORIGINAL,), None),
        ('synonym_noun', 5000, _augment(get_synonym, 'NOUN'), (_ORIGINAL,), None),
        ('synonym_verb', 5000, _augment(get_synonym, 'VERB'), (_ORIGINAL,), None),
        ('synonym_adv', 2000, _augment(get_synonym, 'ADV'), (_ORIGINAL,), None),
        ('hyponym_adj', 2000, _augment(get_hyponym, 'ADJ'), (_ORIGINAL,), None),
        ('hyperonym_noun', 5000, _augment(get_hypernym, 'NOUN'), (_ORIGINAL,), None),
        ('meronym_noun', 2000, _augment(get_meronym, 'NOUN'), (_ORIGINAL,), None),
    ], None),
]


def _apply_group(example, out_file, group_name, group_cap, base_transform, steps, fallback):
    '''
    Try to transform `example` with one transformation group.

    On success the example is modified in place (hypothesis and possibly label),
    written to `out_file`, the matching counter in `results` is incremented and
    True is returned. Returns False when the group is full or not applicable,
    leaving `example` untouched.
    '''
    if sum_subgroup(results, group_name) >= group_cap:
        return False
    counters = results[group_name]

    original = example['hypothesis']
    if base_transform is None:
        base = original
    else:
        base = base_transform(example)
        if base == original:
            # the base transformation could not be applied to this sentence
            return False
    unchanged = {_BASE: base, _ORIGINAL: original}

    for counter, cap, produce, rejected, relabel in steps:
        if counters[counter] >= cap:
            continue
        candidate = produce(example, base)
        if any(candidate == unchanged[kind] for kind in rejected):
            continue
        example['hypothesis'] = candidate
        if relabel is not None:
            relabel(example)
        _write_json_line(example, out_file)
        counters[counter] += 1
        return True

    if fallback is None:
        return False
    # No capped step applied: keep the bare base transformation.
    counter, relabel = fallback
    example['hypothesis'] = base
    if relabel is not None:
        relabel(example)
    _write_json_line(example, out_file)
    counters[counter] += 1
    return True


with open('train_dataset.json', 'w') as f, open('errors.json', 'w') as f2:
    for i, elem in enumerate(train_dataset):
        if i % 1000 == 0:
            print(i)
        # any() stops at the first group that managed to transform the element
        if any(_apply_group(elem, f, *group) for group in _TRANSFORMATION_GROUPS):
            continue

        # Nothing could be applied. The element is logged to errors.json only:
        # the unmodified sentences are deliberately left out of train_dataset.json
        # to improve the performance of the model.
        not_modified += 1
        _write_json_line(elem, f2)

for x in results:
  print(x, results[x])
print('not modified', not_modified)

In [None]:
# Total number of generated examples for each transformation group.
for group_name in ('passive', 'flip', 'shuffled_roles', 'question', 'augment'):
    print(sum_subgroup(results, group_name))