In [40]:
import pandas as pd

df = pd.read_csv("COCO_test_sentences.csv")

In [41]:
#drop rows where the sentences in 'raw' column have more than 10 words
df = df[df['raw'].str.split().str.len() <= 8]

In [42]:
df = df.sort_values(['imgid','sentid'],ascending=[True,True])

In [43]:
df_reduced = {
    "imgid":[],
    "sentid":[],
    "cocoid":[],
    "raw":[]
}

imgid_done = set()

for i,row in df.iterrows():
  if row["imgid"] not in imgid_done:
    imgid_done.add(row["imgid"])
    df_reduced["imgid"].append(row["imgid"])
    df_reduced["sentid"].append(row["sentid"])
    df_reduced["cocoid"].append(row["cocoid"])
    df_reduced["raw"].append(row["raw"])


df = pd.DataFrame(df_reduced)

In [44]:
import numpy as np
len(df), np.mean(df['raw'].str.split().str.len())

(2948, 7.945386702849389)

In [45]:
from transformers import pipeline

classifier = pipeline("fill-mask")

No model was supplied, defaulted to distilroberta-base and revision ec58a5b (https://huggingface.co/distilroberta-base).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [46]:
classifier("A <mask> in a machine.")

[{'score': 0.1332135647535324,
  'token': 37204,
  'token_str': ' cog',
  'sequence': 'A cog in a machine.'},
 {'score': 0.032206032425165176,
  'token': 19012,
  'token_str': ' needle',
  'sequence': 'A needle in a machine.'},
 {'score': 0.022915644571185112,
  'token': 6793,
  'token_str': ' virus',
  'sequence': 'A virus in a machine.'},
 {'score': 0.018956581130623817,
  'token': 4683,
  'token_str': ' hole',
  'sequence': 'A hole in a machine.'},
 {'score': 0.014165378175675869,
  'token': 14368,
  'token_str': ' dent',
  'sequence': 'A dent in a machine.'}]

In [47]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline

model_name = "QCRI/bert-base-multilingual-cased-pos-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)


Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [48]:
pipeline = TokenClassificationPipeline(model=model, tokenizer=tokenizer)
outputs = pipeline(["A man with a red helmet on a small moped on a dirt road.","A man with a red helmet on a small moped on a dirt road."])
print(outputs)

[[{'entity': 'DT', 'score': 0.99982494, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, {'entity': 'NN', 'score': 0.99958414, 'index': 2, 'word': 'man', 'start': 2, 'end': 5}, {'entity': 'IN', 'score': 0.9995628, 'index': 3, 'word': 'with', 'start': 6, 'end': 10}, {'entity': 'DT', 'score': 0.9998672, 'index': 4, 'word': 'a', 'start': 11, 'end': 12}, {'entity': 'JJ', 'score': 0.99914205, 'index': 5, 'word': 'red', 'start': 13, 'end': 16}, {'entity': 'NN', 'score': 0.99970347, 'index': 6, 'word': 'hel', 'start': 17, 'end': 20}, {'entity': 'NN', 'score': 0.9995078, 'index': 7, 'word': '##met', 'start': 20, 'end': 23}, {'entity': 'IN', 'score': 0.9998085, 'index': 8, 'word': 'on', 'start': 24, 'end': 26}, {'entity': 'DT', 'score': 0.999866, 'index': 9, 'word': 'a', 'start': 27, 'end': 28}, {'entity': 'JJ', 'score': 0.9992562, 'index': 10, 'word': 'small', 'start': 29, 'end': 34}, {'entity': 'NN', 'score': 0.75575167, 'index': 11, 'word': 'mo', 'start': 35, 'end': 37}, {'entity': 'VBD', 'sc

In [49]:
import datasets
ds = datasets.Dataset.from_pandas(df)

In [50]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [51]:
from nltk.corpus import wordnet

def is_noun(word):
    synsets = wordnet.synsets(word.lower().strip())
    return any([synset.pos() == wordnet.NOUN for synset in synsets])


def are_synonyms(word1, word2):

    synsets1 = wordnet.synsets(word1.lower().strip())
    synsets2 = wordnet.synsets(word2.lower().strip())
    return any(set(synset1.lemma_names()).intersection(synset2.lemma_names()) for synset1 in synsets1 for synset2 in synsets2)

def is_subtype(word_subtype, word_type):
    synsets_subtype = wordnet.synsets(word_subtype.lower().strip())
    synsets_type = wordnet.synsets(word_type.lower().strip())

    for syn_subtype in synsets_subtype:
        hypernyms = set([i for i in syn_subtype.closure(lambda s:s.hypernyms())])

        for syn_type in synsets_type:
            if syn_type in hypernyms:
                return True

    return False


word1 = "happy"
word2 = "joyful"

print(are_synonyms(word1, word2))

False


In [52]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

df_new = {
    "imgid":[],
    "sentid":[],
    "cocoid":[],
    "base_sentence":[],
    "new_sentence":[]
}

skip_words = set([
    "man",
    "woman",
    "boy",
    "girl",
    "person",
    "people",
    "group",
    "pair",
    "couple",
    "team",
    "someone",
    "individual",
    "somebody",
    "child",
    "baby",
    "squad",
    "grouping",
    "bunch",
    "crowd",
    "adult",
    "guy",
    "image",
    "photo",
    "picture",
    "photograph",
    "scene",
    "herd",
    "flock",
    "pair"
])


for data in tqdm(ds):

  base_sentence = data["raw"]
  POS_result = pipeline(base_sentence)
  successful_NN = False

  for i,result in enumerate(POS_result):
    if successful_NN:
      break
    if result["entity"]=="NN" and result["word"].lower().strip() not in skip_words and not result["word"].startswith("##"):
      candidate_word = result["word"]

      k=1
      while True:
        if i+k >= len(POS_result):
          break
        if not POS_result[i+k]["word"].startswith("##"):
          break
        else:
          candidate_word+=POS_result[i+k]["word"][len("##"):]
          k+=1

      #skip if the candidate word is in the base sentence more than once
      if base_sentence.count(candidate_word) > 1:
        continue

      if len(candidate_word) < 3:
        continue

      if not is_noun(candidate_word):
        continue

      masked_sentence = base_sentence.replace(candidate_word, "<mask>")

      fill_mask_result = classifier(masked_sentence)

      new_word = None

      for result in fill_mask_result:
        new_word = result["token_str"]

        if len(new_word) < 3:
          new_word = None
          break

        if not is_noun(new_word):
          new_word = None
          continue

        if new_word in base_sentence:
          new_word = None
          continue

        if are_synonyms(new_word, candidate_word):
          new_word = None
          continue

        if is_subtype(new_word, candidate_word) or is_subtype(candidate_word, new_word):
          new_word = None
          continue
      
      if new_word is None:
        continue

      replaced_sentence = fill_mask_result[0]["sequence"]




      if base_sentence != replaced_sentence:
        df_new["imgid"].append(data["imgid"])
        df_new["sentid"].append( data["sentid"])
        df_new["cocoid"].append( data["cocoid"])
        df_new["base_sentence"].append(base_sentence)
        df_new["new_sentence"].append( replaced_sentence)

      successful_NN = True






100%|██████████| 2948/2948 [06:14<00:00,  7.87it/s]


In [53]:
df2 = pd.DataFrame(df_new)
len(df2)

2093

In [54]:
#sample 500 with seed 42

df2 = df2.sample(500,random_state=42)

In [55]:
df2.to_csv("COCO_test_pairs.csv", index=False)

df2 = pd.read_csv("COCO_test_pairs.csv")

In [58]:
with open("COCO_test_pairs_for_ADV.txt","w") as f:
    for i, row in df2.iterrows():
        print("""("{}", "{}"),""".format(row["base_sentence"], row["new_sentence"]))
        print("""("{}", "{}"),""".format(row["new_sentence"], row["base_sentence"]))

        f.write("""("{}", "{}"),\n""".format(row["base_sentence"], row["new_sentence"]))
        f.write("""("{}", "{}"),\n""".format(row["new_sentence"], row["base_sentence"]))

        if (i+1) % 5 == 0:
            print(i)
            f.write("{}\n".format(i))
          

("An ornate vintage clock with several faces. ", "An ornate quartz clock with several faces. "),
("An ornate quartz clock with several faces. ", "An ornate vintage clock with several faces. "),
("A elephant that is standing in the dirt.", "A tree that is standing in the dirt."),
("A tree that is standing in the dirt.", "A elephant that is standing in the dirt."),
("A clock mounted with chains outside a building", "A protester mounted with chains outside a building"),
("A protester mounted with chains outside a building", "A clock mounted with chains outside a building"),
("a newly wed couple celebrating with a toast", "a newly wed couple celebrating with a baby"),
("a newly wed couple celebrating with a baby", "a newly wed couple celebrating with a toast"),
("Two people are snowboarding down a hill fast", "Two people are snowboarding down a mountain fast"),
("Two people are snowboarding down a mountain fast", "Two people are snowboarding down a hill fast"),
4
("a worn wooden bench bene

In [57]:
for_csv = {
    "input_text":[],
    "target_text":[]
}

for i, row in df2.iterrows():
    for_csv["input_text"].append(row["base_sentence"])
    for_csv["target_text"].append(row["new_sentence"])

    for_csv["input_text"].append(row["new_sentence"])
    for_csv["target_text"].append(row["base_sentence"])


df3 = pd.DataFrame(for_csv)
df3.to_csv("COCO_test_pairs.csv", index=False)