In [1]:
from tqdm import tqdm
import json
import os
os.environ["MODEL_DIR"] = '../model'
import nlpaug.augmenter.word as naw
from multiprocessing import Pool

IMPORTED SUCCESSFULLY!


In [13]:
# Two contextual word-embedding augmenters backed by 'bert-base-uncased';
# they differ only in the requested action ('insert' vs 'substitute').
ins_aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action='insert')
sub_aug = naw.ContextualWordEmbsAug(
    model_path='bert-base-uncased', action='substitute')
# Synonym augmenter reading the PPDB resource from a machine-specific absolute path.
# In the cells shown here it is only used for the side-by-side demo, not by augment_dataset().
sub_aug_syn = naw.SynonymAug(aug_src='ppdb', model_path='/mnt/disks/disk1/nlpaug/ppdb/ppdb-2.0-tldr')

In [20]:
# Qualitative check: request 20 variants of one caption from each substitution
# augmenter and print them, separated by a '---' line, for eyeballing.
text = 'woman taking a picture of someone standing behind a sculpture and a child pushing another woman towards the sculpture'
# Variants from the BERT-based substitute augmenter.
for sentence in sub_aug.augment(text, n = 20):
    print (sentence)
print ('---')
# Variants from the PPDB synonym augmenter.
for sentence in sub_aug_syn.augment(text, n = 20):
    print (sentence)

woman taking the picture of someone standing behind the sculpture and a man pushing another woman towards us .
woman taking the picture of someone standing in one car and a child pushing another woman towards the .
now taking a view of joseph standing behind a table and another child pushing another woman towards the sculpture
photo - a picture of someone standing behind a sculpture and his child pushing a woman around the sculpture
woman - a picture of someone walking behind the sculpture and one child pushing another woman towards another sculpture
woman taking the picture ; someone standing over a sculpture and a child pushing another girl towards a sculpture
woman had a picture of someone standing behind a sculpture and a man following another woman into the .
woman drawing a picture of someone sitting behind a sculpture ... one child pushing another woman towards a sculpture
woman taking the picture of someone standing behind a sculpture . a band . another woman around the sculptu

In [30]:
def get_max_sent_id(dataset):
    """Return the largest ``sentid`` present in ``dataset``.

    Args:
        dataset: Mapping with an ``'images'`` list. Each image is a mapping
            with a ``'sentences'`` list whose items carry a ``'sentid'`` value.

    Returns:
        The highest ``sentid`` found, or ``-1`` when the dataset contains no
        sentences (``-1`` is also the floor of the comparison).
    """
    max_sent_id = -1
    # Iterate the argument itself rather than a notebook-level global, so the
    # result always describes the dataset that was actually passed in.
    for image in dataset['images']:
        for sentence in image['sentences']:
            max_sent_id = max(max_sent_id, sentence['sentid'])
    return max_sent_id


def augment_sentence(sentence, ins_aug, sub_aug, n_variants=5):
    """Produce augmented text variants of a single caption.

    The caption text is rebuilt by joining ``sentence['tokens']`` with single
    spaces, then passed to each augmenter in turn (``ins_aug`` first, then
    ``sub_aug``).

    Args:
        sentence: Mapping with a ``'tokens'`` list of strings.
        ins_aug: Object exposing ``augment(text, n=...)``.
        sub_aug: Object exposing ``augment(text, n=...)``.
        n_variants: Number of variants requested from each augmenter.
            Defaults to 5, the value previously hard-coded.

    Returns:
        A new list holding the results of ``ins_aug`` followed by the results
        of ``sub_aug``.
    """
    text = ' '.join(sentence['tokens'])
    all_variants = []
    for augmenter in (ins_aug, sub_aug):
        produced = augmenter.augment(text, n=n_variants)
        # NOTE(review): an augmenter may hand back a bare string instead of a
        # list (e.g. for a single result) - wrap it so it is not extended
        # character by character. Confirm against the installed nlpaug version.
        if isinstance(produced, str):
            produced = [produced]
        # Extend our own list; the list owned by the augmenter is left untouched.
        all_variants.extend(produced)
    return all_variants


def construct_dataset_sentence(sentence, base_sentence, max_id):
    """Turn ``base_sentence`` into the dataset record for an augmented caption.

    ``base_sentence`` is updated in place and the same object is returned, so
    callers that want to keep the source record intact must pass a copy.

    Args:
        sentence: The augmented caption text.
        base_sentence: Mapping to overwrite; keys other than the three set
            here are left as they are.
        max_id: Highest sentence id handed out so far.

    Returns:
        ``base_sentence`` with ``'tokens'`` set to the whitespace-split text,
        ``'raw'`` set to the text itself and ``'sentid'`` set to ``max_id + 1``.
    """
    base_sentence.update(
        tokens=sentence.split(),
        raw=sentence,
        sentid=max_id + 1,
    )
    return base_sentence


def augment_dataset(dataset, ins_aug, sub_aug):
    """Add augmented captions to every image of ``dataset``, in place.

    For each image, every existing sentence is kept and followed directly by
    the variants that ``augment_sentence`` produces for it. Each variant is
    stored as a modified shallow copy of its source sentence and receives the
    next unused sentence id, counting up from ``get_max_sent_id(dataset)``.
    The image's ``'sentences'`` and ``'sentids'`` entries are then replaced
    with the extended lists.

    Args:
        dataset: Mapping with an ``'images'`` list; modified in place.
        ins_aug: Augmenter forwarded to ``augment_sentence``.
        sub_aug: Augmenter forwarded to ``augment_sentence``.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    last_used_id = get_max_sent_id(dataset)

    for image in tqdm(dataset['images']):
        expanded = []
        for original in image['sentences']:
            expanded.append(original)
            for variant_text in augment_sentence(original, ins_aug, sub_aug):
                # The copy keeps the source sentence untouched while the new
                # record inherits whatever other keys it carries.
                record = construct_dataset_sentence(variant_text, original.copy(), last_used_id)
                expanded.append(record)
                last_used_id += 1
        image['sentences'] = expanded
        # The id list mirrors the sentence list one-to-one, in the same order.
        image['sentids'] = [entry['sentid'] for entry in expanded]
    return dataset

In [31]:
# Load the caption dataset from a machine-specific absolute path.
captions = {}
# Name the encoding explicitly: without it open() falls back to the locale's
# default, which is not guaranteed to be UTF-8 on every machine.
with open('/mnt/disks/disk1/COCO/dataset_coco.json', encoding='utf-8') as f:
    captions = json.load(f)

In [29]:
# Augment the full caption set. augment_dataset() edits its argument in place and
# returns that same object, so `coco_new` and `captions` refer to one dataset.
# The captured output of this cell ends in a KeyboardInterrupt after ~50 images.
coco_new = augment_dataset(captions, ins_aug, sub_aug)



  0%|          | 0/123287 [00:00<?, ?it/s][A[A

  0%|          | 1/123287 [04:36<9456:26:51, 276.13s/it][A[A

  0%|          | 17/123287 [04:55<6630:47:10, 193.65s/it][A[A

  0%|          | 23/123287 [04:57<4644:45:47, 135.65s/it][A[A

  0%|          | 27/123287 [06:02<3418:05:55, 99.83s/it] [A[A

  0%|          | 31/123287 [06:06<2404:36:47, 70.23s/it][A[A

  0%|          | 32/123287 [06:38<2004:55:03, 58.56s/it][A[A

  0%|          | 33/123287 [06:44<1465:27:13, 42.80s/it][A[A

  0%|          | 34/123287 [06:48<1069:23:19, 31.23s/it][A[A

  0%|          | 40/123287 [06:59<766:59:23, 22.40s/it] [A[A

  0%|          | 41/123287 [07:13<686:47:05, 20.06s/it][A[A

  0%|          | 43/123287 [08:14<793:54:52, 23.19s/it][A[A

  0%|          | 46/123287 [08:23<586:47:12, 17.14s/it][A[A

  0%|          | 47/123287 [08:42<602:50:27, 17.61s/it][A[A

  0%|          | 49/123287 [08:43<428:44:59, 12.52s/it][A[A

  0%|          | 50/123287 [08:53<400:07:44, 11.69s/it

KeyboardInterrupt: 

In [32]:
# 32-image subset for a quick trial run of the augmentation.
# `captions.copy()` is shallow and a plain slice would still hold the very same
# image dicts as `captions`; augment_dataset() reassigns keys on each image, so
# every image dict is copied here to leave the full dataset untouched and to keep
# this cell safe to re-run.
mini_coco = captions.copy()
mini_coco['images'] = [image.copy() for image in mini_coco['images'][:32]]

In [33]:
# Trial run on the 32-image subset. augment_dataset() returns the object it was
# given, so `mini_coco_new` is the same object as `mini_coco`.
mini_coco_new = augment_dataset(mini_coco, ins_aug, sub_aug)




  0%|          | 0/32 [00:00<?, ?it/s][A[A[A


  3%|▎         | 1/32 [00:08<04:36,  8.91s/it][A[A[A


  6%|▋         | 2/32 [00:14<03:55,  7.85s/it][A[A[A


  9%|▉         | 3/32 [00:19<03:26,  7.13s/it][A[A[A


 12%|█▎        | 4/32 [00:23<02:52,  6.15s/it][A[A[A


 16%|█▌        | 5/32 [00:27<02:30,  5.58s/it][A[A[A


 19%|█▉        | 6/32 [00:33<02:23,  5.51s/it][A[A[A


 22%|██▏       | 7/32 [00:37<02:05,  5.03s/it][A[A[A


 25%|██▌       | 8/32 [00:42<02:04,  5.19s/it][A[A[A


 28%|██▊       | 9/32 [00:47<01:53,  4.93s/it][A[A[A


 31%|███▏      | 10/32 [00:50<01:41,  4.63s/it][A[A[A


 34%|███▍      | 11/32 [00:55<01:35,  4.53s/it][A[A[A


 38%|███▊      | 12/32 [00:59<01:31,  4.59s/it][A[A[A


 41%|████      | 13/32 [01:03<01:23,  4.38s/it][A[A[A


 44%|████▍     | 14/32 [01:07<01:17,  4.28s/it][A[A[A


 47%|████▋     | 15/32 [01:12<01:15,  4.46s/it][A[A[A


 50%|█████     | 16/32 [01:16<01:08,  4.27s/it][A[A[A


 53%|█████▎ 