In [134]:
from pycocotools.coco import COCO
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from collections import Counter
import nltk
import json
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import string
tqdm.pandas()

In [135]:
# Location of the COCO captions annotation file.
path = "../data/2017/captions.json"

# Read the annotation JSON inside a context manager so the file handle is
# closed as soon as parsing finishes instead of being left to the garbage
# collector.
with open(path, 'r') as annotations_file:
    data = json.load(annotations_file)

# One row per entry in the "images" list of the annotation file.
df = pd.DataFrame.from_records(data['images'])
# Drop the metadata columns that are not needed; drop() returns a new frame,
# so the result is rebound rather than mutating in place.
df = df.drop(columns=['file_name', 'license', 'date_captured', 'flickr_url'])
print(len(df))
df.head()

5000


Unnamed: 0,coco_url,height,width,id
0,http://images.cocodataset.org/val2017/00000039...,427,640,397133
1,http://images.cocodataset.org/val2017/00000003...,230,352,37777
2,http://images.cocodataset.org/val2017/00000025...,428,640,252219
3,http://images.cocodataset.org/val2017/00000008...,480,640,87038
4,http://images.cocodataset.org/val2017/00000017...,388,640,174482


In [136]:
def get_captions(image_id):
    """Return every caption annotated for ``image_id`` as one string.

    The annotation ids for the image are looked up in the module-level
    ``coco_captions`` index, the matching annotations are loaded, and their
    ``'caption'`` fields are concatenated with single spaces.
    """
    annotation_ids = coco_captions.getAnnIds(imgIds=image_id)
    annotations = coco_captions.loadAnns(annotation_ids)
    return ' '.join([annotation['caption'] for annotation in annotations])

In [137]:
# Build the COCO annotation index from the annotation file at `path`.
coco_captions = COCO(path)
# get_captions takes a single image id, so it is handed to progress_apply
# directly; each id in the 'id' column is mapped to its joined captions.
df['captions'] = df['id'].progress_apply(get_captions)

loading annotations into memory...
Done (t=0.06s)
creating index...
index created!


HBox(children=(IntProgress(value=0, max=5000), HTML(value='')))




In [138]:
# Preview the first rows, now including the 'captions' column.
df.head()

Unnamed: 0,coco_url,height,width,id,captions
0,http://images.cocodataset.org/val2017/00000039...,427,640,397133,A man is in a kitchen making pizzas. Man in ap...
1,http://images.cocodataset.org/val2017/00000003...,230,352,37777,The dining table near the kitchen has a bowl o...
2,http://images.cocodataset.org/val2017/00000025...,428,640,252219,a person with a shopping cart on a city street...
3,http://images.cocodataset.org/val2017/00000008...,480,640,87038,A person on a skateboard and bike at a skate p...
4,http://images.cocodataset.org/val2017/00000017...,388,640,174482,a blue bike parked on a side walk A bicycle i...


In [156]:
# Aspect-ratio filter: keep only rows whose height is strictly below 80% of
# the width (cond1) and strictly above 60% of the width (cond2).
cond1 = df['height'].lt(0.8 * df['width'])
cond2 = df['height'].gt(0.6 * df['width'])
df = df.loc[cond1 & cond2]

In [157]:
# Number of rows remaining after the aspect-ratio filter, then a preview.
print(df.shape[0])
df.head()

3242


Unnamed: 0,coco_url,height,width,id,captions
0,http://images.cocodataset.org/val2017/00000039...,427,640,397133,A man is in a kitchen making pizzas. Man in ap...
1,http://images.cocodataset.org/val2017/00000003...,230,352,37777,The dining table near the kitchen has a bowl o...
2,http://images.cocodataset.org/val2017/00000025...,428,640,252219,a person with a shopping cart on a city street...
3,http://images.cocodataset.org/val2017/00000008...,480,640,87038,A person on a skateboard and bike at a skate p...
4,http://images.cocodataset.org/val2017/00000017...,388,640,174482,a blue bike parked on a side walk A bicycle i...


In [158]:
# Lowercase every caption string through the vectorised .str accessor.
df['captions'] = df['captions'].str.lower()

In [159]:
# Preview the captions after lowercasing.
df.head()

Unnamed: 0,coco_url,height,width,id,captions
0,http://images.cocodataset.org/val2017/00000039...,427,640,397133,a man is in a kitchen making pizzas. man in ap...
1,http://images.cocodataset.org/val2017/00000003...,230,352,37777,the dining table near the kitchen has a bowl o...
2,http://images.cocodataset.org/val2017/00000025...,428,640,252219,a person with a shopping cart on a city street...
3,http://images.cocodataset.org/val2017/00000008...,480,640,87038,a person on a skateboard and bike at a skate p...
4,http://images.cocodataset.org/val2017/00000017...,388,640,174482,a blue bike parked on a side walk a bicycle i...


In [160]:
# Deletion table covering every character in string.punctuation. Built once
# here rather than on every call, since the function is applied per row.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed outright, not replaced by a space, so a hyphenated
    or apostrophised word collapses into one token ("side-walk" becomes
    "sidewalk"). All other characters, including whitespace, are unchanged.

    Args:
        text: The string to clean.

    Returns:
        A new string without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Strip punctuation from every caption; the function is passed directly
# because it already accepts a single string.
df['captions'] = df['captions'].progress_apply(remove_punctuation)

HBox(children=(IntProgress(value=0, max=3242), HTML(value='')))




In [161]:
# Preview the captions after punctuation removal.
df.head()

Unnamed: 0,coco_url,height,width,id,captions
0,http://images.cocodataset.org/val2017/00000039...,427,640,397133,a man is in a kitchen making pizzas man in apr...
1,http://images.cocodataset.org/val2017/00000003...,230,352,37777,the dining table near the kitchen has a bowl o...
2,http://images.cocodataset.org/val2017/00000025...,428,640,252219,a person with a shopping cart on a city street...
3,http://images.cocodataset.org/val2017/00000008...,480,640,87038,a person on a skateboard and bike at a skate p...
4,http://images.cocodataset.org/val2017/00000017...,388,640,174482,a blue bike parked on a side walk a bicycle i...


In [162]:
# NLTK's English stop-word list, held as a set for membership tests.
stop_words = set(stopwords.words('english'))


def remove_stop_words(text):
    """Return ``text`` with every whitespace-separated token in ``stop_words`` removed.

    ``text`` is converted with ``str()`` first, split on runs of whitespace,
    and the surviving tokens are re-joined with single spaces. Matching is
    exact, so tokens are only dropped if they appear in ``stop_words`` with
    the same casing.
    """
    kept_tokens = []
    for token in str(text).split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

# Drop stop words from every caption; the function is passed directly
# because it already accepts a single value.
df['captions'] = df['captions'].progress_apply(remove_stop_words)

HBox(children=(IntProgress(value=0, max=3242), HTML(value='')))




In [163]:
# Preview the captions after stop-word removal.
df.head()

Unnamed: 0,coco_url,height,width,id,captions
0,http://images.cocodataset.org/val2017/00000039...,427,640,397133,man kitchen making pizzas man apron standing f...
1,http://images.cocodataset.org/val2017/00000003...,230,352,37777,dining table near kitchen bowl fruit small kit...
2,http://images.cocodataset.org/val2017/00000025...,428,640,252219,person shopping cart city street city dwellers...
3,http://images.cocodataset.org/val2017/00000008...,480,640,87038,person skateboard bike skate park man skateboa...
4,http://images.cocodataset.org/val2017/00000017...,388,640,174482,blue bike parked side walk bicycle chained fix...


In [183]:
import matplotlib.pyplot as plt
%matplotlib inline
d = Counter(df['captions'].apply(lambda x: len(x.split(' '))))
sorted_keys = sorted(d)
for k in sorted(d):
    print(k, d[k])

19 4
20 10
21 23
22 52
23 92
24 168
25 239
26 291
27 343
28 348
29 352
30 300
31 288
32 200
33 155
34 107
35 74
36 62
37 44
38 32
39 18
40 14
41 7
42 3
43 6
44 1
45 5
46 2
50 1
55 1


In [164]:
# Write the processed frame to CSV; index=False leaves the row index out of the file.
df.to_csv('../data/2017/captions.csv', index=False)