# Loading the dataset

In [1]:
import pandas as pd
import numpy as np
import string
from torchvision.datasets import CocoCaptions
from torchvision.transforms import transforms

In [2]:
# Define paths for datasets
data_type = 'train2014' # or 'val2014' or 'test2014'
captions_file = f'archive/captions/annotations/captions_{data_type}.json'
image_dir = f'archive/images/{data_type}'

# Apply a series of Augmentations to data
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

dataset = CocoCaptions(root=image_dir, annFile=captions_file, transform=transform)

loading annotations into memory...
Done (t=2.13s)
creating index...
index created!


In [3]:
data = []
for idx, (img, captions) in enumerate(dataset):
    for caption in captions:
        data.append({'caption' : caption})
        Number_of_entries = 20
        if(idx == Number_of_entries):
            break
    break

In [4]:
dataset

Dataset CocoCaptions
    Number of datapoints: 82783
    Root location: archive/images/train2014
    StandardTransform
Transform: Compose(
               Resize(size=(224, 224), interpolation=bilinear, max_size=None, antialias=warn)
               ToTensor()
           )

In [5]:
dataset[0] # first image, returns a tuple (tensor, list)

(tensor([[[0.0078, 0.0039, 0.0039,  ..., 0.5608, 0.5412, 0.5255],
          [0.0039, 0.0039, 0.0039,  ..., 0.5647, 0.5569, 0.5373],
          [0.0000, 0.0000, 0.0039,  ..., 0.5647, 0.5529, 0.5412],
          ...,
          [0.0314, 0.0588, 0.0902,  ..., 0.2314, 0.2588, 0.1373],
          [0.0235, 0.0510, 0.0824,  ..., 0.1686, 0.0824, 0.0275],
          [0.0235, 0.0392, 0.0627,  ..., 0.0392, 0.0196, 0.0196]],
 
         [[0.0863, 0.0863, 0.0902,  ..., 0.6980, 0.6863, 0.6706],
          [0.0824, 0.0863, 0.0941,  ..., 0.7020, 0.6980, 0.6824],
          [0.0902, 0.0980, 0.1020,  ..., 0.7098, 0.7020, 0.6941],
          ...,
          [0.0078, 0.0431, 0.0588,  ..., 0.3333, 0.3765, 0.2235],
          [0.0039, 0.0275, 0.0588,  ..., 0.2431, 0.1451, 0.0588],
          [0.0039, 0.0157, 0.0392,  ..., 0.0863, 0.0431, 0.0196]],
 
         [[0.4314, 0.4431, 0.4549,  ..., 0.8000, 0.7882, 0.7725],
          [0.4353, 0.4471, 0.4549,  ..., 0.8039, 0.8000, 0.7843],
          [0.4431, 0.4471, 0.4549,  ...,

In [6]:
type(dataset)

torchvision.datasets.coco.CocoCaptions

In [7]:
dataset[0][1] # returns list of captions for the first image

['Closeup of bins of food that include broccoli and bread.',
 'A meal is presented in brightly colored plastic trays.',
 'there are containers filled with different kinds of foods',
 'Colorful dishes holding meat, vegetables, fruit, and bread.',
 'A bunch of trays that have different food.']

In [8]:
dataset[0][1][2]

'there are containers filled with different kinds of foods'

In [None]:
# creating the master list:

master_list = []
translator = str.maketrans('', '', string.punctuation)
for i in range(len(dataset)):
    current_caption = dataset[i][1]
    for k in range(len(current_caption)):
        current_string = current_caption[k]
        current_string = current_string.translate(translator) # remove punctuation symbols from current_string
        master_list.extend(current_string.split(" "))

In [13]:
master_list

['Closeup',
 'of',
 'bins',
 'of',
 'food',
 'that',
 'include',
 'broccoli',
 'and',
 'bread',
 'A',
 'meal',
 'is',
 'presented',
 'in',
 'brightly',
 'colored',
 'plastic',
 'trays',
 'there',
 'are',
 'containers',
 'filled',
 'with',
 'different',
 'kinds',
 'of',
 'foods',
 'Colorful',
 'dishes',
 'holding',
 'meat',
 'vegetables',
 'fruit',
 'and',
 'bread',
 'A',
 'bunch',
 'of',
 'trays',
 'that',
 'have',
 'different',
 'food',
 'A',
 'giraffe',
 'eating',
 'food',
 'from',
 'the',
 'top',
 'of',
 'the',
 'tree',
 'A',
 'giraffe',
 'standing',
 'up',
 'nearby',
 'a',
 'tree',
 '',
 'A',
 'giraffe',
 'mother',
 'with',
 'its',
 'baby',
 'in',
 'the',
 'forest',
 'Two',
 'giraffes',
 'standing',
 'in',
 'a',
 'tree',
 'filled',
 'area',
 'A',
 'giraffe',
 'standing',
 'next',
 'to',
 'a',
 'forest',
 'filled',
 'with',
 'trees',
 'A',
 'flower',
 'vase',
 'is',
 'sitting',
 'on',
 'a',
 'porch',
 'stand',
 'White',
 'vase',
 'with',
 'different',
 'colored',
 'flowers',
 'sitti

# General quantitative analysis

### Number of images in the dataset

In [14]:
print(f"Number of images in the dataset = {len(dataset)}")

Number of images in the dataset = 82783


### Number of captions

In [19]:
caption_to_image_ratio = len(dataset[0][1])
total_captions = len(dataset[0][1]) * len(dataset)
word_to_image_ratio = len(master_list) / len(dataset) # first load the full master_list in memory

print(f"Caption to image ratio = {caption_to_image_ratio}")
print(f"Total number of captions in the dataset = {total_captions}")
print(f"Total number of words in all captions = {len(master_list)}")
print(f"Average number of words that describe an image = {word_to_image_ratio}")

Caption to image ratio = 5
Total number of captions in the dataset = 413915
Total number of words in all captions = 71538
Average number of words that describe an image = 0.8641629320029475


### Number of unique words in the dataset

In [22]:
num_of_unique_words = len(set(word.lower() for word in master_list)) # number of unique words in the master_list

print(f"Total number of unique words in dataset = {num_of_unique_words}")

Total number of unique words in dataset = 3962
