In [2]:
import os
import json
import numpy as np
import torch
import torch.nn as nn
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torchvision.transforms import v2
import cv2
from torchvision.models import (densenet121, DenseNet121_Weights,
                                densenet161, DenseNet161_Weights,
                                resnet50, ResNet50_Weights,
                                resnet152, ResNet152_Weights, 
                                vgg19, VGG19_Weights)
from pycocotools.coco import COCO
from torch.utils.data import Dataset, DataLoader, Subset
from torchvision.datasets import CocoCaptions

  check_for_updates()


In [3]:
# Display the current working directory of this Python process.
os.getcwd()

'c:\\Users\\Srijan\\Desktop\\Srijan\\seq2seq-demo\\image_captioning\\cnn_lstm_attention'

In [4]:
# Root of the local COCO 2014 download. Every other path is derived from it,
# so relocating the dataset means editing this single line.
COCO_ROOT = "C:\\Users\\Srijan\\Desktop\\Srijan\\seq2seq-demo\\image_captioning\\COCO2014"
_COCO_ANNOTATION_DIR = os.path.join(COCO_ROOT, "annotations_trainval2014", "annotations")

# Image folders for the two splits.
train_root_img = os.path.join(COCO_ROOT, "train2014")
val_root_img = os.path.join(COCO_ROOT, "val2014")

# Caption annotation files for the two splits.
train_captions = os.path.join(_COCO_ANNOTATION_DIR, "captions_train2014.json")
val_captions = os.path.join(_COCO_ANNOTATION_DIR, "captions_val2014.json")

In [5]:
# Values shared by every preprocessing pipeline in this cell.
_INPUT_SIDE = 224                      # images are resized to _INPUT_SIDE x _INPUT_SIDE
_NORM_MEAN = (0.485, 0.456, 0.406)     # per-channel mean handed to Normalize
_NORM_STD = (0.229, 0.224, 0.225)      # per-channel std handed to Normalize

# Albumentations pipelines, one per split. Both resize, normalize and convert to a
# tensor; only "train" inserts the random rotation/flip augmentations in between.
# NOTE(review): flips/rotations may conflict with captions that mention
# left/right or orientation -- confirm this augmentation is intended.
trans_album = {
    split: A.Compose(
        [A.Resize(_INPUT_SIDE, _INPUT_SIDE, interpolation=cv2.INTER_AREA)]
        + augmentations
        + [
            A.Normalize(mean=list(_NORM_MEAN), std=list(_NORM_STD)),
            A.pytorch.ToTensorV2(),
        ],
        p=1.,
    )
    for split, augmentations in (
        ("train", [A.RandomRotate90(p=0.5), A.HorizontalFlip(p=0.5), A.VerticalFlip(p=0.5)]),
        ("test", []),
    )
}

# torchvision v2 pipeline: resize, convert to an image tensor, rescale to float32
# and normalize with the same statistics as above. No random augmentation.
trans_v2 = v2.Compose([
    v2.Resize((_INPUT_SIDE, _INPUT_SIDE)),
    v2.ToImage(),
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=list(_NORM_MEAN), std=list(_NORM_STD)),
])

In [6]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cuda')

In [7]:
# os.listdir() returns entries in arbitrary order, so take the lexicographically
# smallest name to get the same sample image on every run and every machine.
_img0_name = min(os.listdir(train_root_img))
with Image.open(os.path.join(train_root_img, _img0_name)) as _img0_file:
    # convert() yields an independent RGB image, so the file can be closed afterwards.
    img0 = _img0_file.convert('RGB')

In [8]:
# Pretrained ResNet-152 with its last two child modules removed, so the network
# returns a spatial feature map instead of class logits.
resnet152_net = resnet152(weights = ResNet152_Weights.DEFAULT)
resnet152_net = nn.Sequential(*list(resnet152_net.children())[:-2]).to(device)
# Used here purely as a feature extractor: eval mode makes BatchNorm use its
# pretrained running statistics instead of per-batch statistics, and stops the
# exploratory forward passes from modifying those statistics.
# Call resnet152_net.train() explicitly if the encoder is fine-tuned later.
resnet152_net.eval()
# Number of channels in the extracted feature map.
resnet152_dim = 2048

In [9]:
# Albumentations path: the pipeline takes a NumPy array by keyword and returns a
# dict whose "image" entry is the transformed tensor.
_img0_array = np.array(img0, dtype = np.float32)
_img0_album_tensor = trans_album["train"](image = _img0_array)["image"]
img0_trans_album = torch.unsqueeze(_img0_album_tensor.to(device), 0)

# torchvision v2 path: the pipeline is called directly on the PIL image.
img0_trans_v2 = torch.unsqueeze(trans_v2(img0).to(device), 0)

# Both results carry a leading batch dimension of 1.
img0_trans_album.shape, img0_trans_v2.shape

(torch.Size([1, 3, 224, 224]), torch.Size([1, 3, 224, 224]))

In [10]:
# Feature extraction only: no gradients are needed, so skip building the
# autograd graph and keep the intermediate activations out of GPU memory.
with torch.no_grad():
    img0_res152 = resnet152_net(img0_trans_album)
img0_res152.size()

torch.Size([1, 2048, 7, 7])

In [11]:
# Move the channel axis last: (N, C, H, W) -> (N, H, W, C).
# Not idempotent: re-running this cell permutes the axes again.
img0_res152 = torch.permute(img0_res152, (0, 2, 3, 1))
img0_res152.shape

torch.Size([1, 7, 7, 2048])

In [12]:
# Collapse every axis between the first and the last into one, so each image
# becomes a sequence of feature vectors: (N, H, W, C) -> (N, H*W, C).
img0_res152 = img0_res152.view(len(img0_res152), -1, img0_res152.shape[-1])
img0_res152.shape

torch.Size([1, 49, 2048])

In [13]:
# Concrete classes of the two pipelines (albumentations vs. torchvision v2).
tuple(type(pipeline) for pipeline in (trans_album["train"], trans_v2))

(albumentations.core.composition.Compose,
 torchvision.transforms.v2._container.Compose)

In [14]:
# Tell the two pipeline flavours apart with isinstance() and the public class
# names, rather than exact type equality against private module paths.
isinstance(trans_album["train"], A.Compose), isinstance(trans_v2, v2.Compose)

(True, True)

In [15]:
def get_coco_dataloader(
    transform,
    root: str,
    annFile: str,
    batch_size: int = 32,
    num_workers: int = 4,
    shuffle: bool = True,
    collate_fn=None,
):
    """
    Create a DataLoader for COCO Captions using torchvision's built-in dataset.

    Args:
        transform: Callable that CocoCaptions applies to every loaded image,
            e.g. a torchvision ``v2.Compose``. Pass ``None`` to use the default
            pipeline (224x224 resize, float32 scaling, mean/std normalization).
            An albumentations pipeline cannot be passed as-is because it has to
            be invoked as ``pipeline(image=ndarray)["image"]``; wrap it in a
            callable that does so.
        root: Path to the COCO images directory
        annFile: Path to the annotations json file
        batch_size: Number of samples per batch
        num_workers: Number of worker processes for data loading
        shuffle: Whether to reshuffle the samples every epoch. Set to False for
            deterministic evaluation.
        collate_fn: Optional function that merges a list of
            ``(image, captions)`` samples into a batch. ``None`` keeps the
            DataLoader default.

    Returns:
        A DataLoader yielding ``(images, captions)`` batches. With the default
        collate, ``captions`` is grouped by caption position, not by image:
        ``captions[k][i]`` is the k-th caption of the i-th image in the batch.
        The default collate also needs every image in a batch to have the same
        number of captions; supply ``collate_fn`` if that does not hold.
    """
    # Only fall back to the built-in pipeline when the caller did not supply one;
    # a supplied transform must never be overwritten.
    if transform is None:
        transform = v2.Compose([
            v2.Resize((224, 224)),
            v2.ToImage(),
            v2.ToDtype(torch.float32, scale=True),
            v2.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    dataset = CocoCaptions(
        root=root,
        annFile=annFile,
        transform=transform
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        # Pinned host memory only helps host-to-GPU copies; skip it (and the
        # warning it triggers) on CPU-only machines.
        pin_memory=torch.cuda.is_available(),
        collate_fn=collate_fn,
    )

In [16]:
# Echo the validation annotation path that is handed to the loader in the next cell.
val_captions

'C:\\Users\\Srijan\\Desktop\\Srijan\\seq2seq-demo\\image_captioning\\COCO2014\\annotations_trainval2014\\annotations\\captions_val2014.json'

In [17]:
# Build a loader over the validation split and look at a single batch.
root = val_root_img
annFile = val_captions
dataloader = get_coco_dataloader(root = root, annFile=annFile, transform=trans_v2)

# Each batch unpacks into (images, captions).
for images, captions in dataloader:
    # images: tensor of shape [batch_size, 3, 224, 224]
    # captions: list of lists grouped by caption position, NOT by image:
    # captions[k][i] is the k-th caption of the i-th image, so this batch holds
    # 5 inner lists of batch_size strings each.
    print(images.size())
    print(captions)
    # Only the first batch is inspected.
    break


loading annotations into memory...
Done (t=0.27s)
creating index...
index created!
torch.Size([32, 3, 224, 224])
[['Two people standing in a kitchen looking around. ', 'A woman covers her mouth as she is presented a birthday cake.  ', 'A white plate on a table topped with a sandwich.', 'A woman laying on a bathroom floor next to a toilet.', 'A sandwich that is sitting on a plate.', 'A young woman is walking to the ocean with her surfboard.', 'A dog catches a firsbee in the middle of the air. ', 'I am unable to see an image above.', 'An older gentleman in a white shirt and black bow tie.', 'A man riding a red surfboard on a wave in the ocean.', 'A woman holding a roasting pan filled with a turkey in an open oven.', 'A little boy walking down a street next to young men skateboarding on it.', 'An elephant reaches its trunk over a fence toward a kid. ', 'Large clock sitting on the side of a large building. ', 'A table topped with a colorful table cloth with food on top of it.', 'a tabby an

In [18]:
# (number of caption groups, entries per group). For the batch above this is
# (5, 32): 5 caption positions, each holding one string per image in the batch.
len(captions), len(captions[0])

(5, 32)

In [43]:
# Third caption group: the caption at position 2 for every image in the batch.
list(captions[2])

['Two people looking at a large kitchen ',
 'A person holding a cake with lit candles in front of a girl in a dark room.',
 'A half-eaten sandwich and coffee on a table with a camera.',
 'A woman laying down on a bathroom floor by a trash can. ',
 'A sandwich that has a type of meat on it.',
 'A girl on the beach walking towards the water with a surf board.',
 'A dog catches the frisbee while playing outside.',
 'A time lapse photo of a skier skiing down a hill.',
 'An older man with a white shirt and bow tie is smiling.',
 'A person is hanging from a rope while riding a surfboard.',
 'A woman is placing a large bird into the oven.',
 'Three young people go skateboarding down a street. ',
 'A girl standing in front of an elephant.',
 'A large building with statues at the top of it . ',
 'A table with ham, turkey and other various food entrees.',
 'An orange cat is lounging on a gray couch.',
 'A women who has a plate with pizza on it.',
 'Motorcyclist on chromed motorcycle rounding a c

In [19]:
# All five captions of the first image: entry 0 of each of the first five groups.
tuple(captions[position][0] for position in range(5))

('Two people standing in a kitchen looking around. ',
 'A couple of men are standing in a kitchen',
 'Two people looking at a large kitchen ',
 'An elderly man and woman looking around in a kitchen.',
 'Two people standing close to each other while standing in a kitchen.')

In [None]:
# import matplotlib.pyplot as plt

In [None]:
# image = images[0]
# print(type(image))
# mean = torch.tensor([0.485, 0.456, 0.406])
# std = torch.tensor([0.229, 0.224, 0.225])
# image = (image.permute(1, 2, 0) * std.view(1, 1, 3) + mean.view(1, 1, 3)).clamp(0, 1).mul(255).byte().numpy()  # scale to 0-255 first: .byte() on values in [0, 1] would truncate every pixel to 0 or 1
# image = Image.fromarray(image, mode="RGB")

<class 'torch.Tensor'>


In [None]:
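# NOTE: images[0] is still a normalized float tensor; de-normalize it and scale to uint8 before handing it to Image.fromarray.
# Image.fromarray(images[0].permute(1, 2, 0).numpy(), mode='RGB').show()
# NOTE: captions[0] holds the first caption of every image in the batch; the captions of image 0 are entry 0 of each group.
# print("COCO Caption: " + ", ".join(group[0] for group in captions))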


COCO Caption: Two people standing in a kitchen looking around. , A woman covers her mouth as she is presented a birthday cake.  , A white plate on a table topped with a sandwich., A woman laying on a bathroom floor next to a toilet., A sandwich that is sitting on a plate., A young woman is walking to the ocean with her surfboard., A dog catches a firsbee in the middle of the air. , I am unable to see an image above., An older gentleman in a white shirt and black bow tie., A man riding a red surfboard on a wave in the ocean., A woman holding a roasting pan filled with a turkey in an open oven., A little boy walking down a street next to young men skateboarding on it., An elephant reaches its trunk over a fence toward a kid. , Large clock sitting on the side of a large building. , A table topped with a colorful table cloth with food on top of it., a tabby and black cat lounging on a sofa , A woman sitting at a table with a  pizza in front of her., A man in a jean jacket riding a motorcyc

In [34]:
# Encode the whole batch. Gradients are not needed for feature extraction, so
# disable autograd to avoid keeping every intermediate activation in GPU memory.
with torch.no_grad():
    batch0_enc = resnet152_net(images.to(device))
batch0_enc.size()

torch.Size([32, 2048, 7, 7])

In [35]:
# Move the channel axis last: (N, C, H, W) -> (N, H, W, C).
# Not idempotent: re-running this cell permutes the axes again.
batch0_enc = torch.permute(batch0_enc, (0, 2, 3, 1))
batch0_enc.shape

torch.Size([32, 7, 7, 2048])

In [36]:
# Collapse the spatial axes into one sequence axis: (N, H, W, C) -> (N, H*W, C).
batch0_enc = batch0_enc.view(len(batch0_enc), -1, batch0_enc.shape[-1])
batch0_enc.shape

torch.Size([32, 49, 2048])

In [44]:
# Longest caption in the batch, measured in characters (not tokens).
max(map(len, (text for group in captions for text in group)))

180

In [46]:
# Pair every caption with its (group, position-in-group) index, then keep the
# pair whose caption has the most characters (the first one wins on ties).
_indexed_captions = [
    (text, (group_idx, item_idx))
    for group_idx, group in enumerate(captions)
    for item_idx, text in enumerate(group)
]
max_len_caption, max_indices = max(_indexed_captions, key=lambda pair: len(pair[0]))
len(max_len_caption), max_len_caption, max_indices

(180,
 'The words "Connect_Anywhere" cover a black and white photo of an unattended backpack left beside the window of an airport terminal overlooking the tarmac and a commercial airplane.',
 (2, 28))

In [47]:
# Longest caption (in characters) within the first caption group, minus one.
# NOTE(review): the reason for subtracting 1 is not evident from this cell -- confirm.
max(map(len, captions[0])) - 1

81