#### **Encoder-Decoder Model**

In [1]:
#Get the batches of images and captions
import numpy as np
import torch
import torch.utils.data as data
from coco_dataloader import get_loader
from torchvision import transforms



[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:

# Image preprocessing for the ResNet encoder: resize, centre-crop to 224,
# random horizontal flip, tensor conversion, then per-channel normalisation.
transform_img = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(
            (0.485, 0.456, 0.406),
            (0.229, 0.224, 0.225),
        ),
    ]
)

vocab_threshold = 5
batch_size = 10

# `threshold=vocab_threshold` is deliberately not passed: the loader is asked
# to load an existing vocabulary (load_vocab=True) instead.
coco_dataloader = get_loader(
    transform=transform_img,
    mode='train',
    batch_size=batch_size,
    load_vocab=True,
)

# Replace the batch sampler's inner sampler with a random sampler restricted
# to the indices the dataset hands back.
indices = coco_dataloader.dataset.get_indices()
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
coco_dataloader.batch_sampler.sampler = new_sampler

# Pull exactly one batch and stop iterating.
for batch in coco_dataloader:
    images = batch[0]
    captions = batch[1]
    break
    
    

Loaded pre-built vocab file
loading annotations into memory...
Done (t=0.79s)
creating index...


  0%|          | 591/414113 [00:00<01:09, 5908.71it/s]

index created!


100%|██████████| 414113/414113 [00:55<00:00, 7478.52it/s]


##### Image Encoder CNN
The CNN encoder uses a pretrained ResNet-50 or Inception-v3 model with the top fully connected layer removed to extract feature vectors from a batch of images. The feature vectors are then flattened and passed through a linear layer so that they have the same size as the embedding.

In [3]:
# Enable autoreload (mode 2) so edits to model.py are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
# The two CNN encoder variants exercised in the cells below.
from model import ResNetEncoder,InceptionEncoder

In [4]:
# ResNet-50 encoder: embed one batch of images and sanity-check the output shape.
embedding_size = 256

encoderCNN_ResNet = ResNetEncoder(embedding_size)
print(encoderCNN_ResNet)

# Move the model and its input to the GPU together so both live on the same
# device. The model must be referenced by the name it was bound to above.
if torch.cuda.is_available():
    encoderCNN_ResNet = encoderCNN_ResNet.cuda()
    images = images.cuda()

features = encoderCNN_ResNet(images)

# The encoder must return one embedding_size-wide vector per image.
f_s = features.shape
assert (f_s[0] == batch_size and f_s[1] == embedding_size)

    

ResNetEncoder(
  (resnet): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (4): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
          (0): Conv2d

In [5]:
# Display the encoder output shape; the assertion in the previous cell checks
# that the first two dimensions are (batch_size, embedding_size).
features.shape

torch.Size([10, 256])

In [None]:

# Image preprocessing for the Inception encoder: resize and centre-crop to 299,
# tensor conversion, then per-channel normalisation (no random flip here).
transform_img = transforms.Compose(
    [
        transforms.Resize(299),
        transforms.CenterCrop(299),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

vocab_threshold = 5
batch_size = 2

# NOTE(review): this cell rebinds `transform_img`, `batch_size`,
# `coco_dataloader`, `images` and `captions`, replacing the values produced by
# the ResNet data-loading cell. Later cells that combine the ResNet `features`
# with `captions`/`batch_size` will see these new values if this cell has run.

# `threshold=vocab_threshold` is deliberately not passed: the loader is asked
# to load an existing vocabulary (load_vocab=True) instead.
coco_dataloader = get_loader(
    transform=transform_img,
    mode='train',
    batch_size=batch_size,
    load_vocab=True,
)

# Replace the batch sampler's inner sampler with a random sampler restricted
# to the indices the dataset hands back.
indices = coco_dataloader.dataset.get_indices()
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
coco_dataloader.batch_sampler.sampler = new_sampler

# Pull exactly one batch and stop iterating.
for batch in coco_dataloader:
    images = batch[0]
    captions = batch[1]
    break

In [None]:
# Inception-v3 encoder: embed one batch of images and sanity-check the output shape.
embedding_size = 256

encoderCNN_Inception = InceptionEncoder(embedding_size)
print(encoderCNN_Inception)

# Move the model and its input to the GPU together so both live on the same
# device. The model must be referenced by the name it was bound to above.
if torch.cuda.is_available():
    encoderCNN_Inception = encoderCNN_Inception.cuda()
    images = images.cuda()

features_iv3 = encoderCNN_Inception(images)

# The encoder must return one embedding_size-wide vector per image.
f_s_iv3 = features_iv3.shape
assert (f_s_iv3[0] == batch_size and f_s_iv3[1] == embedding_size)


#### Text Decoder RNN
The decoder RNN takes two inputs: the feature tensor of embedded image features produced by the encoder, and the caption tensor from the most recent batch.

In [8]:
%load_ext autoreload
%autoreload 2
from model import RNNDecoder

hidden_size=512
embedding_size=256

vocab_size = len(coco_dataloader.dataset.vocab)

decoderRNN = RNNDecoder(embedding_size,hidden_size,vocab_size)

if torch.cuda.is_available():
    decoder=decoder.gpu()
    captions = captions.gpu()

outputs_r50 = decoderRNN(features,captions)

print(outputs_r50.shape)

out_r50_s = outputs_r50.shape

assert (out_r50_s[0]==batch_size and out_r50_s[1]==captions.shape[1] and out_r50_s[2]==vocab_size)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
torch.Size([10, 12, 8856])
