In [1]:
# Load IPython's autoreload extension and set it to mode 2 so that edits to the
# imported project modules (utils, coco_dataloader, model) are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchvision import transforms
import sys
from pycocotools.coco import COCO
import math
import torch.utils.data as data
import numpy as np
import os
import requests
import time

from utils import train, validate, save_epoch, early_stopping
from coco_dataloader import get_loader
from model import ResNetEncoder, RNNDecoder

# Settings shared by the cells below.
batch_size = 32                            # forwarded to get_loader(batch_size=...)
vocab_threshold = 5                        # forwarded to get_loader(threshold=...)
load_vocab = True                          # forwarded to get_loader(load_vocab=...)
embedding_size, hidden_size = 256, 512     # constructor arguments for ResNetEncoder / RNNDecoder


[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:


# Per-channel mean / std used to normalize images for the pre-trained model.
channel_mean = (0.485, 0.456, 0.406)
channel_std = (0.229, 0.224, 0.225)

# Training pipeline: resize the smaller edge to 256, take a random 224x224
# crop, flip horizontally with probability 0.5, convert the PIL image to a
# tensor, then normalize.
transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])

# Validation pipeline: same resize, but a centered 224x224 crop and no
# flipping, followed by tensor conversion and normalization.
transform_val = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
])



In [4]:
# Training data loader: uses the training transform and is limited to
# 100000 samples via sample_size.
train_loader = get_loader(
    transform=transform_train,
    mode='train',
    sample_size=100000,
    batch_size=batch_size,
    threshold=vocab_threshold,
    load_vocab=load_vocab,
)


Loaded pre-built vocab file
loading annotations into memory...
Done (t=0.72s)
creating index...


  1%|          | 630/100000 [00:00<00:15, 6296.64it/s]

index created!
IDS 414113


100%|██████████| 100000/100000 [00:13<00:00, 7431.24it/s]


In [5]:
# Validation data loader: uses the validation transform. No sample_size is
# passed here; a `sample_size = 30000` argument was previously used and is
# currently disabled.
# NOTE(review): the recorded output below still shows 30000 items, so it
# appears to predate disabling sample_size -- confirm before relying on it.
val_loader = get_loader(
    transform=transform_val,
    mode='val',
    batch_size=batch_size,
    threshold=vocab_threshold,
    load_vocab=load_vocab,
)




Loaded pre-built vocab file
loading annotations into memory...
Done (t=0.36s)
creating index...


  2%|▏         | 715/30000 [00:00<00:04, 7141.99it/s]

index created!
IDS 202654


100%|██████████| 30000/30000 [00:04<00:00, 7265.18it/s]


In [6]:
# Upper bound of the epoch range used by the main training loop.
num_epochs = 3

# Report whether PyTorch can see a CUDA device.
cuda_available = torch.cuda.is_available()
print(cuda_available)

False


In [7]:
# Number of entries in the training vocabulary; passed to the decoder and
# later to train().
vocab_size = len(train_loader.dataset.vocab)

# Build the image encoder and the caption decoder.
encoder = ResNetEncoder(embedding_size)
decoder = RNNDecoder(embedding_size, hidden_size, vocab_size)

# Put both models on the GPU when one is available.
if torch.cuda.is_available():
    for module in (encoder, decoder):
        module.cuda()

Downloading: "https://download.pytorch.org/models/resnet50-19c8e357.pth" to /home/ec2-user/.cache/torch/checkpoints/resnet50-19c8e357.pth


HBox(children=(FloatProgress(value=0.0, max=102502400.0), HTML(value='')))




In [8]:
# Cross-entropy loss, moved to the GPU when CUDA is available.
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    criterion = criterion.cuda()

# Parameters handed to the optimizer: the whole decoder plus only the
# encoder's `embed` and `bn` sub-modules.
params = [
    *decoder.parameters(),
    *encoder.embed.parameters(),
    *encoder.bn.parameters(),
]

# Adam over the selected parameters.
optimizer = torch.optim.Adam(params=params, lr=0.001)

In [9]:
# Steps per epoch = number of caption-length entries divided by the batch
# size, rounded up.
n_train_captions = len(train_loader.dataset.caption_lengths)
n_val_captions = len(val_loader.dataset.caption_lengths)
total_train_step = math.ceil(n_train_captions / train_loader.batch_sampler.batch_size)
total_val_step = math.ceil(n_val_captions / val_loader.batch_sampler.batch_size)

for label, step_count in (("Number of training steps:", total_train_step),
                          ("Number of validation steps:", total_val_step)):
    print (label, step_count)

Number of training steps: 3125
Number of validation steps: 938


In [None]:
# Main training loop: for each epoch, train, validate, and checkpoint.
#
# Per-epoch history of training loss, validation loss and validation Bleu-4.
train_losses = []
val_losses = []
val_bleus = []
# Best validation Bleu-4 seen so far; starts at -inf so the first epoch
# always counts as an improvement.
best_val_bleu = float("-INF")

# Create the checkpoint directory up front so that a missing directory cannot
# make the first save fail after a full (and very long) training epoch.
os.makedirs("./models", exist_ok=True)

start_time = time.time()
for epoch in range(1, num_epochs + 1):
    train_loss = train(train_loader, encoder, decoder, criterion, optimizer, 
                       vocab_size, epoch, total_train_step)
    train_losses.append(train_loss)
    val_loss, val_bleu = validate(val_loader, encoder, decoder, criterion,
                                  train_loader.dataset.vocab, epoch, total_val_step)
    val_losses.append(val_loss)
    val_bleus.append(val_bleu)
    if val_bleu > best_val_bleu:
        print ("Validation Bleu-4 improved from {:0.4f} to {:0.4f}, saving model to best-model.pkl".
               format(best_val_bleu, val_bleu))
        best_val_bleu = val_bleu
        filename = os.path.join("./models", "best-model.pkl")
        save_epoch(filename, encoder, decoder, optimizer, train_losses, val_losses, 
                   val_bleu, val_bleus, epoch)
    else:
        print ("Validation Bleu-4 did not improve, saving model to model-{}.pkl".format(epoch))
    # Save the entire model anyway, regardless of being the best model so far or not
    filename = os.path.join("./models", "model-{}.pkl".format(epoch))
    save_epoch(filename, encoder, decoder, optimizer, train_losses, val_losses, 
               val_bleu, val_bleus, epoch)
    print ("Epoch [%d/%d] took %ds" % (epoch, num_epochs, time.time() - start_time))
    # Early stopping is only considered from the sixth epoch onwards, so it
    # never triggers while num_epochs <= 5.
    if epoch > 5:
        # Stop if the validation Bleu doesn't improve for 3 epochs
        if early_stopping(val_bleus, 3):
             break
    # Restart the per-epoch timer.
    start_time = time.time()

Epoch 1,Step [1/3125],8.449902296066284s, Loss:8.9115
Epoch 1,Step [2/3125],17.32950258255005s, Loss:8.8106
Epoch 1,Step [3/3125],25.44389057159424s, Loss:8.6398
Epoch 1,Step [4/3125],33.58589696884155s, Loss:8.3353


In [None]:
# Resume training from a saved checkpoint.
#
# This cell relies on objects created by earlier cells: encoder, decoder,
# optimizer, criterion, the data loaders, vocab_size, total_train_step,
# total_val_step, and the history state of the main training cell
# (train_losses, val_losses, val_bleus, best_val_bleu).

# Number of additional full epochs to run after the resumed one.
rem_epochs = 1

# Each path component is a separate argument. The original was missing a comma
# between 'all_set' and the file name, which silently concatenated them into
# 'all_settrain-model-1-10500.pkl'.
# NOTE(review): assumes the checkpoint lives in ./models/all_set/ -- confirm.
checkpoint_path = os.path.join('./models', 'all_set', 'train-model-1-10500.pkl')
# Remap tensors to the CPU when no GPU is present so that a checkpoint written
# on a CUDA machine can still be loaded here.
map_location = None if torch.cuda.is_available() else 'cpu'
checkpoint = torch.load(checkpoint_path, map_location=map_location)

# Load the pre-trained weights
encoder.load_state_dict(checkpoint['encoder'])
decoder.load_state_dict(checkpoint['decoder'])
optimizer.load_state_dict(checkpoint['optimizer'])

# Load start_loss from checkpoint if in the middle of training process; otherwise, comment it out
start_loss = checkpoint['total_loss']
# Reset start_loss to 0.0 if starting a new epoch; otherwise comment it out
#start_loss = 0.0

# Load epoch. Add 1 if we start a new epoch
first_epoch = checkpoint['epoch']
# Load start_step from checkpoint if in the middle of training process; otherwise, comment it out
start_step = checkpoint['train_step'] + 1
# Reset start_step to 1 if starting a new epoch; otherwise comment it out
#start_step = 1

# NOTE(review): the checkpoint name and the recorded output suggest it was
# written during a run with more steps per epoch than the current
# total_train_step -- verify that start_step <= total_train_step before
# resuming with this loader.

# Run the resumed epoch plus rem_epochs further epochs.
last_epoch = first_epoch + rem_epochs
start_time = time.time()
for epoch in range(first_epoch, last_epoch + 1):
    if epoch == first_epoch:
        # Only the resumed epoch continues from the checkpointed step and
        # running loss; previously these values were loaded but never used.
        train_loss = train(train_loader, encoder, decoder, criterion, optimizer, 
                           vocab_size, epoch, total_train_step, start_step, start_loss)
    else:
        train_loss = train(train_loader, encoder, decoder, criterion, optimizer, 
                           vocab_size, epoch, total_train_step)
    train_losses.append(train_loss)
    val_loss, val_bleu = validate(val_loader, encoder, decoder, criterion,
                                  train_loader.dataset.vocab, epoch, total_val_step)
    val_losses.append(val_loss)
    val_bleus.append(val_bleu)
    if val_bleu > best_val_bleu:
        print ("Validation Bleu-4 improved from {:0.4f} to {:0.4f}, saving model to best-model.pkl".
               format(best_val_bleu, val_bleu))
        best_val_bleu = val_bleu
        filename = os.path.join("./models", "best-model.pkl")
        save_epoch(filename, encoder, decoder, optimizer, train_losses, val_losses, 
                   val_bleu, val_bleus, epoch)
    else:
        print ("Validation Bleu-4 did not improve, saving model to model-{}.pkl".format(epoch))
    # Save the entire model anyway, regardless of being the best model so far or not
    filename = os.path.join("./models", "model-{}.pkl".format(epoch))
    save_epoch(filename, encoder, decoder, optimizer, train_losses, val_losses, 
               val_bleu, val_bleus, epoch)
    # Report against the last epoch of this resumed run rather than num_epochs,
    # which describes the original run and can be smaller than `epoch` here.
    print ("Epoch [%d/%d] took %ds" % (epoch, last_epoch, time.time() - start_time))
    if epoch > 5:
        # Stop if the validation Bleu doesn't improve for 3 epochs
        if early_stopping(val_bleus, 3):
             break
    # Restart the per-epoch timer.
    start_time = time.time()



Epoch 1,Step [5101/12942],5.198509693145752s, Loss:2.1459
Epoch 1,Step [5102/12942],10.339481592178345s, Loss:2.0999
Epoch 1,Step [5103/12942],15.504561424255371s, Loss:2.4344
Epoch 1,Step [5104/12942],20.58127999305725s, Loss:2.0875
Epoch 1,Step [5105/12942],25.795966148376465s, Loss:2.7829
Epoch 1,Step [5106/12942],30.98399567604065s, Loss:2.2643
Epoch 1,Step [5107/12942],36.169461488723755s, Loss:2.3524
Epoch 1,Step [5108/12942],42.50917339324951s, Loss:2.6932
Epoch 1,Step [5109/12942],51.55997371673584s, Loss:2.6480
Epoch 1,Step [5110/12942],77.7855772972107s, Loss:2.1126
Epoch 1,Step [5111/12942],110.40236234664917s, Loss:1.9038
Epoch 1,Step [5112/12942],142.48601627349854s, Loss:2.2815
Epoch 1,Step [5113/12942],174.0038137435913s, Loss:1.9853
Epoch 1,Step [5114/12942],205.1396987438202s, Loss:2.1130
Epoch 1,Step [5115/12942],247.2653510570526s, Loss:3.2203
Epoch 1,Step [5116/12942],279.94119811058044s, Loss:2.7299
Epoch 1,Step [5117/12942],296.773446559906s, Loss:2.2835
Epoch 1,S