In [1]:
from vocabulary import Vocabulary
from data_loader import SportDataset
from model import EncoderCNN, DecoderRNN

import torch
import math
import pandas as pd
import numpy as np
import os
import itertools
import sys
import random
import torchvision
import matplotlib.pyplot as plt

from time import time
from PIL import Image

from sklearn import preprocessing
from torch import nn, optim
from torchvision import datasets, transforms
import torch.utils.data as data
from torch.utils.data import Dataset, DataLoader

# The functional module contains helper functions for defining neural network layers as simple functions
import torch.nn.functional as F

In [2]:
# ---------------------------------------------------------------------------
# Training configuration: every tunable value used by the cells below.
# ---------------------------------------------------------------------------

# Model dimensions
embed_size = 300           # size passed to both EncoderCNN and DecoderRNN (image / word embedding width)
hidden_size = 512          # hidden-state size passed to DecoderRNN

# Vocabulary options
# NOTE(review): these two are not referenced by any cell visible here — confirm they are still needed.
vocab_threshold = 5        # minimum word count threshold
vocab_from_file = False    # if True, load existing vocab file

# Optimisation schedule
batch_size = 100           # samples per DataLoader batch; also divides the caption count to get steps per epoch
num_epochs = 500           # number of passes of the outer training loop

# Reporting and checkpointing
print_every = 5            # every N steps the current batch statistics are kept on their own output line
save_every = 100           # encoder/decoder weights are saved every N epochs
log_file = 'training_log_100500.txt'   # per-step loss and perplexity are written to this file

In [3]:
# Image preprocessing pipeline handed to SportDataset in the training cell.
# Steps run in the listed order:
#   1. resize to 100 x 100,
#   2. convert to a tensor,
#   3. normalize with mean 0.5 and std 0.5.
# NOTE(review): mean/std are single-element tuples — confirm this matches the
# channel count of the training images and the installed torchvision version.
_preprocessing_steps = [
    transforms.Resize((100, 100)),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(_preprocessing_steps)

In [4]:
# Inputs for the project-local Vocabulary class: the training image directory
# and the captions CSV.
image_path = 'Training Images'
caption_file = 'Overall_Training_Captions_csv.csv'
# NOTE(review): presumably builds the word vocabulary from the caption file —
# confirm in vocabulary.py. The next cell reads `vocab_size` and
# `caption_lengths` from this object.
vocab = Vocabulary(image_path, caption_file)

In [5]:
# Pull the two values later cells need off the vocabulary object:
#   vocab_size      - passed to DecoderRNN and used to reshape its outputs for the loss
#   caption_lengths - its length is divided by the batch size to get the steps per epoch
vocab_size, caption_lengths = vocab.vocab_size, vocab.caption_lengths

In [6]:
# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the CNN encoder and RNN decoder and place both on the chosen device.
# (nn.Module.to returns the module itself, so chaining keeps the same objects.)
encoder = EncoderCNN(embed_size).to(device)
decoder = DecoderRNN(embed_size, hidden_size, vocab_size).to(device)

# Cross-entropy loss; moved to the GPU only when CUDA is available.
criterion = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    criterion = criterion.cuda()

# Parameters handed to the optimizer: every decoder parameter plus those of
# the encoder's `embed` submodule only; the rest of the encoder is not optimized.
params = [*decoder.parameters(), *encoder.embed.parameters()]

# Adam optimizer over the selected parameters.
optimizer = torch.optim.Adam(params=params, lr=0.001)

# Training steps per epoch: number of caption lengths divided by the batch
# size, rounded up.
total_step = math.ceil(len(caption_lengths) / batch_size)

In [7]:
# CSV passed to SportDataset as its second argument in the training cell.
captionInIdx_file = 'image_index.csv'

# Directory holding the training images. Deliberately NOT named `dir`:
# a module-level `dir` would shadow the built-in dir() for the rest of the
# kernel session.
train_image_dir = 'Training Images'


def train_path(p):
    """Return the path of file name `p` inside the training image directory."""
    return os.path.join(train_image_dir, p)


# Every directory entry is treated as an image; entries are not filtered.
img_files = [train_path(name) for name in os.listdir(train_image_dir)]
print('Number of images :', len(img_files))

# Shuffle once, in place. `train_files` is the same list object as `img_files`.
random.shuffle(img_files)
train_files = img_files

Number of images : 1135


In [8]:
# Directory that receives the periodic checkpoints. Create it up front:
# torch.save does not create missing parent directories, so without this the
# first save would fail only after `save_every` epochs of training.
model_dir = './models_100500'
os.makedirs(model_dir, exist_ok=True)

# Open the training log file. The `with` block guarantees it is closed even
# when training is interrupted (e.g. KeyboardInterrupt) or a step raises.
with open(log_file, 'w') as f:

    for epoch in range(1, num_epochs+1):

        # The dataset and loader are rebuilt at the start of every epoch.
        train_ds = SportDataset(train_files, captionInIdx_file, transform, mode = 'train')
        train_dl = DataLoader(train_ds, batch_size = batch_size)

        # Number of batches this loader yields; used as the step total in the
        # statistics so it always matches what is actually iterated.
        steps_in_epoch = len(train_dl)

        # Iterate over the loader itself instead of calling `.next()` on its
        # iterator a precomputed number of times: `.next()` is not available on
        # current PyTorch DataLoader iterators, and a step count derived from
        # another source can overrun the loader and raise StopIteration.
        for i_step, (images, captions) in enumerate(train_dl, start=1):

            # Move batch of images and captions to GPU if CUDA is available.
            images = images.to(device)
            captions = captions.to(device)

            # Zero the gradients.
            decoder.zero_grad()
            encoder.zero_grad()

            # Pass the inputs through the CNN-RNN model.
            features = encoder(images)
            outputs = decoder(features, captions)

            # Calculate the batch loss: outputs are flattened to rows of
            # `vocab_size` scores and compared with the flattened captions.
            loss = criterion(outputs.view(-1, vocab_size), captions.view(-1))

            # Backward pass.
            loss.backward()

            # Update the parameters in the optimizer.
            optimizer.step()

            # Get training statistics (perplexity is exp of the batch loss).
            batch_loss = loss.item()
            stats = 'Epoch [%d/%d], Step [%d/%d], Loss: %.4f, Perplexity: %5.4f' % (epoch, num_epochs, i_step, steps_in_epoch, batch_loss, np.exp(batch_loss))

            # Print training statistics (on same line).
            print('\r' + stats, end="")
            sys.stdout.flush()

            # Print training statistics to file; flush so the log is current
            # on disk after every step.
            f.write(stats + '\n')
            f.flush()

            # Print training statistics (on different line).
            if i_step % print_every == 0:
                print('\r' + stats)

        # Save the weights.
        if epoch % save_every == 0:
            torch.save(decoder.state_dict(), os.path.join(model_dir, 'decoder-%d.pkl' % epoch))
            torch.save(encoder.state_dict(), os.path.join(model_dir, 'encoder-%d.pkl' % epoch))

Epoch [1/500], Step [5/12], Loss: 4.0095, Perplexity: 55.119237
Epoch [1/500], Step [10/12], Loss: 3.2357, Perplexity: 25.4236
Epoch [2/500], Step [5/12], Loss: 2.4887, Perplexity: 12.04528
Epoch [2/500], Step [10/12], Loss: 2.2805, Perplexity: 9.7811
Epoch [3/500], Step [5/12], Loss: 2.1096, Perplexity: 8.244898
Epoch [3/500], Step [10/12], Loss: 2.0790, Perplexity: 7.9966
Epoch [4/500], Step [5/12], Loss: 1.9606, Perplexity: 7.10380
Epoch [4/500], Step [10/12], Loss: 1.9501, Perplexity: 7.0295
Epoch [5/500], Step [5/12], Loss: 1.8358, Perplexity: 6.27014
Epoch [5/500], Step [10/12], Loss: 1.8321, Perplexity: 6.2473
Epoch [6/500], Step [5/12], Loss: 1.7213, Perplexity: 5.59151
Epoch [6/500], Step [10/12], Loss: 1.7285, Perplexity: 5.6319
Epoch [7/500], Step [5/12], Loss: 1.6227, Perplexity: 5.06665
Epoch [7/500], Step [10/12], Loss: 1.6370, Perplexity: 5.1395
Epoch [8/500], Step [5/12], Loss: 1.5357, Perplexity: 4.64465
Epoch [8/500], Step [10/12], Loss: 1.5543, Perplexity: 4.7320
Epo

Epoch [66/500], Step [5/12], Loss: 0.1672, Perplexity: 1.18202
Epoch [66/500], Step [10/12], Loss: 0.1666, Perplexity: 1.1813
Epoch [67/500], Step [5/12], Loss: 0.1635, Perplexity: 1.17764
Epoch [67/500], Step [10/12], Loss: 0.1691, Perplexity: 1.1842
Epoch [68/500], Step [5/12], Loss: 0.1544, Perplexity: 1.16695
Epoch [68/500], Step [10/12], Loss: 0.1552, Perplexity: 1.1679
Epoch [69/500], Step [5/12], Loss: 0.1388, Perplexity: 1.14897
Epoch [69/500], Step [10/12], Loss: 0.1450, Perplexity: 1.1561
Epoch [70/500], Step [5/12], Loss: 0.1334, Perplexity: 1.14276
Epoch [70/500], Step [10/12], Loss: 0.1343, Perplexity: 1.1437
Epoch [71/500], Step [5/12], Loss: 0.1249, Perplexity: 1.13315
Epoch [71/500], Step [10/12], Loss: 0.1347, Perplexity: 1.1442
Epoch [72/500], Step [5/12], Loss: 0.1235, Perplexity: 1.13155
Epoch [72/500], Step [10/12], Loss: 0.1373, Perplexity: 1.1471
Epoch [73/500], Step [5/12], Loss: 0.1273, Perplexity: 1.13584
Epoch [73/500], Step [10/12], Loss: 0.1348, Perplexity:

Epoch [130/500], Step [10/12], Loss: 0.0044, Perplexity: 1.0044
Epoch [131/500], Step [5/12], Loss: 0.0044, Perplexity: 1.00447
Epoch [131/500], Step [10/12], Loss: 0.0043, Perplexity: 1.0043
Epoch [132/500], Step [5/12], Loss: 0.0043, Perplexity: 1.00436
Epoch [132/500], Step [10/12], Loss: 0.0042, Perplexity: 1.0042
Epoch [133/500], Step [5/12], Loss: 0.0042, Perplexity: 1.00426
Epoch [133/500], Step [10/12], Loss: 0.0041, Perplexity: 1.0041
Epoch [134/500], Step [5/12], Loss: 0.0041, Perplexity: 1.00416
Epoch [134/500], Step [10/12], Loss: 0.0040, Perplexity: 1.0040
Epoch [135/500], Step [5/12], Loss: 0.0040, Perplexity: 1.00405
Epoch [135/500], Step [10/12], Loss: 0.0039, Perplexity: 1.0039
Epoch [136/500], Step [5/12], Loss: 0.0039, Perplexity: 1.00395
Epoch [136/500], Step [10/12], Loss: 0.0039, Perplexity: 1.0039
Epoch [137/500], Step [5/12], Loss: 0.0038, Perplexity: 1.00385
Epoch [137/500], Step [10/12], Loss: 0.0038, Perplexity: 1.0038
Epoch [138/500], Step [5/12], Loss: 0.00

Epoch [194/500], Step [10/12], Loss: 0.0015, Perplexity: 1.0015
Epoch [195/500], Step [5/12], Loss: 0.0015, Perplexity: 1.00156
Epoch [195/500], Step [10/12], Loss: 0.0015, Perplexity: 1.0015
Epoch [196/500], Step [5/12], Loss: 0.0015, Perplexity: 1.00156
Epoch [196/500], Step [10/12], Loss: 0.0015, Perplexity: 1.0015
Epoch [197/500], Step [5/12], Loss: 0.0015, Perplexity: 1.00156
Epoch [197/500], Step [10/12], Loss: 0.0015, Perplexity: 1.0015
Epoch [198/500], Step [5/12], Loss: 0.0015, Perplexity: 1.00156
Epoch [198/500], Step [10/12], Loss: 0.0015, Perplexity: 1.0015
Epoch [199/500], Step [5/12], Loss: 0.0015, Perplexity: 1.00156
Epoch [199/500], Step [10/12], Loss: 0.0015, Perplexity: 1.0015
Epoch [200/500], Step [5/12], Loss: 0.0014, Perplexity: 1.00156
Epoch [200/500], Step [10/12], Loss: 0.0014, Perplexity: 1.0014
Epoch [201/500], Step [5/12], Loss: 0.0014, Perplexity: 1.00146
Epoch [201/500], Step [10/12], Loss: 0.0014, Perplexity: 1.0014
Epoch [202/500], Step [5/12], Loss: 0.00

KeyboardInterrupt: 