In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from torch import optim
import torch
from torch import nn
from torch.autograd import Variable
import pandas
from sklearn.preprocessing import MinMaxScaler
import torchvision.models as models
import torchvision.transforms as transforms
import torch
from torch.autograd import Variable
import numpy as np
from PIL import Image
import copy
import shutil

from pycocotools.coco import COCO
import numpy as np
import skimage.io as io
import matplotlib.pyplot as plt
import pylab
pylab.rcParams['figure.figsize'] = (8.0, 10.0)
from torch.utils.data import Dataset, DataLoader

from dataset import MSCOCODataset
from torch.optim import lr_scheduler
from autocorrect import spell
import nltk
from IPython.display import display
import os

In [2]:
DEF_SEND = '<SEND>'
DEF_START = '<START>'

In [3]:
gpu_device = 3

In [4]:
dataDir='/home/p.zaydel/ProjectNeuralNets/coco_dataset/'
imagesDirTrain = '{}train2017/train2017'.format(dataDir)
imagesDirVal = '{}val2017/val2017'.format(dataDir)

annTrainFile = '{}/annotations_trainval2017/annotations/captions_train2017.json'.format(dataDir)
annValFile = '{}/annotations_trainval2017/annotations/captions_val2017.json'.format(dataDir)

In [5]:
transform_tensor = transforms.Compose([
                                transforms.ToTensor(), 
                                transforms.Normalize(
                                    mean=[0.485, 0.456, 0.406],
                                    std=[0.229, 0.224, 0.225])
                                           ])
transform_to224 = transforms.Compose([transforms.Resize((224, 224)),
                                      transform_tensor
                                     ])
transform_to500 = transforms.Compose([ transforms.Resize((500, 500)),
                                      transform_tensor
                                           ])

In [6]:
TRAIN_DATSET_FILE = 'traindataset.tar.gz'
TEST_DATSET_FILE = 'testdataset.tar.gz'

In [7]:

def split_text2words(text):
    symbs_to_replace = ['.', ',', '/', '-', ':', '{', '}', '[', ']', ]
    for smb in symbs_to_replace:
        text = text.replace(smb, ' ')
    
    
    words = nltk.word_tokenize(text.lower())
    
    for idx in range(len(words)):
        words[idx] = spell(words[idx])
    
    words = [DEF_START] + words + [DEF_SEND]
    
    return words

# def anns2words(anns_list):
#     texts = []
#     for anns in anns_list:
#         for ann in anns['anns']:
#             words = split_text2words(ann)
#             texts.append(words)
            
#     return texts

from gensim.models import Word2Vec
def train_word_to_vec_gensim(dataset, embed_size = 300):
    Texts = dataset.anns.values()
    model = Word2Vec(Texts, size = embed_size)
    return model

def generate_vocab_dicts(dataset): 
    Texts = dataset.anns.values()
    uniqwords = list(set([w for ann in Texts for w in ann]))
    words2ids = dict(zip(uniqwords, range(len(uniqwords))) )
    ids2words = dict(zip(range(len(uniqwords)), uniqwords ))
    return words2ids, ids2words


def wordslist2wordids(words, word2id, vector_length = None ):
    if vector_length is None:
        word_ids = [word2id[w] for w in words]
        
    else:
        word_ids = []
        for idx in range(vector_length):
            if idx < len(words):
                w = words[idx]
            else:
                w = end_word
                
            word_ids.append(word2id[w])
        
        if word_ids[-1] != word2id[DEF_SEND]:
            word_ids[-1] = word2id[DEF_SEND]
        
    return torch.from_numpy(np.array(word_ids).astype(np.int))


def sentence2wordids(sentence, word2id, vector_length = None):
    
    if vector_length is None:
        words = split_text2words(sentence)
        word_ids = [word2id[w] for w in words]
        
    else:
        words = split_text2words(sentence)
        word_ids = []
        for idx in range(vector_length):
            if idx < len(words):
                w = words[idx]
            else:
                w = end_word
                
            word_ids.append(word2id[w])
        
        if word_ids[-1] != word2id[DEF_SEND]:
            word_ids[-1] = word2id[DEF_SEND]
        
    return torch.from_numpy(np.array(word_ids).astype(np.int))         
    
    
import numpy as np
# calculates dimension of alexnet convolutions layers output 
def get_alexnet_features_dim(imsize):
    adim = int(np.round( 3*0.01*imsize - 1))
    return 1*256*adim*adim

In [8]:
def save_prepared_dataset(dataset, filename):
    dataset.text_transform = split_text2words
    dataset.preload_anotations()
    dataset.text_transform = None
    torch.save(dataset, filename)
    print("Dataset saved in {}".format(filename))

In [9]:
def load_anns(dataset, annids, max_len, prepare = None):
    '''
       dataset - MSCOCODataset
       annids -  tensor or numpy array
       max_len - maximum len of sentence. If None computes from dataset 
       prepare - None or function to prepare each word, returns 1-dim tensor
       
       return Pytorch Tensor [len(annids) x max_sentence_len x prepare(word).shape[0] ]
    '''
    result = []
    
    if prepare is None:
        prepare = lambda w: word_embeding[w]
    
    for i in range(annids.shape[0]):
        words = dataset.get_ann(annids[i])
        ann_res = []
        
        for idx in range(max_len):
            if idx < len(words):
                w = words[idx]
            else:
                w = DEF_SEND
                
            ann_res.append(prepare(w))
        ann_res = torch.from_numpy(np.array(ann_res)).float()
        result.append(ann_res)
        
    return torch.stack(result)
    
    
    
    

In [10]:
if os.path.exists(TRAIN_DATSET_FILE):
    print("loading train dataset...")
    trainDataset = torch.load(TRAIN_DATSET_FILE)
    print('train dataset loaded!')
else:
    trainDataset = MSCOCODataset(annTrainFile,imagesDirTrain, transform = transform_to224, mode='pic2rand')
    save_prepared_dataset(trainDataset, TRAIN_DATSET_FILE)
    
if os.path.exists(TEST_DATSET_FILE):
    print("loading test dataset...")
    testDataset = torch.load(TEST_DATSET_FILE)
    print('test dataset loaded!')
else:
    testDataset = MSCOCODataset(annValFile,imagesDirVal, transform = transform_to224, mode='pic2rand')
    save_prepared_dataset(testDataset, TEST_DATSET_FILE)



loading train dataset...
train dataset loaded!
loading test dataset...
test dataset loaded!


In [11]:
trainDataset.transform = transform_to224
testDataset.transform = transform_to224

In [12]:
# text_transform = lambda text: sentence2wordids(text, words2ids, vector_length = 20)
# trainDataset.text_transform = text_transform
# testDataset.text_transform = text_transform

In [13]:
# trainAnnCaps = [ann['caption'] for ann in trainDataset.coco.loadAnns(trainDataset.coco.getAnnIds())]

# trainAnns = trainDataset.anns.values()
# trainTexts = anns2words(trainAnns)
# sent_lengths = np.array([len(ann) for ann in trainTexts])
# print("max sent id", sent_lengths.argmax())
# print('max len',np.max(sent_lengths))
# plt.plot(np.unique(sent_lengths), np.bincount(sent_lengths)[6:])

In [14]:
# testAnns = testDataset.anns.values()
# testTexts = anns2words(testAnns)
# test_sent_lengths = np.array([len(ann) for ann in testTexts])
# print("max sent id", test_sent_lengths.argmax())
# print('max len',np.max(test_sent_lengths))
# test_bin_count = np.bincount(test_sent_lengths)
# plt.plot(np.unique(test_sent_lengths), test_bin_count[test_bin_count > 0])

In [15]:
import os
print("Creating dictionary......")
if os.path.exists('dictionaries.tar.gz'):
    print("loading dictionary")
    dic_state = torch.load('dictionaries.tar.gz')
    words2ids = dic_state['words2ids']
    ids2words = dic_state['ids2words']
    print("dictionary loaded")
else:
    words2ids, ids2words  = generate_vocab_dicts(trainDataset)
    print("saving dictionary")
    torch.save({'words2ids': words2ids, 'ids2words': ids2words }, 'dictionaries.tar.gz')

Creating dictionary......
loading dictionary
dictionary loaded


In [16]:
words2ids[DEF_START]

14388

In [17]:
# USE google word embeddings pretrined

#import gensim
#google_word_embeding = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [18]:
def pytorchEmbed_from_pretrained(embeddings, freeze=True):
    rows, cols = embeddings.shape
    embedding = torch.nn.Embedding(num_embeddings=rows, embedding_dim=cols)
    embedding.weight = torch.nn.Parameter(embeddings)
    embedding.weight.requires_grad = not freeze
    return embedding

In [19]:
# # additional train on dataset

# uniq_words = []
# for l in trainDataset.anns.values():
#     for w in l:
#         uniq_words.append(w)
# uniq_words = set(uniq_words)

# problem_words = []
# for w in uniq_words:
#     try:
#         emb = google_word_embeding[w]
#     except:
#         problem_words.append(w)
# problem_words


In [20]:
# len(problem_words)

In [21]:
# MY WORD EMBEDDINGS

import os

WORD_EMBED_FILE = 'word_embeding.tar.gz'
if os.path.exists(WORD_EMBED_FILE):
    print("loading words embedding")
    word_embeding = torch.load(WORD_EMBED_FILE)
    print("words embedding loaded")
else:
    print("creating words embedding......")
    word_embeding = train_word_to_vec_gensim(trainDataset, embed_size = 300 )
    print("saving words embedding")
    torch.save(word_embeding, WORD_EMBED_FILE)

loading words embedding
words embedding loaded


In [22]:
# emb = google_word_embeding['cat'] + google_word_embeding['kitten']
# google_word_embeding.similar_by_vector(emb)

In [26]:
# Anns = trainDataset.anns.values()
# Texts = anns2words(Anns, DEF_SEND)

In [27]:
# problem_words = []
# uniq_words = set(words2ids.keys())
# for word in tqdm.tqdm_notebook(uniq_words):
#     try:
#         emb = google_word_embeding[spell(word)]
#     except:
#         #print("w:{}  corrected:{}".format(word, spell(word)))
#         problem_words.append(word)

In [28]:
# len(problem_words)

In [29]:
#sentence2wordids(Anns[0]['anns'][3], words2ids,  vector_length = 20 )

In [25]:
trainDataLoader = DataLoader(trainDataset, batch_size = 64, shuffle=True)
testDataLoader = DataLoader(testDataset, batch_size = 64, shuffle=True)

In [17]:
# vec = word_embeding['.']
# print(vec)
# word_embeding.wv.similar_by_vector(vec)

In [18]:
# len(words2ids)

In [93]:
# h_t = Variable(torch.zeros(5, 10), requires_grad=False)
# c_t = Variable(torch.zeros(5, 10), requires_grad=False)
        
# prevWord = Variable(torch.zeros(5, 10), requires_grad=False)
# X = Variable(torch.zeros(5, 10), requires_grad=False)

# X = torch.cat([X, prevWord])
# X

In [56]:
torch.stack([torch.from_numpy(np.array([1, 2, 3])), torch.from_numpy(np.array([1, 2, 3]))],0)


 1  2  3
 1  2  3
[torch.LongTensor of size 2x3]

In [50]:
wt = torch.zeros(5, 10)
print(wt)

tt = torch.from_numpy(np.array([1.0, 2.0, 3.0, 4.0])).float()
tt = tt.repeat(5, 1)
print(tt)

torch.cat([wt, tt], dim = 1)


    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
    0     0     0     0     0     0     0     0     0     0
[torch.FloatTensor of size 5x10]


 1  2  3  4
 1  2  3  4
 1  2  3  4
 1  2  3  4
 1  2  3  4
[torch.FloatTensor of size 5x4]





Columns 0 to 12 
    0     0     0     0     0     0     0     0     0     0     1     2     3
    0     0     0     0     0     0     0     0     0     0     1     2     3
    0     0     0     0     0     0     0     0     0     0     1     2     3
    0     0     0     0     0     0     0     0     0     0     1     2     3
    0     0     0     0     0     0     0     0     0     0     1     2     3

Columns 13 to 13 
    4
    4
    4
    4
    4
[torch.FloatTensor of size 5x14]

In [123]:

class LSTM_W2V_Net(nn.Module):

    def __init__(self,  image_size, image_features_size, word_embedding, words2ids, ids2words,
                 lstm_hidden_size = 4096,
                 word_embedding_size = 300, 
                 cnn = models.alexnet(pretrained=True).features,
                 start_symbol = DEF_START,
                 end_symbol = DEF_SEND
              #   cnn_comp_features = lambda cnn, x: cnn.features(x),
              #   max_sentence_len = 20,
              #   sentence_end_embed = None,
             #  sentence_end_symbol = '.'
                  ):
        """Init NN
            image_size - size of input image.
            lstm_hidden_size - size of cnn features output
            image_features_size - size of image features vector
            word_embedding - pretrained word embedding model
            words2ids - dictionary word -> id
            ids2words - dictionary id -> word
            cnn - pretrained cnn net (alexnet, vgg and other)
            start_symbol - symbol starting sequence
            end_symbol - symbol ending sequence
        """
        
        super(LSTM_W2V_Net, self).__init__()
        self.image_size = image_size
        self.image_features_size = image_features_size
        self.cnn = cnn
     #   self.cnn_comp_features = cnn_comp_features
        
        self.vocab_size = len(words2ids)
        self.word_embedding_size = word_embedding_size
        self.word_embedding = word_embedding
        
        self.words2ids = words2ids
        self.ids2words = ids2words
        
        self.start_symbol = start_symbol
        self.start_symbol_embed = torch.from_numpy(self.word_embedding[self.start_symbol])
        
        self.end_symbol = end_symbol
        self.end_symbol_embed = torch.from_numpy(self.word_embedding[self.end_symbol])
        
#         self.sentence_end_symbol = sentence_end_symbol
#         self.sentence_end_symbol_id = self.words2ids[self.sentence_end_symbol]
        
#         if sentence_end_embed is not None:
#             self.sentence_end_embed = sentence_end_embed
#         else:
#             self.sentence_end_embed = word_embeding['.']
        
        #self.max_sentence_len = max_sentence_len
        
        
        self.lstm_hidden_size = lstm_hidden_size
        
        self.fc1 = nn.Sequential( nn.BatchNorm1d(self.image_features_size),
                                  nn.Linear(self.image_features_size, int(self.image_features_size/2)),
                                  nn.Dropout(0.001), 
                                  nn.ReLU(),
                                  nn.Linear(int(self.image_features_size/2), int(self.image_features_size/4) ),
                                  nn.Dropout(0.001),
                                  nn.ReLU(),
                                  nn.Linear(int(self.image_features_size/4), self.lstm_hidden_size),
                                  nn.BatchNorm1d(self.lstm_hidden_size)
                                )
        
        self.fc2 = nn.Sequential(#nn.Linear(self.lstm_hidden_size, self.vocab_size),
                                  nn.LogSoftmax()
                                )
        
                               
        self.lstm_cell = nn.LSTMCell(self.lstm_hidden_size + self.word_embedding_size, self.vocab_size)
        
        #self.lstm = nn.LSTM(self.lstm_hidden_size , word_embedding_size)
    
        
    
    def freeze_cnn(self):
        for param in self.cnn.parameters():
            param.requires_grad = False
    
    def unfreeze_cnn(self):
        for param in self.cnn.parameters():
            param.requires_grad = True
            
    def ids_to_embed(self, word_ids):
        result = []
        for i in range(word_ids.shape[0]):
            w = self.ids2words[word_ids[i]]
            emb = torch.from_numpy(self.word_embedding[w]).float()
            result.append(emb)
            
        return torch.stack(result, 1)
        
            
            
    def forward(self, X, max_sentence_len):
        batch_size = X.shape[0]
        X = self.cnn(X)
        X = X.view(batch_size, self.image_features_size)
        
        # prevWord = START_SYMBOL
        prevWord = Variable(self.start_symbol_embed.repeat(batch_size, 1), requires_grad=False)
        

        
        lstm_input = torch.cat([X, prevWord], dim = 1)
        
        result = []
        
        result.append(prevWord)
        
        h_t = Variable(torch.zeros(batch_size, self.lstm_hidden_size), requires_grad=False)
        c_t = Variable(torch.zeros(batch_size, self.lstm_hidden_size), requires_grad=False)
        
        print(lstm_input)
        
        for idx in range(max_sentence_len):
            h_t, c_t = self.lstm_cell.forward(lstm_input, (h_t, c_t))
            probs = self.fc2.forward(h_t)
            top_word_ids = probs.max(2)[1]
            embeds = self.ids_to_embed(top_word_ids)
            
            lstm_input = torch.cat([X, prevWord], dim = 1)
            result.append(probs)
        
        return torch.stack(result, 1)
        
    
#     def forward_old(self, X):
#         # get features from images
#         batch_size = X.shape[0]
#         #print("1: " ,X.shape)
#         X = self.cnn(X)
#         #X = X.cuda(gpu_device)
        
#         #print("2: ",X.shape)
#         X = X.view(batch_size, self.image_features_size)
        
    
#         h_t = Variable(torch.zeros(batch_size, self.lstm_hidden_size), requires_grad=False)
#         c_t = Variable(torch.zeros(batch_size, self.lstm_hidden_size), requires_grad=False)
        
#         prevWord = Variable(torch.zeros(batch_size, self.lstm_hidden_size), requires_grad=False)
        
#         X = self.fc1.forward(X)
        
#         X = torch.cat([X, prevWord])
        
#         output, hidden = self.lstm.forward()
        
#         h_t, c_t = 
        
#         h_t, c_t = self.lstm_cell.forward(X, (h_t, c_t))
        
#         output = []
#         for idx in range(self.max_sentence_len):
#             h_t, c _t = self.lstm_cell.forward(X, (h_t, c_t))
            
#             r = self.fc2.forward(h_t)
            
#             #logits = nn.LogSoftmax(r).max(2)[1]
            
#             output.append(r)
        
#         output = torch.stack(output, 1)
#         return output

SyntaxError: Missing parentheses in call to 'print'. Did you mean print(print lstm_input)? (<ipython-input-123-dc98c51dc83e>, line 119)

In [72]:
# image_size, image_features_size, word_embedding, words2ids, ids2words,
#                  lstm_hidden_size = 4096,
#                  word_embedding_size = 300, 

In [73]:


#word_embeding_size = 1024#word_embeding.trainables.layer1_size
#sentence_end_embed = 1#word_embeding[DEF_SEND]
# cnn = models.alexnet(pretrained=True).features
#sentence_end_symbol = DEF_SEND
#max_sentence_len = 20

In [117]:
image_size = 224
image_features_size = get_alexnet_features_dim(image_size)

lstmnet = LSTM_W2V_Net(image_size, image_features_size , 
                       word_embeding, words2ids, ids2words, 
                       lstm_hidden_size = 4096, 
                       word_embedding_size= word_embeding.layer1_size)

  import sys


In [124]:
cnn_model  = models.alexnet(pretrained=True).features

In [130]:
cnn_model(Variable(trainDataset[0]['image'].unsqueeze(0))).view(-1)

Variable containing:
  0.0000
  0.0000
  3.0473
   ⋮    
  0.0000
  0.0000
  0.0000
[torch.FloatTensor of size 9216]

In [118]:
trainDataLoader_2 = DataLoader(trainDataset, batch_size = 2, shuffle=True)
for sample in trainDataLoader_2:
    break


In [119]:
sample['ann_len'].max()

16

In [120]:
ann_ids = load_anns(trainDataset, sample['anns'], sample['ann_len'].max(), prepare=lambda w: words2ids[w])

In [121]:
ann_ids[0]


  4516
  6507
  9543
 19828
 11328
 16342
 17629
 15361
 20535
 15190
  4602
 16342
  8273
  7411
 20238
 16048
[torch.FloatTensor of size 16]

In [14]:
# optimizer = torch.optim.Adam(lstmnet.parameters(), lr=0.001)
# optimizer.zero_grad()

lstmnet.freeze_cnn()

X = sample['image']
X = Variable(X)

ann_ids = sample['anns']

batch_size = X.shape[0]
max_len = sample['ann_len'].max()

y = load_anns(trainDataset, ann_ids, max_len, prepare=lambda w: words2ids[w])

pred = lstmnet.forward(X, max_len)
pred

#loss = nn.NLLLoss2d()(pred.view(pred.shape[0]*pred.shape[1], pred.shape[2]), y.view(-1))


# loss.backward()
# optimizer.step()

NameError: name 'lstmnet' is not defined

Variable containing:
 0.6988
[torch.cuda.FloatTensor of size 1 (GPU 3)]

In [91]:
# y.shape

In [92]:
# pred.view(5,20* 29559)

In [20]:
#models.alexnet(pretrained=True).features(Variable(sample['image']))

In [147]:
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'best_'+filename)
        
def open_checkpoint(is_best = False, filename='checkpoint.pth.tar'):
    if is_best:
        filename = 'best_'+filename
        
    checkpoint = torch.load(filename)
    return checkpoint
#     best_prec1 = checkpoint['best_prec1']
#     model.load_state_dict(checkpoint['state_dict'])
#     optimizer.load_state_dict(checkpoint['optimizer'])

In [149]:
# TODO train procedure
def train(network, train_dataloader, test_dataloader,
          epochs,  unfreeze_cnn_epoch = None,
          loss = nn.NLLLoss().cuda(gpu_device), optim=torch.optim.Adam ):
    
    if unfreeze_cnn_epoch is None:
        unfreeze_cnn_epoch = int(0.75 * epochs)
    
    train_loss_epochs = []
    test_loss_epochs = []
    optimizer = optim(network.parameters(), lr=0.001)
    best_test_score = 10**6
    
    network.freeze_cnn()
    
    try:
        for epoch in range(epochs):
            sheduler.step()
            if epoch >= unfreeze_cnn_epoch:
                network.unfreeze_cnn()

            losses = []
            accuracies = []
            for sample in train_dataloader:
                X = sample['image']
                X = Variable(X)
                y = sample['anns']
                
                y = 
                
                # одно изображение - одно предложение
                
                y = Variable(y)
                
                
                prediction = network(X)
                prediction = nn.LogSoftmax(prediction).max(2)[1]
                
                loss_batch = loss(prediction, y)
                losses.append(loss_batch.data[0])
                
                optimizer.zero_grad()
                loss_batch.backward()
                optimizer.step()
  
            train_loss_epochs.append(np.mean(losses))
            losses = []
            for sample in test_dataloader:
                X = sample['image']
                X = Variable(X)
                y = sample['anns']
                
                y = Variable(y)
                
                prediction = network(X)
                loss_batch = loss(prediction, y)
                losses.append(loss_batch.data[0])
                
            test_loss_epochs.append(np.mean(losses))
            
            is_best = test_loss_epochs[-1] < best_test_score
            best_test_score = min(test_loss_epochs[-1], best_test_score)
            save_checkpoint({
                            'epoch': epoch + 1,
                            'state_dict': network.state_dict(),
                            'best_test_score': best_test_score,
                            'optimizer' : optimizer.state_dict(),
                            }, is_best)
                
            
            sys.stdout.write('\rEpoch {0}... (Train/Test) MSE: {1:.3f}/{2:.3f}'.format(
                        epoch, train_loss_epochs[-1], test_loss_epochs[-1]))
    except KeyboardInterrupt:
        pass
    plt.figure(figsize=(12, 5))
    plt.plot(train_loss_epochs[1:], label='Train')
    plt.plot(test_loss_epochs[1:], label='Test')
    plt.xlabel('Epochs', fontsize=16)
    plt.ylabel('Loss', fontsize=16)
    plt.legend(loc=0, fontsize=16)
    plt.grid('on')
    plt.show()