In [None]:
from PIL import Image
from scipy.misc import imread, imresize
from nltk.corpus import stopwords

import gensim
import numpy as np
import vist
import matplotlib.pyplot as plt
import pickle


# English stop words, stored as a set so the per-term membership checks done
# while embedding captions are constant-time instead of a linear list scan.
# NOTE: this rebinds the imported `stopwords` module name to the word
# collection, so the import above must be re-executed before this line can
# be run again.
stopwords = set(stopwords.words('english'))

In [None]:
def get_sentences(sentences_ids):
    """
    Map every sentence id to the text of its sentence.

    `sentences_ids` is a mapping from sentence id to a record that has a
    'text' entry; the result is a plain dict {sentence id: text}.
    """
    return {
        sent_id: sentences_ids[sent_id]['text']
        for sent_id in sentences_ids
    }
    
    
    
def multiple_terms_to_vec(caption, model):
    """
    Builds a uniform representation for the terms of a caption
    by averaging the corresponding word vectors.

    The caption is split on whitespace. A term contributes only if it is
    present in `model` and is not in the module-level `stopwords`
    collection; both checks are exact string matches.

    Args:
        caption: text to embed.
        model: object supporting `term in model` and `model[term]`, where
            `model[term]` is a 1-D numeric vector (e.g. gensim KeyedVectors).

    Returns:
        A float64 numpy array holding the mean of the selected word vectors.
        When no term qualifies, a zero vector is returned whose length is
        `model.vector_size` if the model exposes it, otherwise 300.
    """

    # word vectors of the terms that are usable
    vectors = [
        model[term]
        for term in caption.split()
        if term in model and term not in stopwords
    ]

    if not vectors:
        # nothing to average: fall back to a zero vector of the model's size
        return np.zeros(getattr(model, "vector_size", 300))

    # accumulate in float64, sized from the vectors themselves rather than
    # a hard-coded dimension
    sum_of_vecs = np.zeros(len(vectors[0]))
    for vec in vectors:
        sum_of_vecs += vec

    return sum_of_vecs / len(vectors)



def store_sentences2vec(path, sentences, w2v_model):
    """
    Write one averaged word vector per sentence to a text file.

    The file starts with a header line "<number of sentences> <dimension>",
    followed by one line per sentence: the sentence id and the components of
    its vector, separated by single spaces.

    Args:
        path: destination file; overwritten if it already exists.
        sentences: dict mapping sentence id (str) to sentence text.
        w2v_model: word-vector model handed to multiple_terms_to_vec.
    """

    # The file is opened once for the whole export instead of being
    # re-opened in append mode for every sentence.
    with open(path, "w") as text_file:
        dimension = None

        for key, value in sentences.items():
            vec_c = multiple_terms_to_vec(value, w2v_model)

            if dimension is None:
                # The header reports the actual vector length, so it stays
                # correct for models that are not 300-dimensional.
                dimension = len(vec_c)
                text_file.write(str(len(sentences)) + " " + str(dimension) + "\n")

            text_file.write(key + " " + " ".join(str(i) for i in vec_c) + "\n")

        if dimension is None:
            # No sentences at all: still emit a header line.
            text_file.write("0 300\n")
        


        
def demo_visualize_sentences2imgs(path, sentence_1, sentence_2, sentences_id):
    """
    Display, one after the other, the images attached to two sentences.

    Each figure is titled with the image file name and the sentence text.

    Args:
        path: directory that contains the "<img_id>.jpg" files.
        sentence_1: id of the first sentence to show.
        sentence_2: id of the second sentence to show.
        sentences_id: mapping from sentence id to a record that has
            'img_id' and 'text' entries.
    """
    import os

    for sentence_id in (sentence_1, sentence_2):
        # Read from the mapping that was passed in, not from a notebook
        # global, so the function works with any sentence collection.
        sentence = sentences_id[sentence_id]

        # os.path.join gives the same result for a path that ends with a
        # separator and also works when the separator is missing.
        filename = os.path.join(path, sentence['img_id'] + ".jpg")

        # Decode with PIL: scipy.misc.imread is no longer available in
        # SciPy >= 1.2. convert() loads the pixels, so the file can be
        # closed before plotting.
        with Image.open(filename) as image_file:
            img = image_file.convert('RGB')

        plt.imshow(img)
        plt.axis('off')
        plt.title(filename + " - " + sentence['text'])
        plt.show()
    
    
    
    

# Get the sentences and create the sentence2vec representations

In [None]:
print ("Getting the sentences id...............")

# Forward slashes are used for the image directory: they work on Windows and
# POSIX alike, and avoid the invalid '\i' escape sequence that a backslash
# inside a normal string literal produces.
vist_images_dir = 'data/images'
vist_annotations_dir = 'data'
dataset_type = 'val'

sis = vist.Story_in_Sequence(vist_images_dir, vist_annotations_dir, [dataset_type])


print ("Getting the sentences...............")

# {sentence id: sentence text}
sentences = get_sentences(sis.Sents)


print ("Creating the sentences2vec representations and store them in file...............")

# Load Google's pre-trained Word2Vec model.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin', binary=True)

# One averaged word vector per sentence, written in word2vec text format.
path = dataset_type+"_sentences.txt"
store_sentences2vec(path, sentences, w2v_model)

# Examples

In [None]:
# Read the stored sentence vectors back in as a KeyedVectors model.
# Note that this reuses the name `w2v_model`: from here on its keys are
# sentence ids, not words.

dataset_type = 'val'

print ("Loading the sentences2vec model...............")

w2v_model = gensim.models.KeyedVectors.load_word2vec_format(
    dataset_type+"_sentences.txt",
    binary=False,
)



In [None]:
# load sentences from Vist

print ("Getting the sentences id...............")

# Forward slashes are used for the image directory: they work on Windows and
# POSIX alike, and avoid the invalid '\i' escape sequence that a backslash
# inside a normal string literal produces.
vist_images_dir = 'data/images'
vist_annotations_dir = 'data'
dataset_type = 'val'

sis = vist.Story_in_Sequence(vist_images_dir, vist_annotations_dir, [dataset_type])

In [None]:
# Query the sentence model with the first sentence id of the VIST split.
# The result is a list of (sentence id, similarity) pairs, best match first
# (gensim's most_similar returns the top 10 by default).

first_sentence_id = list(sis.Sents)[0]
most_similar = w2v_model.most_similar(first_sentence_id)

# id of the best match
closest_sentence_id = most_similar[0][0]


# Show the query sentence followed by its closest neighbour.

print ("The input sentence with sentence id "+first_sentence_id+
       " is the following : \n\n"+ sis.Sents[first_sentence_id]['text'])

print ("\n\nThe most similar sentence is the sentence with sentence id "
       +closest_sentence_id+" :\n\n"+ sis.Sents[closest_sentence_id]['text'])



In [None]:
# Show the image behind the query sentence and the image behind its best match.

sentence_1_id = list(sis.Sents)[0]
sentence_2_id = most_similar[0][0]

# directory holding the images of the validation split
path ='data/images/val/'

demo_visualize_sentences2imgs(
    path=path,
    sentence_1=sentence_1_id,
    sentence_2=sentence_2_id,
    sentences_id=sis.Sents,
)