In [4]:
from cogworks_data.language import get_data_path

from pathlib import Path
import json

# load COCO metadata
filename = get_data_path("captions_train2014.json")
with Path(filename).open() as f:
    coco_data = json.load(f)

In [20]:
from gensim.models import KeyedVectors
filename = "glove.6B.200d.txt.w2v"

# this takes a while to load -- keep this in mind when designing your capstone project
glove = KeyedVectors.load_word2vec_format(get_data_path(filename), binary=False)

In [6]:
from collections import Counter
import numpy as np
import re, string

punc_regex = re.compile('[{}]'.format(re.escape(string.punctuation)))

def strip_punc(corpus):
    return punc_regex.sub('', corpus)


def tokenize(caption):
    caption = strip_punc(caption)
    return caption.lower().split()

In [8]:
from collections import Counter
import math


In [9]:
bigCount = Counter()
captions = 0

for caption_info in coco_data["annotations"]:
    captions +=1
    caption_vocab = set(tokenize(caption_info["caption"]))
    bigCount.update(caption_vocab)
        

In [10]:
def IDFs(counter):
    """ 
    
    
    Parameters
    ----------
    vocab: List[str]
        An alphabetically-sorted list of all of the unique words in `counters`
        
    counters : Iterable[collections.Counter]
        An iterable containing {word -> count} counters for respective
        documents.
    
    Returns
    -------
    IDF: float
    
    """
    vocab = bigCount.keys()
    IDFs = {}
    for word in vocab:
        IDFs[word] = math.log10(captions/bigCount[word])
    
    return IDFs

In [11]:
IDFs(bigCount)

{'well': 2.835363489609015,
 'a': 0.05620842125384329,
 'clean': 2.6571240259330677,
 'empty': 2.2889432028231615,
 'decorated': 2.52756698137503,
 'very': 1.95436103257991,
 'and': 0.6532366353327067,
 'bathroom': 1.6168149644766716,
 'panoramic': 4.219178855589447,
 'kitchen': 1.6518701132943632,
 'appliances': 2.7738860861634747,
 'view': 2.0459925871771723,
 'of': 0.5102109700261435,
 'all': 2.3523010412519474,
 'its': 1.8651472895251568,
 'themed': 3.7086338453828342,
 'butterfly': 3.9639063504861403,
 'blue': 1.644545783334929,
 'with': 0.6027495568736877,
 'tiles': 3.266870845927321,
 'white': 1.227474433182005,
 'wall': 1.9409741839052779,
 'dining': 2.5980025738144112,
 'photo': 2.023500556131948,
 'room': 1.512154018733661,
 'stop': 1.979929442112722,
 'car': 1.9286990422587733,
 'graffitied': 4.537937618213859,
 'red': 1.5428877262411584,
 'the': 0.5598065768272859,
 'from': 1.8053428426585802,
 'across': 2.215546018585038,
 'sign': 1.6592722305533338,
 'street': 1.308597135

In [16]:
from numpy import linalg

def normalize(unnorm_embed):
    return embed/linalg.norm(embed, axis = 1)
     

In [23]:
def query_embed(query):
    embed = np.zeros(200,)
    tokens = tokenize(query)
    try:
        embed = np.sum(IDFs(bigCount)[word]*glove[word] for word in tokens)
        return normalize(embed)
    except:
        return embed

In [24]:
query_embed("a beach")

  embed = np.sum(IDFs(bigCount)[word]*glove[word] for word in tokens)


array([ 0.02438804,  0.26728424, -1.7203379 , -1.5810325 , -0.39065558,
       -0.22768249,  0.11011775, -0.48350614,  0.75438267, -1.6636043 ,
        0.30651885,  0.314019  ,  1.0048676 ,  0.47295687,  0.6385235 ,
        0.6806363 , -0.7926717 , -0.373882  , -0.15378423,  0.7450748 ,
       -0.5251249 ,  2.8194923 ,  0.8287592 ,  1.3001536 ,  1.1097711 ,
        0.16879466, -0.03542776, -0.17019661,  0.19395636, -0.90177315,
       -0.132756  ,  0.54465246, -0.64342487, -0.13407487, -0.71983284,
        0.8272356 ,  0.30532524,  0.89576834,  1.1021869 , -0.19338354,
        0.06429003,  0.44393677, -0.19121608,  0.29087716,  0.07225399,
        1.0827458 ,  1.1465335 ,  0.18960668, -0.11927484,  1.7450455 ,
       -0.5958077 , -1.2457315 , -0.36214015,  0.63413227,  0.40753612,
       -0.6246448 , -0.06934593,  0.38540742, -0.12747464, -0.1469762 ,
        0.7562299 ,  0.31943348, -1.4598258 ,  1.0781424 , -0.40651774,
       -0.43021497, -0.21083024,  0.28042692,  1.0785583 ,  0.80