In [12]:
from distance import *
import numpy as np
import csv

In [2]:
# Load the model and its two vocabulary lookups; all three are passed to
# compare_wordpairs in the comparison cells below.
# NOTE(review): generate_default is presumably supplied by the star import
# from `distance` -- confirm.
W, vocab, ivocab = generate_default()

In [3]:
def compare_wordpairs(model, vocab, ivocab, wordpairs):
    """
    Score every word pair in `wordpairs` with `distance`.

    Each element of `wordpairs` must unpack into exactly two items
    (word1, word2). `model`, `vocab` and `ivocab` are forwarded to
    `distance` unchanged.

    Returns a new list, in input order, of tuples of the form:

    (word1, word2, cos(theta))

    where cos(theta) is the value returned by
    distance(model, vocab, ivocab, word1, word2).
    """
    return [
        (first, second, distance(model, vocab, ivocab, first, second))
        for first, second in wordpairs
    ]

In [4]:
# Words are given in stemmed/lemmatized form so that they match how the
# training data was stemmed (NLTK Snowball stemmer), e.g. "juic", "bottl".

# Pairs treated as semantically related (10 pairs).
related_word_pairs = [
    ("foot", "hand"),
    ("stroller", "car"),
    ("juic", "milk"),
    ("mouth", "nose"),
    ("book", "ball"),
    ("blanket", "diaper"),
    ("bottl", "spoon"),
    ("dog", "baby"),
    ("foot", "sock"),
    ("cooki", "banana"),
]

# Pairs treated as semantically unrelated, written in the same stemmed form.
# Note: this list holds 11 pairs whereas related_word_pairs holds 10.
unrelated_word_pairs = [
    ("sock", "juic"),
    ("milk", "foot"),
    ("banana", "nose"),
    ("mouth", "cooki"),
    ("blanket", "dog"),
    ("baby", "spoon"),
    ("book", "diaper"),
    ("juic", "car"),
    ("nose", "bottl"),
    ("hand", "stroller"),
    ("mouth", "ball"),
]

In [5]:
def average_cosine(cosines):
    """
    Return the mean of the third element (index 2) of every entry
    in `cosines`.

    `cosines` is a sized sequence of indexable results such as the
    (word1, word2, cos(theta)) tuples produced by compare_wordpairs.
    An empty sequence raises ZeroDivisionError.
    """
    total = 0
    for entry in cosines:
        total += entry[2]
    return total / len(cosines)

In [6]:
# Score every related pair; the bare name on the last line displays the
# resulting list of (word1, word2, cosine) tuples as the cell output.
related_cosines = compare_wordpairs(W, vocab, ivocab, related_word_pairs)
related_cosines

[('foot', 'hand', 0.72323122722355138),
 ('stroller', 'car', 0.17299204253285755),
 ('juic', 'milk', 0.7832319075879608),
 ('mouth', 'nose', 0.76357200982325046),
 ('book', 'ball', 0.41761376366592895),
 ('blanket', 'diaper', 0.50942761827854999),
 ('bottl', 'spoon', 0.54258440559238152),
 ('dog', 'baby', -0.28228422563518762),
 ('foot', 'sock', 0.62634813595117356),
 ('cooki', 'banana', 0.50183917516606724)]

In [7]:
# Score every unrelated pair the same way and display the result.
unrelated_cosines = compare_wordpairs(W, vocab, ivocab, unrelated_word_pairs)
unrelated_cosines

[('sock', 'juic', 0.2574629517367607),
 ('milk', 'foot', 0.16402779195825798),
 ('banana', 'nose', 0.13372606560290543),
 ('mouth', 'cooki', 0.44283698150016793),
 ('blanket', 'dog', 0.34442949677196177),
 ('baby', 'spoon', -0.12844652919609426),
 ('book', 'diaper', 0.24579499512830408),
 ('juic', 'car', 0.32906757872266612),
 ('nose', 'bottl', 0.25367195606986098),
 ('hand', 'stroller', 0.069228655563472977),
 ('mouth', 'ball', 0.35252540845220826)]

In [8]:
# Mean cosine over the related pairs.
average_cosine(related_cosines)

0.47585560601865334

In [9]:
# Mean cosine over the unrelated pairs, for comparison with the related mean.
average_cosine(unrelated_cosines)

0.22402957748277019

In [13]:
# Write the results out to CSV: one header row, then one row per word pair
# tagged with the list it came from ("related" or "unrelated").
import sys

# csv.writer expects a binary-mode file on Python 2, but on Python 3 it
# writes str and needs a text-mode file opened with newline="" (a file
# opened with "wb" raises TypeError on the first writerow there).
if sys.version_info[0] >= 3:
    output = open("glove_cosines.csv", "w", newline="")
else:
    output = open("glove_cosines.csv", "wb")

# The file object is its own context manager, so it is closed even if a
# write fails part-way through.
with output:
    writer = csv.writer(output)
    writer.writerow(["type", "word1", "word2", "cosine"])
    # Related rows first, then unrelated rows, each in list order.
    for label, cosines in (("related", related_cosines),
                           ("unrelated", unrelated_cosines)):
        for pair in cosines:
            writer.writerow([label] + list(pair))