In [5]:
import word2vec
import numpy as np

In [6]:
# Run word2phrase over the stemmed CHILDES word file and write the result
# to "childes_phrases", which the training cell below uses as its input.
word2vec.word2phrase("CHILDES_words_stemmed.txt", "childes_phrases", verbose=True)

Starting training using file CHILDES_words_stemmed.txt
Words processed: 9900K     Vocab size: 1032K  
Vocab size (unigrams + bigrams): 608615
Words in train file: 9916039


In [7]:
# Train word vectors on the phrase file and save them to "childes.bin".
# size=100 is presumably the vector dimensionality -- confirm against the
# word2vec package documentation.
# NOTE(review): this retrains on every run and no seed is visible here, so
# the similarity values further down may not reproduce exactly -- confirm.
word2vec.word2vec("childes_phrases", "childes.bin", size=100, verbose=True)

Starting training using file childes_phrases
Vocab size: 18200
Words in train file: 9607185
Alpha: 0.000002  Progress: 100.08%  Words/thread/sec: 380.40k  

In [8]:
# Load the vectors written to "childes.bin" by the training cell; "model"
# is indexed by word (model[word]) in compare_words below.
model = word2vec.load("childes.bin")

In [9]:
def compare_words(model, a, b):
    """
    Cosine similarity between the vectors stored for two words.

    a and b are string keys into "model"; their vectors are looked up
    and combined with a dot product. No normalisation happens here:
    the result equals cos(theta) only because the stored vectors are
    expected to already have unit length.
    """
    vector_a = model[a]
    vector_b = model[b]
    return np.dot(vector_a, vector_b)

In [10]:
def compare_wordpairs(model, wordpairs):
    """
    Scores every word pair in "wordpairs" with compare_words.

    wordpairs is an iterable of 2-tuples of words. The return value
    is a list, in the same order, of 3-tuples:

    (word1, word2, cos(theta))

    """
    return [
        (first, second, compare_words(model, first, second))
        for first, second in wordpairs
    ]
        

In [11]:
# Inputs are stemmed/lemmatized, so some entries are stems rather than
# full words (e.g. "juic", "cooki", "bottl").
# NOTE(review): presumably each entry has to match the model's vocabulary
# exactly, since compare_words looks it up with model[word] -- confirm.

# Word pairs treated as related (10 pairs).
related_word_pairs = [
    ("foot", "hand"),
    ("stroller", "car"),
    ("juic", "milk"),
    ("mouth", "nose"),
    ("book", "ball"),
    ("blanket", "diaper"),
    ("bottl", "spoon"),
    ("dog", "baby"),
    ("foot", "sock"),
    ("cooki", "banana")
    
]

# Word pairs treated as unrelated, built by re-pairing the same words that
# appear in related_word_pairs.
# NOTE(review): this list has 11 pairs while related_word_pairs has 10, so
# the two averages are taken over different sample sizes.
unrelated_word_pairs = [
    ("sock", "juic"),
    ("milk", "foot"),
    ("banana", "nose"),
    ("mouth", "cooki"),
    ("blanket", "dog"),
    ("baby", "spoon"),
    ("book", "diaper"),
    ("juic", "car"),
    ("nose", "bottl"),
    ("hand", "stroller"),
    ("mouth", "ball")
]

In [12]:
def average_cosine(cosines):
    """
    Returns the arithmetic mean of the cosine values in "cosines".

    cosines is an iterable of tuples of the form
    (word1, word2, cos(theta)), as returned by compare_wordpairs;
    only the third element of each tuple is used. Any iterable is
    accepted, including generators, which have no len().

    Raises ZeroDivisionError if cosines is empty, since the mean of
    no values is undefined.
    """
    # Collect the values first so the count comes from what was
    # actually summed instead of from len(cosines).
    values = [result[2] for result in cosines]
    if not values:
        # Same exception type as the bare division would raise, but
        # with a message that names the real problem.
        raise ZeroDivisionError("average_cosine() needs at least one result")
    return sum(values) / len(values)

In [13]:
# Score every related pair; the bare name on the last line displays the
# resulting list of (word1, word2, cos(theta)) tuples.
related_cosines = compare_wordpairs(model, related_word_pairs)
related_cosines

[('foot', 'hand', 0.55069966145371874),
 ('stroller', 'car', 0.43117714030997534),
 ('juic', 'milk', 0.7109627977088071),
 ('mouth', 'nose', 0.69362326696385701),
 ('book', 'ball', 0.29073856030497403),
 ('blanket', 'diaper', 0.52490226748063895),
 ('bottl', 'spoon', 0.47165861800515274),
 ('dog', 'baby', 0.048514650707260847),
 ('foot', 'sock', 0.56849175319594702),
 ('cooki', 'banana', 0.52113984500368349)]

In [14]:
# Score every unrelated pair; the bare name on the last line displays the
# resulting list of (word1, word2, cos(theta)) tuples.
unrelated_cosines = compare_wordpairs(model, unrelated_word_pairs)
unrelated_cosines

[('sock', 'juic', 0.1762098996508587),
 ('milk', 'foot', 0.017896816560221504),
 ('banana', 'nose', 0.11117093069754369),
 ('mouth', 'cooki', 0.29515949890897475),
 ('blanket', 'dog', 0.221154754802494),
 ('baby', 'spoon', -0.084157055534362715),
 ('book', 'diaper', 0.29024004450549301),
 ('juic', 'car', 0.16472713773679337),
 ('nose', 'bottl', 0.29865475501722305),
 ('hand', 'stroller', 0.32651667681896479),
 ('mouth', 'ball', 0.19941315916092889)]

In [15]:
# Mean cosine over the related pairs.
average_cosine(related_cosines)

0.48119085611340157

In [17]:
# Mean cosine over the unrelated pairs, for comparison with the related mean.
average_cosine(unrelated_cosines)

0.18336241984773938