In [1]:
import word2vec
import numpy as np

In [2]:
# Phrase pass over the stemmed CHILDES word file. The second argument,
# "childes_phrases", is the file the next cell trains on (presumably the
# output path of this call -- confirm against the word2vec package docs).
word2vec.word2phrase("CHILDES_words_stemmed.txt", "childes_phrases", verbose=True)

Starting training using file CHILDES_words_stemmed.txt
Words processed: 9900K     Vocab size: 1032K  
Vocab size (unigrams + bigrams): 608615
Words in train file: 9916039


In [3]:
# Train embeddings on the phrase file; "childes.bin" is the file loaded in
# the following cell. size=100 is presumably the vector dimensionality
# (confirm against the word2vec package docs).
word2vec.word2vec("childes_phrases", "childes.bin", size=100, verbose=True)

Starting training using file childes_phrases
Vocab size: 18200
Words in train file: 9607185
Alpha: 0.000002  Progress: 100.08%  Words/thread/sec: 394.25k  

In [4]:
# Load the trained vectors. Later cells index this object as model[word]
# to obtain a word's vector.
model = word2vec.load("childes.bin")

In [5]:
def compare_words(model, a, b):
    """Return the cosine similarity between the vectors of words `a` and `b`.

    The notebook reports these scores as cos(theta). A bare dot product only
    equals the cosine when both vectors already have unit length, so the
    vectors are normalised explicitly here instead of relying on whatever
    normalisation the loader may or may not have applied.

    Parameters
    ----------
    model : mapping-like
        Anything supporting ``model[word]`` and returning a 1-D numeric vector.
    a, b : str
        The two words to compare (in the same stemmed form as the vocabulary).

    Returns
    -------
    float
        Cosine of the angle between the two vectors, or 0.0 if either vector
        has zero length (the cosine is undefined in that case).

    Raises
    ------
    Whatever ``model[word]`` raises for a word that is not in the vocabulary.
    """
    vec_a = np.asarray(model[a])
    vec_b = np.asarray(model[b])
    denom = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    if denom == 0:
        # Avoid a division by zero / NaN for degenerate all-zero vectors.
        return 0.0
    return np.dot(vec_a, vec_b) / denom

In [6]:
def compare_wordpairs(model, wordpairs):
    """Score each (a, b) pair in `wordpairs` with `compare_words`.

    Parameters
    ----------
    model : object
        Passed through unchanged to `compare_words`.
    wordpairs : iterable of 2-tuples
        The word pairs to score, in the order they should be reported.

    Returns
    -------
    list of (a, b, score) tuples, one per input pair, in input order.
    """
    return [
        (first, second, compare_words(model, first, second))
        for first, second in wordpairs
    ]
        

In [7]:
# Inputs are stemmed/lemmatized (e.g. "juic", "cooki", "bottl"), presumably
# so that they match the tokens of the stemmed training corpus.

# Word pairs treated as semantically related (10 pairs).
related_word_pairs = [
    ("foot", "hand"),
    ("stroller", "car"),
    ("juic", "milk"),
    ("mouth", "nose"),
    ("book", "ball"),
    ("blanket", "diaper"),
    ("bottl", "spoon"),
    ("dog", "baby"),
    ("foot", "sock"),
    ("cooki", "banana")
    
]

# The same words re-paired so that the members of each pair are treated as
# unrelated. NOTE: this list has 11 pairs versus 10 above; each average is
# taken over its own list length.
unrelated_word_pairs = [
    ("sock", "juic"),
    ("milk", "foot"),
    ("banana", "nose"),
    ("mouth", "cooki"),
    ("blanket", "dog"),
    ("baby", "spoon"),
    ("book", "diaper"),
    ("juic", "car"),
    ("nose", "bottl"),
    ("hand", "stroller"),
    ("mouth", "ball")
]

In [8]:
# Score every related pair; each entry is an (a, b, score) tuple.
related_cosines = compare_wordpairs(model, related_word_pairs)
related_cosines

[('foot', 'hand', 0.56778342154421191),
 ('stroller', 'car', 0.44188255887566497),
 ('juic', 'milk', 0.70017456914711618),
 ('mouth', 'nose', 0.66798379479060188),
 ('book', 'ball', 0.28796293134357476),
 ('blanket', 'diaper', 0.5355184516920084),
 ('bottl', 'spoon', 0.45433028940142123),
 ('dog', 'baby', 0.046064328756865239),
 ('foot', 'sock', 0.56756271901978494),
 ('cooki', 'banana', 0.51682242373312681)]

In [9]:
# Score every unrelated pair; each entry is an (a, b, score) tuple.
unrelated_cosines = compare_wordpairs(model, unrelated_word_pairs)
unrelated_cosines

[('sock', 'juic', 0.17634150647380531),
 ('milk', 'foot', 0.0035031428200891293),
 ('banana', 'nose', 0.15245716360294184),
 ('mouth', 'cooki', 0.26498572964811606),
 ('blanket', 'dog', 0.20387685738549116),
 ('baby', 'spoon', -0.068040768583578271),
 ('book', 'diaper', 0.28924283146169399),
 ('juic', 'car', 0.17786088384688709),
 ('nose', 'bottl', 0.30409555029466739),
 ('hand', 'stroller', 0.30985857725999399),
 ('mouth', 'ball', 0.18538889276084816)]

In [10]:
# Mean similarity score over the related pairs (index 2 of each result tuple
# is the score).
related_avg_cos = sum(result[2] for result in related_cosines)/len(related_cosines)
# Single-argument print(...) behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("average related cos(theta): {}".format(related_avg_cos))

average related cos(theta): 0.47860854883


In [11]:
# Mean similarity score over the unrelated pairs (index 2 of each result
# tuple is the score).
unrelated_avg_cos = sum(result[2] for result in unrelated_cosines)/len(unrelated_cosines)
# The label previously read "average related", a copy-paste slip that made
# the two summary lines indistinguishable. Single-argument print(...) works
# on both Python 2 and Python 3.
print("average unrelated cos(theta): {}".format(unrelated_avg_cos))

average related cos(theta): 0.18177912427
