Source tutorial followed in this notebook: http://rare-technologies.com/deep-learning-with-word2vec-and-gensim/

In [20]:
# import modules and set up logging
from gensim.models import word2vec
import logging, os

In [21]:
# Directory holding the corpus and the saved models.
# A DATA_DIR already present in the environment is honoured; the original
# host-specific path is only used as a fallback when it is unset or empty.
# os.path.join(..., "") guarantees a trailing separator, because later cells
# build file paths as `data_dir + "<filename>"`.
data_dir = os.path.join(
    os.environ.get("DATA_DIR")
    or "/home/jupyterhub/hostdir/docker-jupyterhub/experiments/data/",
    "",
)
# Keep the environment variable in sync with the value actually used.
os.environ["DATA_DIR"] = data_dir

In [22]:
# Configure root logging at INFO level; every record is rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [33]:
# Wrap the unzipped text8 corpus (download: http://mattmahoney.net/dc/text8.zip)
# in a Text8Corpus object that is handed to the trainer below.
# os.path.join is used instead of string concatenation so the path stays
# correct even when data_dir has no trailing slash.
sentences = word2vec.Text8Corpus(os.path.join(data_dir, "text8"))

<gensim.models.word2vec.Text8Corpus object at 0x7f3b909cb048>


In [24]:
# Train word vectors of dimensionality 200 on the corpus. `window` is not
# passed, so the library default applies (5 according to the tutorial).
# NOTE(review): the tutorial describes this as the skip-gram model; which
# architecture the default settings select depends on the installed gensim
# version -- confirm, or pass the architecture flag explicitly.
model = word2vec.Word2Vec(
    sentences,
    size=200,
)

In [25]:
# Once training has finished (it can take hours): analogy query where
# 'woman' and 'king' contribute positively and 'man' negatively.
# Only the single top-ranked match is returned.
model.most_similar(
    positive=['woman', 'king'],
    negative=['man'],
    topn=1,
)

[('queen', 0.6307381391525269)]

In [26]:
# Persist the entire model to disk so it can be loaded again later and
# training resumed. The path is built with os.path.join so it does not
# depend on data_dir ending with a slash.
model.save(os.path.join(data_dir, 'text8.model'))

In [27]:
# Store the learned weights in the binary format understood by the original
# C word2vec tool. The path is built with os.path.join so it does not depend
# on data_dir ending with a slash.
model.save_word2vec_format(os.path.join(data_dir, 'text8.model.bin'), binary=True)

In [28]:
# Alternatively, import word weights produced by the (faster) C word2vec tool;
# the shared binary format makes it easy to switch between the C and Python
# toolkits. Note that this rebinds `model`, so every query after this point
# runs against the vectors loaded from disk.
# The path is built with os.path.join so it does not depend on data_dir
# ending with a slash.
# NOTE(review): newer gensim releases expose this loader on KeyedVectors
# rather than on Word2Vec -- confirm against the installed version.
model = word2vec.Word2Vec.load_word2vec_format(
    os.path.join(data_dir, 'text8.model.bin'),
    binary=True,
)

In [29]:
# Analogy query: "boy" is to "father" as "girl" is to ...?
# Show the three top-ranked candidates.
model.most_similar(
    positive=['girl', 'father'],
    negative=['boy'],
    topn=3,
)

[('mother', 0.7793086767196655),
 ('wife', 0.7022637724876404),
 ('grandmother', 0.6899971961975098)]

In [30]:
# Each entry reads "a b x": the model is asked to complete
# "a is to b as x is to ...?" and the top-ranked word is printed.
more_examples = ["he his she", "big bigger bad", "going went being"]
for analogy in more_examples:
    base, base_form, query = analogy.split()
    ranked = model.most_similar([query, base_form], [base])
    # Each result is a (word, similarity) pair; keep only the best word.
    best_word = ranked[0][0]
    print("'%s' is to '%s' as '%s' is to '%s'" % (base, base_form, query, best_word))

'he' is to 'his' as 'she' is to 'her'
'big' is to 'bigger' as 'bad' is to 'worse'
'going' is to 'went' as 'being' is to 'was'


In [31]:
# Odd-one-out query: ask the model which of these words does not belong
# with the rest.
candidates = "breakfast cereal dinner lunch".split()
model.doesnt_match(candidates)

'cereal'