In [1]:
import gensim

# Toy corpus: five short English documents, one string per document.
# Note that the third entry contains two sentences in a single string.
raw_documents = ["I love tacos.",
                "She ran with the chicken.",
                "I don't choose to take a nap. The nap chooses me.",
                "That man is nice as pie with ice cream.",
                "This pizza is an affront to nature."]

from nltk.tokenize import word_tokenize
def get_tokens(text):
    """Tokenize ``text`` with ``word_tokenize`` and return the token list.

    The tokens are returned exactly as the tokenizer produced them: no
    case folding and no filtering is applied here.
    """
    return word_tokenize(text)

In [2]:
from gensim.models.word2vec import Word2Vec

# One token list per raw document; this is the training input for Word2Vec.
sentences = [get_tokens(r) for r in raw_documents]

# min_count=1 keeps every token (the corpus is tiny) and vector_size=5 keeps
# the vectors small enough to print.
# workers=1 trains on a single thread so repeated runs give the same vectors;
# multi-threaded training introduces ordering jitter. Per the gensim docs,
# full determinism across interpreter launches may additionally require a
# fixed PYTHONHASHSEED.
w2v_model = Word2Vec(sentences, min_count=1, vector_size=5, workers=1)

# Keep a handle on the trained word vectors; later cells query this object.
model = w2v_model.wv

In [3]:
# Show the learned vector of a few words from the toy corpus, one per line.
for word in ('ran', 'love', 'tacos'):
    print(model[word])

[-0.03156241  0.00643374 -0.08279715 -0.15364872 -0.03017141]
[-0.192071    0.10014586 -0.17519173 -0.0878365  -0.000702  ]
[-0.00592365 -0.1532248   0.19229484  0.09964113  0.18466286]


In [4]:
# Copy the model's vocabulary (index_to_key lists the known tokens by index
# position) into a plain list.
vocab = list(model.index_to_key)
# Display the first ten vocabulary entries.
vocab[:10]

['.', 'nap', 'is', 'with', 'to', 'I', 'ice', 'pie', 'love', 'tacos']

In [5]:
# Nearest neighbours of 'tacos' in the toy model, as (word, similarity) pairs.
# NOTE(review): with only five training documents these neighbours are
# presumably not meaningful — treat this as an API demonstration only.
model.most_similar('tacos')

[('me', 0.8187951445579529),
 ('choose', 0.7246150970458984),
 ('pizza', 0.5813791155815125),
 ('take', 0.5290067195892334),
 ('man', 0.5207754969596863),
 ('chooses', 0.5118007659912109),
 ('to', 0.4151745140552521),
 ('affront', 0.3722187280654907),
 ('as', 0.3721567392349243),
 ('This', 0.27691811323165894)]

In [6]:
# Sanity check on gensim's FAST_VERSION flag before training Doc2Vec.
# NOTE(review): presumably -1 means the optimized (compiled) training routines
# are unavailable — confirm against the installed gensim version.
assert gensim.models.doc2vec.FAST_VERSION > -1

In [7]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups dataset.
texts = fetch_20newsgroups(subset='train')
# List the attributes available on the returned object.
dir(texts)

['DESCR', 'data', 'filenames', 'target', 'target_names']

In [27]:
# NOTE(review): this cell carries execution count 27 although it sits between
# cells 7 and 8, i.e. it was run out of order; the same list is also printed
# by a later cell. Consider removing it or re-running the notebook top to bottom.
print(texts.target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [8]:
# Label overview, one item per line: number of labels, the integer label
# array, and the list of newsgroup names.
print(len(texts.target), texts.target, texts.target_names, sep="\n")

11314
[7 4 4 ... 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [9]:
# Raw newsgroup posts; later cells index and tokenize the entries as strings.
data = texts.data
# Number of posts (should match the number of labels printed above).
len(data)

11314

In [10]:
# Inspect the first raw post, including its header lines.
data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [11]:
def get_tokens(text):
    """Tokenize ``text`` with ``word_tokenize`` and lower-case every token.

    Returns a new list of lower-cased token strings, in tokenizer order.
    """
    return list(map(str.lower, word_tokenize(text)))
# Show the lower-cased tokens of the first post as a quick check.
print(get_tokens(data[0]))

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

In [12]:
# Each whole document is treated as one "sentence": tokenize every post into
# a single token list.
sentences = list(map(get_tokens, data))
# Quick check: tokens of the first post.
print(sentences[0])

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

In [13]:
# Train Word2Vec on the tokenized newsgroup posts.
model_ng = gensim.models.word2vec.Word2Vec(
    sentences=sentences,
    vector_size=200,  # dimensionality of each word vector
    min_count=3,      # ignore tokens that occur fewer than 3 times
)

In [14]:
# Nearest neighbours of 'man' in the newsgroup-trained word vectors,
# returned as (word, similarity) pairs.
model_ng.wv.most_similar('man')

[('woman', 0.8160393238067627),
 ('god', 0.7183233499526978),
 ('person', 0.713968813419342),
 ('christ', 0.7115176320075989),
 ('child', 0.7097669839859009),
 ('himself', 0.707438588142395),
 ('son', 0.7001399397850037),
 ('father', 0.6979144811630249),
 ('satan', 0.6964322328567505),
 ('spirit', 0.6781124472618103)]

In [15]:
import gensim.downloader as api
# Load the 'text8' corpus through gensim's downloader; `sents` is passed to
# Word2Vec as the training sentences in the next cell.
sents = api.load('text8')

In [16]:
# This will take some time to complete (15 training epochs over text8).
w2v_model_t8 = gensim.models.word2vec.Word2Vec(
    sentences=sents,
    vector_size=200,
    min_count=5,
    epochs=15,
)

In [17]:
# Keep a handle on the trained word vectors of the text8 model; the
# similarity queries below run against this object.
model_t8 = w2v_model_t8.wv

In [18]:
# Nearest neighbours of 'man' in the text8 model; compare with the
# newsgroup-trained model queried earlier.
model_t8.most_similar('man')

[('woman', 0.6409082412719727),
 ('girl', 0.5279996395111084),
 ('men', 0.5224376320838928),
 ('creature', 0.4999851882457733),
 ('person', 0.49859797954559326),
 ('loner', 0.4788859188556671),
 ('sailor', 0.46047475934028625),
 ('thief', 0.4589238464832306),
 ('wight', 0.4515334367752075),
 ('boy', 0.44679781794548035)]

In [19]:
# Nearest neighbours of 'happy' in the text8 model, as (word, similarity) pairs.
model_t8.most_similar('happy')

[('quiet', 0.6112156510353088),
 ('lucky', 0.5464048385620117),
 ('laugh', 0.511472225189209),
 ('merry', 0.5101077556610107),
 ('glad', 0.48044514656066895),
 ('shy', 0.4578081965446472),
 ('fond', 0.4572877585887909),
 ('sad', 0.45594456791877747),
 ('proud', 0.455405056476593),
 ('me', 0.45507529377937317)]

In [20]:
# Analogy query: words similar to 'woman' and 'king' while dissimilar to 'man'
# (the classic "king - man + woman" example).
model_t8.most_similar(positive=['woman', 'king'], negative=['man'])

[('queen', 0.5827288031578064),
 ('isabella', 0.5057438015937805),
 ('regent', 0.48734012246131897),
 ('princess', 0.4863964021205902),
 ('monarch', 0.48449116945266724),
 ('eleonora', 0.47063368558883667),
 ('matilda', 0.4677153527736664),
 ('consort', 0.4670857787132263),
 ('throne', 0.4518029987812042),
 ('kings', 0.4495188593864441)]

In [21]:
from gensim.models.doc2vec import TaggedDocument

In [22]:
# Rebuild the tokenized newsgroup posts (one token list per post) so the
# Doc2Vec cells below start from the newsgroup data.
sentences = list(map(get_tokens, texts.data))

In [23]:
# Wrap every tokenized post in a TaggedDocument whose single tag is
# "sent_<position>", so each document vector can be looked up by that tag.
tagged_documents = [
    TaggedDocument(words=tokens, tags=["sent_{}".format(idx)])
    for idx, tokens in enumerate(sentences)
]
# Train Doc2Vec with 300-dimensional vectors on the tagged posts.
d2v_model = gensim.models.doc2vec.Doc2Vec(documents=tagged_documents, vector_size=300)

In [24]:
# The Doc2Vec model also exposes word vectors via `.wv`; query the nearest
# neighbours of the word 'fast'.
d2v_model.wv.most_similar('fast')

[('slow', 0.6519448757171631),
 ('cheap', 0.5980238914489746),
 ('quickly', 0.5742024779319763),
 ('busy', 0.566688597202301),
 ('printer', 0.5295618176460266),
 ('noticable', 0.5183477401733398),
 ('hot', 0.5129312872886658),
 ('expensive', 0.511255145072937),
 ('vacuum', 0.5100240111351013),
 ('hard', 0.5092251300811768)]

In [25]:
# Infer a document vector for a new piece of text. The text goes through
# get_tokens(), the same preprocessing used to build the training sentences,
# so the query is tokenized consistently even if the example text is changed.
vec0 = d2v_model.infer_vector(get_tokens('i love tacos'))
print(vec0)

[ 3.15149911e-02  1.44660231e-02 -1.78435594e-02  1.85195859e-06
 -5.51041253e-02 -1.44342445e-02  2.37905839e-03 -1.05397892e-03
 -9.78821050e-03 -5.52256498e-03  2.30867043e-02 -7.62659265e-03
  3.08004837e-03  1.63363482e-04 -1.26051316e-02  5.65508716e-02
  3.12219765e-02  1.88399237e-02 -1.66636705e-02  2.56193317e-02
  3.68497893e-02 -1.84319559e-02  5.96815087e-02  2.73488332e-02
  4.00689431e-02  5.28140552e-03  3.28555442e-02  1.49388835e-02
  1.07796034e-02 -8.52395501e-03  8.79813638e-03  2.18394976e-02
  2.05100290e-02  1.56416390e-02  2.94732712e-02  4.19249684e-02
 -3.40541154e-02 -5.16070947e-02 -3.70674208e-03 -1.93846133e-02
  3.24069941e-03  2.52944827e-02  2.36536581e-02 -4.47735097e-03
 -7.70742772e-03  1.49477562e-02 -4.99098841e-03  2.00222265e-02
  2.99103325e-03 -3.92838614e-03 -1.28843160e-02  2.65932339e-03
 -3.40535480e-04 -7.85771757e-03  1.23418923e-02  6.56678975e-02
 -3.20133567e-02  2.96586915e-03 -1.73135065e-02 -4.25034836e-02
  1.74385533e-02 -1.17580

In [26]:
# Find the training documents whose vectors are closest to the inferred
# vector; results are (document tag, similarity) pairs.
d2v_model.dv.most_similar([vec0])

[('sent_7978', 0.8453588485717773),
 ('sent_9797', 0.8379197120666504),
 ('sent_8571', 0.8346693515777588),
 ('sent_336', 0.8324801921844482),
 ('sent_3362', 0.829128086566925),
 ('sent_2882', 0.8287025690078735),
 ('sent_3436', 0.8263554573059082),
 ('sent_10169', 0.8252823352813721),
 ('sent_1184', 0.8212636709213257),
 ('sent_4285', 0.8208652138710022)]