In [1]:
import gensim

# Toy corpus: five short raw documents, one string per document.
raw_documents = [
    "I love tacos.",
    "She ran with the chicken.",
    "I don't choose to take a nap. The nap chooses me.",
    "That man is nice as pie with ice cream.",
    "This pizza is an affront to nature.",
]
from nltk.tokenize import word_tokenize
def get_tokens(text):
    """Return the tokens that ``word_tokenize`` produces for ``text``.

    The tokens are returned exactly as produced (no case folding).
    """
    return word_tokenize(text)

In [2]:
from gensim.models.word2vec import Word2Vec
# Tokenize every toy document; each token list is one training sentence.
sentences = [get_tokens(r) for r in raw_documents]

# min_count=1 keeps every token (the corpus is tiny); size=5 asks for
# 5-dimensional vectors.
# NOTE(review): `size` is the pre-4.0 gensim keyword (4.0 renamed it to
# `vector_size`) — confirm against the installed gensim version.
model = Word2Vec(sentences, min_count=1, size=5)

# Look word vectors up through the KeyedVectors object (`model.wv`);
# indexing the model object directly is deprecated.
print(model.wv['ran'])
print(model.wv['love'])
print(model.wv['tacos'])

[-0.03438446 -0.07316932  0.04117243 -0.04679364 -0.04743984]
[-0.03010107 -0.09813738 -0.02317511 -0.068779    0.05996513]
[-0.00374706  0.00401464 -0.05434585 -0.07617548 -0.03022479]


In [3]:
# Collect the vocabulary tokens and preview the first ten of them.
vocab = list(model.wv.vocab)
vocab[:10]

['ran', 'tacos', 'the', 'The', 'as', '.', 'love', 'She', "n't", 'affront']

In [4]:
# Words whose vectors are closest to 'tacos'. Queried through `model.wv`
# because calling most_similar on the model object itself is deprecated.
model.wv.most_similar('tacos')

[('chooses', 0.9242254495620728),
 ('pie', 0.7201449871063232),
 ('choose', 0.6462243795394897),
 ('ice', 0.5596035718917847),
 ('.', 0.4698696434497833),
 ('love', 0.32112064957618713),
 ('chicken', 0.25919628143310547),
 ('ran', 0.23415185511112213),
 ('nature', 0.21226002275943756),
 ('I', 0.15366244316101074)]

In [5]:
# Fail early, with an explanation, if gensim reports FAST_VERSION == -1.
# NOTE(review): -1 is understood to mean the compiled training routines are
# missing and gensim falls back to much slower code — confirm in gensim docs.
assert gensim.models.doc2vec.FAST_VERSION > -1, (
    "gensim's compiled (fast) training routines are not available"
)

In [6]:
from sklearn.datasets import fetch_20newsgroups
# Load the training split of the 20 Newsgroups corpus through scikit-learn.
texts = fetch_20newsgroups(subset='train')
# List the attributes exposed by the returned object.
dir(texts)

['DESCR', 'data', 'description', 'filenames', 'target', 'target_names']

In [7]:
# Show where the individual posts are stored on disk.
# NOTE(review): the saved output exposes a user-specific home-directory path;
# consider clearing it before sharing the notebook.
print(texts.filenames)

[ '/home/scott/scikit_learn_data/20news_home/20news-bydate-train/rec.autos/102994'
 '/home/scott/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51861'
 '/home/scott/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.mac.hardware/51879'
 ...,
 '/home/scott/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60695'
 '/home/scott/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38319'
 '/home/scott/scikit_learn_data/20news_home/20news-bydate-train/rec.motorcycles/104440']


In [8]:
# Print, in order: the number of labels, the label array, and the label names.
for summary in (len(texts.target), texts.target, texts.target_names):
    print(summary)

11314
[7 4 4 ..., 3 1 8]
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [9]:
# Keep a shorter handle on the raw post texts and check how many there are.
data = texts.data
len(data)

11314

In [10]:
# Inspect the first raw post (a single string).
data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

In [11]:
def get_tokens(text):
    """Tokenize ``text`` with ``word_tokenize`` and lower-case every token.

    Note: this redefines the ``get_tokens`` from the first cell, which
    returned the tokens without changing their case.
    """
    return [tok.lower() for tok in word_tokenize(text)]
# Sanity-check the lower-casing tokenizer on the first post.
print(get_tokens(data[0]))

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

In [12]:
# Each newsgroup post is treated as one "sentence": one token list per post.
sentences = list(map(get_tokens, data))
print(sentences[0])

['from', ':', 'lerxst', '@', 'wam.umd.edu', '(', 'where', "'s", 'my', 'thing', ')', 'subject', ':', 'what', 'car', 'is', 'this', '!', '?', 'nntp-posting-host', ':', 'rac3.wam.umd.edu', 'organization', ':', 'university', 'of', 'maryland', ',', 'college', 'park', 'lines', ':', '15', 'i', 'was', 'wondering', 'if', 'anyone', 'out', 'there', 'could', 'enlighten', 'me', 'on', 'this', 'car', 'i', 'saw', 'the', 'other', 'day', '.', 'it', 'was', 'a', '2-door', 'sports', 'car', ',', 'looked', 'to', 'be', 'from', 'the', 'late', '60s/', 'early', '70s', '.', 'it', 'was', 'called', 'a', 'bricklin', '.', 'the', 'doors', 'were', 'really', 'small', '.', 'in', 'addition', ',', 'the', 'front', 'bumper', 'was', 'separate', 'from', 'the', 'rest', 'of', 'the', 'body', '.', 'this', 'is', 'all', 'i', 'know', '.', 'if', 'anyone', 'can', 'tellme', 'a', 'model', 'name', ',', 'engine', 'specs', ',', 'years', 'of', 'production', ',', 'where', 'this', 'car', 'is', 'made', ',', 'history', ',', 'or', 'whatever', 'inf

In [13]:
# Train word vectors on the newsgroup token lists (200-dimensional,
# min_count=3).
model_ng = Word2Vec(
    sentences,
    min_count=3,
    size=200,
)

In [14]:
# Words closest to 'man' in the newsgroup model. Queried through `.wv`
# because calling most_similar on the model object itself is deprecated.
model_ng.wv.most_similar('man')

[('woman', 0.8280535340309143),
 ('father', 0.743359386920929),
 ('christ', 0.7305841445922852),
 ('god', 0.7270018458366394),
 ('person', 0.7143691778182983),
 ('son', 0.7122769951820374),
 ('lord', 0.7112933397293091),
 ('spirit', 0.708041787147522),
 ('himself', 0.6960251927375793),
 ('sin', 0.6929311752319336)]

In [None]:
# Wrap the text8 corpus file so it can be passed to Word2Vec as its sentences.
# NOTE(review): the path is tied to one user's home-directory layout and
# relies on '~' being expanded when the file is opened — confirm, or make the
# location configurable.
sents = gensim.models.word2vec.Text8Corpus('~/Documents/nlp-python/word_vectors/text8')

In [None]:
# Train 200-dimensional word vectors on the text8 corpus
# (min_count=5, iter=15).
model_t8 = Word2Vec(
    sents,
    min_count=5,
    size=200,
    iter=15,
)

In [None]:
# Words closest to 'man' in the text8 model. Queried through `.wv` because
# calling most_similar on the model object itself is deprecated.
model_t8.wv.most_similar('man')

In [None]:
# Words closest to 'happy' in the text8 model. Queried through `.wv` because
# calling most_similar on the model object itself is deprecated.
model_t8.wv.most_similar('happy')

In [None]:
# Analogy query: add 'woman' and 'king', subtract 'man'. Queried through
# `.wv` because calling most_similar on the model object itself is deprecated.
model_t8.wv.most_similar(positive=['woman', 'king'], negative=['man'])

In [None]:
from gensim.models.doc2vec import TaggedDocument

In [None]:
# Rebuild the tokenized newsgroup posts (one token list per post).
sentences = list(map(get_tokens, texts.data))

In [None]:
# Give every tokenized post a unique "sent_<index>" tag, then train Doc2Vec
# on the tagged posts with 300-dimensional vectors.
tagged_documents = [
    TaggedDocument(tokens, ["sent_{}".format(idx)])
    for idx, tokens in enumerate(sentences)
]
d2v_model = gensim.models.doc2vec.Doc2Vec(tagged_documents, size=300)

In [None]:
# Words closest to 'fast' according to the Doc2Vec model's word vectors.
# Queried through `.wv` because calling most_similar on the model object
# itself is deprecated.
d2v_model.wv.most_similar('fast')

In [None]:
# Infer a document vector for a new, whitespace-tokenized phrase.
vec0 = d2v_model.infer_vector('i love tacos'.split())
print(vec0)

In [None]:
# Look up the document vectors most similar to the inferred vector.
d2v_model.docvecs.most_similar([vec0])