# Doc2Vec on 20newsgroups

### This is an example of running Gensim's Doc2Vec on the 20newsgroups dataset

In [2]:
# fetch the training split of the dataset through scikit-learn
from sklearn.datasets import fetch_20newsgroups

# ask the loader to strip headers, footers and quoted replies from every post
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
)

In [3]:
# list the document categories present in the dataset, one per line
from pprint import pprint

category_names = list(newsgroups_train.target_names)
pprint(category_names)

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [4]:
# print the number of files, the number of targets and the first 10 targets (category number)
# (single-argument print(...) behaves the same on Python 2 and Python 3)
print(newsgroups_train.filenames.shape)
print(newsgroups_train.target.shape)
print(newsgroups_train.target[:10])

(11314L,)
(11314L,)
[ 7  4  4  1 14 16 13  3  2  4]


In [9]:
import gensim

In [5]:
# Document-level replacement for gensim's line-based sentence iterator.
# The line-based approach takes each line as a sentence (a "Sentence2Vec" model);
# here every document is yielded as a single item carrying its own unique tag,
# so that every document gets a unique vector.

class LabeledLineSentence(object):
    """Re-iterable stream of tagged documents built from two parallel sequences.

    doc_list    -- sequence of raw document strings; each one is tokenised
                   with ``str.split()`` (whitespace splitting only)
    labels_list -- sequence indexable in parallel with ``doc_list``; the label
                   at position ``i`` is turned into the tag ``'DOC_<label>'``
    """

    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        # A new generator is created on every call, so the same object can be
        # scanned several times (vocabulary building, then each training pass).
        for idx, doc in enumerate(self.doc_list):
            tag = 'DOC_%s' % str(self.labels_list[idx])
            # TaggedDocument is the supported name for this record type;
            # LabeledSentence is its deprecated alias with the same fields.
            yield gensim.models.doc2vec.TaggedDocument(words=doc.split(), tags=[tag])

In [6]:
# build the document iterator handed to gensim; each document is labelled
# with its position in the training data
doc_ids = range(len(newsgroups_train.data))
documents = LabeledLineSentence(newsgroups_train.data, doc_ids)

In [45]:
# create a model and build the vocabulary
doc2vec_params = dict(
    size=300,
    window=5,
    min_count=5,
    workers=11,
    alpha=0.025,
    min_alpha=0.025,  # equal to alpha: use fixed learning rate
)
model = gensim.models.Doc2Vec(**doc2vec_params)
model.build_vocab(documents)

In [46]:
# train the model: ten passes over the corpus, lowering the learning rate by hand
# NOTE(review): relies on the train(documents) call form of the gensim release this
# notebook was written against -- confirm before upgrading gensim
for epoch in range(10):
    model.train(documents)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    print(epoch)  # progress marker; same output on Python 2 and Python 3

0
1
2
3
4
5
6
7
8
9


In [47]:
# Save relative to the working directory: the previous user-specific absolute
# Windows path (with unescaped backslashes) only worked on one machine.
model.save('doc2vec.model')

In [48]:
# sanity check on the word vectors: nearest neighbours of a common word
query_word = 'man'
model.most_similar(query_word)

[(u'person', 0.562697172164917),
 (u'guy', 0.5462989211082458),
 (u'woman', 0.4594741463661194),
 (u'player', 0.4297860264778137),
 (u'Jew', 0.4075002372264862),
 (u'prophet', 0.3955959975719452),
 (u'patient', 0.385597825050354),
 (u'thief', 0.37435126304626465),
 (u'dog', 0.3741978108882904),
 (u'scientist', 0.37388479709625244)]

In [53]:
# pick a base document by its position in the training data and look up
# the documents whose vectors are closest to it
doc_num = 0
doc_vectors = model.docvecs
similar_docs = doc_vectors.most_similar(doc_num)

In [54]:
# category name of the base document (print(...) form works on Python 2 and 3)
base_target = newsgroups_train.target_names[newsgroups_train.target[doc_num]]
print('base doc target is: %s' % str(base_target))

base doc target is: rec.autos


In [55]:
print('most similar documents\' targets are:')
for doc in similar_docs:
    # doc is a (tag, similarity) pair. Tags were generated as 'DOC_<index>',
    # so recover the integer row index before indexing the dataset arrays;
    # a plain integer tag passes through this conversion unchanged.
    doc_idx = int(str(doc[0]).rsplit('_', 1)[-1])
    print(newsgroups_train.target_names[newsgroups_train.target[doc_idx]])

most similar documents' targets are:
rec.sport.baseball
rec.autos
rec.autos
sci.space
alt.atheism
rec.autos
talk.politics.misc
comp.graphics
rec.sport.baseball
rec.motorcycles


In [56]:
# print the document and its most similar document
# Tags were generated as 'DOC_<index>', so turn the best match's tag back into
# an integer row index (a plain integer tag passes through unchanged).
top_tag = similar_docs[0][0]
top_idx = int(str(top_tag).rsplit('_', 1)[-1])
print(newsgroups_train.data[doc_num])
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
print(newsgroups_train.data[top_idx])


I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
Could some kind soul out there e-mail me the 411 on where I can find the mlb.c  
program?  I'm interested in some road trips this year....

