In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import utils
# random
import random

# numpy
import numpy

# classifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import  LinearSVC

In [2]:
class TaggedLineSentence(object):
    def __init__(self, sources):
        self.sources = sources
        flipped = {}
        self.tagToText = dict()

        # make sure that keys are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield TaggedDocument(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    text = utils.to_unicode(line).split()
                    tag = prefix + '_%s' % item_no
                    self.sentences.append(TaggedDocument(text, [tag]))
                    self.tagToText[tag] = text
        return(self.sentences)

    def sentences_perm(self):
        shuffled = list(self.sentences)
        random.shuffle(shuffled)
        return(shuffled)

In [3]:
sources = {'data/test_neg.txt':'TEST_NEG', 'data/test_pos.txt':'TEST_POS', 'data/train_neg.txt':'TRAIN_NEG', 'data/train_pos.txt':'TRAIN_POS'}

sentences = TaggedLineSentence(sources)
 
model = Doc2Vec(min_count=1, window=10, vector_size=100, sample=1e-4, negative=5, workers=7)
model.build_vocab(sentences.to_array())
model.train(sentences.sentences_perm(), epochs=10, total_examples=model.corpus_count)

### find the most similar words

In [4]:
model.wv.most_similar('good')

[('great', 0.8010649085044861),
 ('decent', 0.7656506896018982),
 ('good,', 0.7534111738204956),
 ('nice', 0.7411749362945557),
 ('good.', 0.7183130979537964),
 ('bad', 0.7146064043045044),
 ('fine', 0.6791468858718872),
 ('really', 0.661880612373352),
 ('great,', 0.6555169820785522),
 ('solid', 0.6400054097175598)]

In [5]:
model['TRAIN_NEG_0']

array([-5.67604452e-02,  6.85367063e-02,  8.34903643e-02, -8.47806633e-02,
        6.58253059e-02,  4.05879766e-02,  5.57096116e-02, -4.13171910e-02,
        1.20364886e-03, -1.33396551e-01, -1.40106753e-02,  7.09639564e-02,
        1.84043162e-02, -5.35935760e-02,  4.90804464e-02,  7.02047348e-02,
        4.91940193e-02,  7.05393553e-02,  2.74535976e-02,  5.42438552e-02,
        1.09348230e-01,  5.09758852e-02,  2.62266807e-02, -1.06784329e-02,
        2.43218560e-02,  6.12404943e-02,  8.86478424e-02, -1.53240353e-01,
        3.61667499e-02,  3.92867699e-02,  2.56076995e-02,  1.80427695e-03,
        6.02349080e-02,  2.03826167e-02,  3.46294692e-05, -1.59500074e-02,
       -4.92557921e-02, -4.96926531e-02,  3.45059223e-02,  1.88102238e-02,
       -4.85540740e-02,  9.30692926e-02,  1.13568008e-01, -5.72237074e-02,
        7.36734504e-03, -1.10213488e-01, -2.29404103e-02,  7.66101480e-02,
        2.65874937e-02, -5.14724925e-02, -1.13697939e-01,  4.48653921e-02,
        2.75861491e-02,  

In [6]:
# model.save('./imdb.d2v')

In [7]:
# model = Doc2Vec.load('./imdb.d2v')

### do classfication and it can classify 83% correctly

In [8]:
train_arrays = numpy.zeros((25000, 100))
train_labels = numpy.zeros(25000)

for i in range(12500):
    prefix_train_pos = 'TRAIN_POS_' + str(i)
    prefix_train_neg = 'TRAIN_NEG_' + str(i)
    train_arrays[i] = model[prefix_train_pos]
    train_arrays[12500 + i] = model[prefix_train_neg]
    train_labels[i] = 1
    train_labels[12500 + i] = 0

In [9]:
print(train_arrays)

[[-0.04092867  0.05030482  0.01592999 ...  0.0494524   0.08508648
  -0.12458809]
 [-0.06956087  0.53839409  0.29971007 ...  0.1563656  -0.2468898
  -0.20472236]
 [ 0.04621632  0.15324871  0.00701381 ...  0.10775265 -0.07272544
  -0.03794105]
 ...
 [-0.08181398  0.33077008  0.15650262 ...  0.09846954 -0.19612412
  -0.0578531 ]
 [ 0.0962573   0.40005031  0.12608531 ... -0.06523082 -0.17672288
  -0.03303262]
 [ 0.04914996  0.18799147  0.00920538 ...  0.11031672  0.03501993
  -0.0096805 ]]


In [10]:
print(train_labels)

[1. 1. 1. ... 0. 0. 0.]


In [11]:
test_arrays = numpy.zeros((25000, 100))
test_labels = numpy.zeros(25000)

for i in range(12500):
    prefix_test_pos = 'TEST_POS_' + str(i)
    prefix_test_neg = 'TEST_NEG_' + str(i)
    test_arrays[i] = model[prefix_test_pos]
    test_arrays[12500 + i] = model[prefix_test_neg]
    test_labels[i] = 1
    test_labels[12500 + i] = 0

In [12]:
classifier = LogisticRegression(solver = 'liblinear')
classifier.fit(train_arrays, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False)

In [13]:
classifier.score(test_arrays, test_labels)

0.83612

### find the most similar sentences in training s

In [20]:
# 寻找跟train训练集的负面第一条评论的最相像评论
sims = model.docvecs.most_similar([model['TRAIN_NEG_1']],topn=10)
print(sims)
print(sentences.tagToText[sims[0][0]])

[('TRAIN_NEG_1', 0.9999998807907104), ('TEST_NEG_7979', 0.6596706509590149), ('TRAIN_NEG_677', 0.6457411050796509), ('TRAIN_NEG_11997', 0.6352208256721497), ('TRAIN_NEG_2542', 0.6332000494003296), ('TEST_NEG_3255', 0.6267741918563843), ('TEST_NEG_1168', 0.6215671300888062), ('TEST_NEG_8942', 0.6147149801254272), ('TEST_NEG_9452', 0.6145933866500854), ('TEST_NEG_9643', 0.6119739413261414)]


In [34]:
new_text = "I love all the Devilman Crybaby vibes I get from this show!!! It's disturbingly gruesome at times and \
laugh out loud funny at others. I haven't even watched the entire season but I'm already looking forward to season 2! \
If you loved Devilman Crybaby or Black Mirror, you'll love this show. If you aren't sure about it, just give it a try because it's honestly so good!!!"
tokens = new_text.split()
new_vector = model.infer_vector(tokens)
sims = model.docvecs.most_similar([new_vector],topn=10)
print(sims)
print('============new_text:=============\n', new_text)
print('=========most similar text:=========')
print(' '.join(sentences.tagToText[sims[0][0]]))

[('TEST_POS_2868', 0.8263417482376099), ('TRAIN_POS_2462', 0.8107128143310547), ('TRAIN_POS_3098', 0.8102821707725525), ('TRAIN_POS_11410', 0.8099147081375122), ('TEST_POS_5559', 0.8034120202064514), ('TRAIN_POS_1738', 0.803398847579956), ('TEST_POS_4882', 0.8029893040657043), ('TEST_POS_10926', 0.7982553243637085), ('TEST_NEG_340', 0.7977356314659119), ('TEST_NEG_6772', 0.797174334526062)]
 I love all the Devilman Crybaby vibes I get from this show!!! It's disturbingly gruesome at times and laugh out loud funny at others. I haven't even watched the entire season but I'm already looking forward to season 2! If you loved Devilman Crybaby or Black Mirror, you'll love this show. If you aren't sure about it, just give it a try because it's honestly so good!!!
This film is the best kung fu film of all time. Although there is not wire-work and special effects like those used in Crouching Tiger, this movie uses ingenuity and creative camera-work to create memorable fighting moments, and the f