Exploring Gensim Doc2Vec<br>
https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5

In [1]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import spacy
%matplotlib inline

In [2]:
from spacy.vectors import Vectors
from spacy.strings import StringStore
from scipy.cluster.hierarchy import dendrogram, linkage
from wordcloud import WordCloud
from nltk.corpus import stopwords

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize

In [3]:
# Load the pre-extracted corpus (the preview below shows a list of sentence strings).
# The context manager guarantees the file handle is closed even if unpickling fails.
# NOTE(security): pickle.load can execute arbitrary code embedded in the file --
# only load pickles that come from a trusted source.
with open('descriptive_text.pickle', 'rb') as infile:
    data = pickle.load(infile)

In [4]:
# Peek at the first five entries of the loaded corpus to sanity-check its contents.
data[:5]

['The crude membranes from 5 P56-P70 Glun1TAP/TAP mouse forebrains were re-suspended in 12.5\u2009ml buffer H and extracted with 12.5\u2009ml 2% deoxycholate, 100\u2009mM NaCl, 50\u2009mM Tris.',
 'Cl pH8 for 1\u2009h at 6\u2009°C.',
 'Total extract was centrifuged at 120,000g.',
 'for 40\u2009min at 8\u2009°C.',
 'Conditions for immuno-capture, wash and peptide-antigen exchange elution were screened using a high-throughput purification robot (MAGic sample processor, Invitrogen).']

In [5]:
def _to_tagged_document(position, text):
    """Wrap one corpus entry as a gensim TaggedDocument.

    The text is lower-cased and tokenized with NLTK's word_tokenize; the
    entry's position in the corpus, rendered as a string, is its only tag.
    """
    tokens = word_tokenize(text.lower())
    return TaggedDocument(words=tokens, tags=[str(position)])


# One TaggedDocument per corpus entry, tagged with its index in `data`.
tagged_data = [
    _to_tagged_document(position, text)
    for position, text in enumerate(data)
]

In [6]:
# Number of tagged documents that will be fed to Doc2Vec.
len(tagged_data)

125808

In [7]:
%%time
# Train the model
max_epochs = 100
vec_size = 300
alpha = 0.025

model = Doc2Vec(
    vector_size=vec_size,
    alpha=alpha,
    min_alpha=0.025,
    min_count=1,
    dm=1,
    window=15,
    sample=0, #1e-5,
    negative=20, #5,
    workers=4
)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    if epoch % 10 == 0:
        print('Iteraction {0}'.format(epoch))
    model.train(
        tagged_data,
        total_examples=model.corpus_count,
        epochs=model.epochs
    )
    
    # decrease the learning rate
    model.alpha -= 0.0002
    
    # fix the learning rate, no decay
    model.min_alpha = model.alpha

Iteraction 0
Iteraction 10
Iteraction 20
Iteraction 30
Iteraction 40
Iteraction 50
Iteraction 60
Iteraction 70
Iteraction 80
Iteraction 90
CPU times: user 3h 45min 52s, sys: 1min 46s, total: 3h 47min 39s
Wall time: 59min 42s


In [8]:
# Persist the trained model. The target directory is created first so that a
# missing ./models folder cannot throw away a long training run at the last step.
model_path = './models/d2v_descriptive.model'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)
print("Model Saved")

Model Saved
