In [21]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import pandas as pd
from nltk import RegexpTokenizer


# text Preprocessing

In [57]:
tokenizer = RegexpTokenizer(r'\w+')
stopword_set = set(['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])


"""This function does all cleaning of data using two objects above"""
def textCleaning(data):
    new_str = data.lower()
    dlist = tokenizer.tokenize(new_str)
    dlist = list(set(dlist).difference(stopword_set))
    return dlist[::-1]

textCleaning("heLLO! World,")

['hello', 'world']

# Traning D2V model

In [56]:
"""get the text corpus from article source"""
f = pd.read_csv('kaggle/articles1.csv')
textCorpus = f["content"]
textCorpus = [textCleaning(text) for idx, text in textCorpus.iteritems()]
textCorpus[0]

['least',
 'new',
 'purse',
 'spur',
 'republicans',
 'predicted',
 'violates',
 'evaluate',
 'phillip',
 'people',
 'appealed',
 'produce',
 'government',
 'receive',
 'consequences',
 'confidence',
 'implode',
 'time',
 'white',
 'win',
 'vigorous',
 'trump',
 'destabilize',
 'angering',
 'despite',
 'spokesman',
 'coverage',
 'questions',
 'complicated',
 'lack',
 'stopped',
 'congressional',
 'speaker',
 'era',
 'place',
 'behalf',
 'years',
 'market',
 'litigation',
 'election',
 'broad',
 'month',
 'asserted',
 'paying',
 'reversed',
 'reports',
 'potential',
 'reductions',
 'decision',
 'acknowledge',
 'cascading',
 'approval',
 'drop',
 'possibility',
 'deductibles',
 'chaos',
 'found',
 'widespread',
 'proceedings',
 'temporarily',
 'lawyers',
 'one',
 'suit',
 'disputed',
 'whether',
 'ugly',
 'temporary',
 'restore',
 'congress',
 'said',
 'obama',
 'inappropriate',
 'settle',
 'remained',
 'defend',
 'leverage',
 'required',
 'leadership',
 'views',
 'authority',
 'stave',


In [None]:
"""Tag documents"""
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(textCorpus)]

"""Set model parameters"""
max_epochs = 10
vec_size   = 100
alpha      = 0.025


"""Initialise model"""
textModel = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)


"""Build vocab from tagged documents"""
textModel.build_vocab(documents)


"""Training D2V model"""
for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    textModel.train(documents,
                total_examples=textModel.corpus_count,
                epochs=textModel.iter)
    # decrease the learning rate
    textModel.alpha -= 0.0002
    # fix the learning rate, no decay
    textModel.min_alpha = textModel.alpha


textModel.save("d2v.model")
print("Model Saved")

# Document Vectorization with D2V Model

In [59]:
"""load pretrained model"""
textModel = Doc2Vec.load("d2v.model")

"""Get and clean article content"""
articleContent = "This is a sample article"
articleContent = textCleaning(articleContent)
articleContent

['article', 'sample']

In [60]:
vector = textModel.infer_vector(articleContent)
vector

array([ 2.0245090e-03,  1.5786994e-03,  1.0835961e-03,  2.6223890e-04,
       -1.6445892e-03,  1.5440167e-04, -4.2017670e-03,  3.5304213e-03,
        2.0168030e-03,  3.8664429e-03, -2.5916628e-03, -1.7409977e-04,
        2.5245168e-03, -4.8722062e-04, -3.7868381e-03, -2.0139995e-03,
       -3.4781564e-03,  1.7303188e-03,  4.1704942e-04,  1.8745348e-03,
        1.7303237e-03,  4.9541923e-03,  1.1248480e-03,  4.9768523e-03,
       -2.1250462e-03,  4.8431065e-03, -3.2328602e-03, -4.9750758e-03,
        4.4665211e-03, -3.0762405e-04,  2.7447208e-04, -2.2675802e-03,
        3.4414111e-03, -2.4733644e-03,  1.1825904e-03,  1.6799474e-03,
       -9.1594399e-04, -4.2598774e-03,  2.5454327e-03,  3.0918776e-03,
       -2.5551862e-03, -3.4812214e-03,  2.8377303e-03,  3.6360780e-03,
        1.2140333e-03, -2.5136117e-03, -1.1437719e-03, -4.5459303e-03,
        3.5177174e-03,  4.1979630e-04, -2.5457493e-03,  3.0788423e-03,
        9.0674097e-05, -3.0712376e-03,  2.1640278e-04, -4.5869546e-03,
      