In [2]:
import os
import pandas as pd
import collections
import pickle
#import cPickle
import _pickle as cPickle

import scipy
from scipy import sparse
import numpy as np

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.cluster import KMeans
from sklearn.externals import joblib
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

import numpy as np
import random

%matplotlib inline
import matplotlib.pyplot as plt

import seaborn as sns

import nltk
import nltk.data
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize,sent_tokenize

import re
import string
import time
import codecs
from Bio import Medline

import gensim
from gensim import corpora, models, similarities
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec, Word2Vec, CoherenceModel
from gensim.summarization import summarize

from collections import defaultdict

import pyLDAvis.gensim

import GapStatistics
import time

import bokeh

In [2]:
# Configurations and Variable Definitions

In [8]:
# Configuration: Data file name (it should be placed in the same ditrectory as the notebook file)
#DATA_FILE = DATA_FILE = '/data/pubmedtext.txt'
#test variables
# test_pmid = '27179337'
test_pmid = '28324318'

In [9]:
# constant variables
CLEANED_ABSTRACT_COL = 'cleaned_abstract'
SUMMARY_ABSTRACT_COL = 'summary_abstract'
CLEANED_MINIMAL_ABSTRACT_COL = 'cleaned_minimal_abstract'
TOKENIZED_CLEAN_ABS_COL = 'tokenized_c_abstract'
TOKENIZED_RAW_ABS_COL = 'tokenized_r_abstract'
TOKEN_COUNT_RAW_ABS_COL = 'token_count_r_abstract'
TOKEN_COUNT_CLEAN_ABS_COL = 'token_count_c_abstract'
TOKEN_COUNT_Unq_CLEAN_ABS_COL = 'token_count_unq_c_abstract'

In [5]:
# Input File Processing

In [6]:
#Dictionary of medline terms: https://www.nlm.nih.gov/bsd/mms/medlineelements.html
#Note that this differs from above by removing grants and affiliations in an effort to minimize exceptions
def read_medline_data_raw(filename):
    recs = Medline.parse(open(filename, 'r'))
    text = pd.DataFrame(columns = ["pmid", "articletitle", "journaltitle", "abstract"])
    count = 0
    for rec in recs:
        try:
            pmid = rec['PMID']
            atitle = rec["TI"]
            jtitle = rec["JT"]
            pubdate = rec["DP"]
            abstr = rec["AB"]
            text = text.append(pd.DataFrame([[pmid,atitle, jtitle,pubdate, abstr]],
                columns=["pmid", "articletitle", "journaltitle", "pubdate", "abstract"]),ignore_index=True, verify_integrity=True)            
        except:
            pass
    text.set_index(text.pmid)
    return text

In [7]:
# get the Data file path

print("Cell started at: " + time.strftime("%c"))
try:
    approot = os.path.dirname(os.path.realpath('__file__'))
except NameError:  # if it is the main script, not a module
    import sys
    approot = os.path.dirname(os.path.abspath(sys.argv[0]))
DATA_PATH = approot + DATA_FILE

# Read in MEDLINE formatted text
%time papers = read_medline_data_raw(DATA_PATH)
papers.to_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update')
papers.to_csv('data/papers_pain_EngFilter_6_5_2017_raw_oct17update.csv')
print("Cell completed at: " + time.strftime("%c"))

Cell started at: Thu May 24 17:21:47 2018
CPU times: user 132 ms, sys: 4.35 ms, total: 136 ms
Wall time: 371 ms
Cell completed at: Thu May 24 17:21:47 2018


In [8]:
papers = pd.read_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update')

In [9]:
papers.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 5 columns):
abstract        61 non-null object
articletitle    61 non-null object
journaltitle    61 non-null object
pmid            61 non-null object
pubdate         61 non-null object
dtypes: object(5)
memory usage: 2.5+ KB


In [10]:
pubdatevalues = papers.pubdate.unique()
#pubdatevalues.tofile("pubdatevalues.csv", sep=',') Examine range of pubdates manually in csv

In [11]:
papers['pubdate_dtformat'] = pd.to_datetime(papers.pubdate, format='%Y', exact=False)
papers['pubyear'] = pd.DatetimeIndex(papers['pubdate_dtformat']).year
papers=papers.drop('pubdate_dtformat', axis=1)

In [12]:
papers.tail(10)

Unnamed: 0,abstract,articletitle,journaltitle,pmid,pubdate,pubyear
51,Non-steroidal anti-inflammatory drugs (NSAIDs)...,Anti-inflammatory and antinociceptive activiti...,International immunopharmacology,21855654,2011 Nov,2011
52,We investigated the changes in characteristics...,Expression of inflammatory and apoptosis facto...,International immunopharmacology,21821152,2011 Nov,2011
53,"INTRODUCTION: Flupirtine, a nonopioid analgesi...",Efficacy and tolerability of flupirtine in sub...,International journal of clinical pharmacology...,22011688,2011 Nov,2011
54,Post-herpetic neuralgia means pain which occur...,Modified Jaipur block for the treatment of pos...,International journal of dermatology,22004501,2011 Nov,2011
55,Antipsychotic drugs are the clinical standard ...,Dynamic regulation of dopamine and serotonin r...,The international journal of neuropsychopharma...,21281560,2011 Nov,2011
56,"BACKGROUND: Sex, race, and age disparities in ...",Patient demographic characteristics and facial...,International journal of nursing studies,21596378,2011 Nov,2011
57,BACKGROUND: Even though the use of a 25 gauge ...,Comparison of post-dural puncture headache and...,International journal of nursing studies,21561619,2011 Nov,2011
58,PURPOSE: We analyzed variables associated with...,Predictors of long-term toxicity using three-d...,"International journal of radiation oncology, b...",20933342,2011 Nov 01,2011
59,PURPOSE: To correlate computed tomography (CT)...,Correlation of computed tomography imaging fea...,"International journal of radiation oncology, b...",20889265,2011 Nov 01,2011
60,Rheumatoid arthritis (RA) is one of the inflam...,The antinociceptive efficacy of HWTX-I epidura...,International journal of sports medicine,22052031,2011 Nov,2011


In [13]:
papers.loc[papers['pubdate']=='2011 Nov']

Unnamed: 0,abstract,articletitle,journaltitle,pmid,pubdate,pubyear
0,OBJECTIVE: To define the value of multislice c...,Value of multislice computed tomography in the...,European journal of radiology,20719444,2011 Nov,2011
1,OBJECTIVES: To assess the feasibility of T2 ma...,Quantitative in vivo MRI evaluation of lumbar ...,European radiology,21748388,2011 Nov,2011
2,Intraosseous lipoma is an uncommon mesenchymal...,Intraosseous lipoma presenting as a sphenoid s...,European review for medical and pharmacologica...,22195370,2011 Nov,2011
3,OBJECTIVE: Aim of the present work is to asses...,Combined occlusal and pharmacological therapy ...,European review for medical and pharmacologica...,22195362,2011 Nov,2011
4,PURPOSE: There are few prospective studies on ...,"Surgery improves pain, function and quality of...",European spine journal : official publication ...,21706361,2011 Nov,2011
5,INTRODUCTION: Chronic pain has an impact on ps...,The effect of repeated restraint stress in pai...,European spine journal : official publication ...,21698480,2011 Nov,2011
6,PURPOSE: TNFalpha is an inflammatory mediator ...,Etanercept attenuates pain-related behavior fo...,European spine journal : official publication ...,21633793,2011 Nov,2011
7,It is becoming increasingly evident that peopl...,Effects of experimentally induced low back pai...,Experimental brain research,21952791,2011 Nov,2011
8,"AIMS: Gastrointestinal symptoms such as pain, ...",Effects of isolated hyperinsulinaemia on senso...,Experimental and clinical endocrinology & diab...,22068552,2011 Nov,2011
9,BACKGROUND: The clinical picture of pituitary ...,Hypopituitarism in a HIV affected patient.,Experimental and clinical endocrinology & diab...,21922454,2011 Nov,2011


In [14]:
papers.journaltitle.nunique()

39

In [15]:
papers.to_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update_Mar18year')
papers = pd.read_pickle('data/papers_pain_EngFilter_6_5_2017_raw_oct17update_Mar18year')

In [16]:
# Text Cleanup & Tokenization

In [17]:
## Remove "all-caps:" section headers, punctuation marks, numbers, and option for words in all capital letters

In [18]:
# This function will use regular expressions to remove all:
#  (1) words/phrases in all caps followed by :, (if filter_all_caps = True, eveything all-caps will be removed)     
#  (2) numbers
#  (3) one letter words (such as n or p or other mathematical symbols)
#  (4) punctuation marks
# return value is the list of all filtered words (except for numbers), and the clean abstracts

# A utility function for flattening the lists
def flatten(l):
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str):
            for sub in flatten(el):
                yield sub
        else:
            yield el

# Filter text as described above            
def filterText(abstracts, filter_all_caps = False):    
    #define the regex pattern
    regex = re.compile('[%s]' % re.escape(re.sub('-', '', string.punctuation)))
    if filter_all_caps:
        pattern = regex.pattern + r"|\b[A-Z]{2,}|[0-9]+|\b\w\b|\d-\d|\W-\W|\s-\s" 
    else:
        pattern = regex.pattern + r"|(((\s|^)[A-Z]{2,}(,)*)*)(\s|^)(\b[A-Z]{2,}):|[0-9]+|\b\w\b|\d-\d|\W-\W|\s-\s" 
    
    #The list of all filtered words (except for numbers)
    # convert to a falttened set for faster results
    all_filtered_series = abstracts.apply(lambda d: re.findall(pattern, d))
    all_filtered = set(flatten(all_filtered_series.tolist()))
    non_digit_filtered = [y for y in all_filtered if not y.isdigit()]
    
    #remove whatever needs to be filtered
    abstracts = abstracts.apply(lambda d: re.sub(pattern, '', d))
    abstracts = abstracts.apply(lambda d: re.sub('s-s', '', d))
    
    return abstracts, non_digit_filtered

In [19]:
## Remove stop words

In [20]:
# This function will replace all the stop words, it also gets rid of the whitespaces
def removeWord(text, stop_list):
    new_text = []
    
    #first convert to a list
    text_list = str.split(text)
    
    #check for stop words
    for x in text_list:
        if x not in stop_list:
            new_text.append(x)
    
    #convert back to string
    new_text_str = ' '.join(new_text)
    
    return new_text_str

# This function will remove stop words using NLTK stop word list
# It will return processed abstracts and stop word list
def removeStopWords(abstracts):         
    # use NLTK's default stop word list
    stop_set = set(stopwords.words('english'))
    
    # find all stop words and remove them
    abstracts = abstracts.apply(lambda d: removeWord(d, stop_set))
    
    return abstracts, stop_set

In [21]:
## Convert to lower case, tokenize

In [22]:
# This fuction will convert everything to lower-case
def changeToLowerCase(abstracts):  
    abstracts = abstracts.apply(lambda d: d.lower())
    return abstracts

In [23]:
# This function will tokenize all sentences
def tokenizeAbstracts(abstracts):  
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = abstracts.apply(lambda d: tokenizer.tokenize(d))
    return tokens

In [24]:
## Combine Preprocess Functions

In [25]:
def preprocessAbstracts(abstracts):    
    # Remove all-caps words, numbers, 1-letter words, punctuation marks 
    abstracts, filtered = filterText(abstracts, filter_all_caps= False)
    
    # conevrt to lower-case
    abstracts = changeToLowerCase(abstracts)
    
    # Remove stop words
    abstracts, stop_list = removeStopWords(abstracts)
    filtered = filtered + list(stop_list)
    
    # tokenize
    tokens = tokenizeAbstracts(abstracts)
    
    return abstracts, tokens, filtered

In [26]:
# This can be helpful for word2vec models that need minimal preprocessing
def minimallyPreprocessAbstracts(abstracts):
    #define the regex pattern
    regex = re.compile('[%s]' % re.escape(re.sub('-', '', string.punctuation)))
    pattern = regex.pattern + r"|[0-9.]+" 
    
    #remove whatever needs to be filtered
    abstracts = abstracts.apply(lambda d: re.sub(pattern, '#NUM', d))
    abstracts = abstracts.apply(lambda d: re.sub('s-s', '', d))
    return abstracts

In [27]:
## Apply preprocessing

In [28]:
# preprocess the abstracts
papers[CLEANED_ABSTRACT_COL], papers[TOKENIZED_CLEAN_ABS_COL], filtered = preprocessAbstracts(papers.abstract)
papers[CLEANED_MINIMAL_ABSTRACT_COL] = minimallyPreprocessAbstracts(papers.abstract)
%time papers[TOKENIZED_RAW_ABS_COL] = tokenizeAbstracts(papers.abstract)
# pre-compute the token counts
papers[TOKEN_COUNT_CLEAN_ABS_COL] =papers[TOKENIZED_CLEAN_ABS_COL].map(lambda text: len(text))
papers[TOKEN_COUNT_RAW_ABS_COL] =papers[TOKENIZED_RAW_ABS_COL].map(lambda text: len(text))
papers[TOKEN_COUNT_Unq_CLEAN_ABS_COL] =papers[TOKENIZED_CLEAN_ABS_COL].map(lambda text: len(set(text)))

CPU times: user 4.26 ms, sys: 26 µs, total: 4.28 ms
Wall time: 4.29 ms


In [29]:
#Show list of filtered terms
print(filtered)

['', 'OPINION', 'METHODS', 'INTRODUCTION', 'AIMS', 'MATERIALS', ' AND', ' METHODS AND', 'METHOD', ' MATERIALS AND', 'BACKGROUND', ' RESULTS AND', 'COVERED', 'PARTICIPANTS', ' AREAS', 'PURPOSE', ' ', 'PATIENTS', 'AIM', ' DESIGN AND', 'SETTINGS', ' CASE', ' PATIENTS AND', 'OBJECTIVE', 'UNLABELLED', 'CONCLUSION', 'BACKGROUND AND', 'OBJECTIVES', 'RESULTS', 'DESIGN', 'LIMITATIONS', 'REPORT', ' EXPERT', 'CONCLUSIONS', 'itself', 'shan', 'was', 'wouldn', 'didn', "isn't", 'then', "won't", 'when', 'ain', 'into', 'he', 'there', 'her', "doesn't", 'ours', 'has', 'as', 'm', 'needn', 'both', 'herself', 'll', 'mightn', 'that', 'yourself', 'other', 'hers', 'himself', 'very', "haven't", 'the', "don't", 'wasn', 'are', 'after', 'over', 'and', 've', 'yours', 'here', 'through', 'those', 'shouldn', "it's", "aren't", 'hadn', 'being', 'this', 'so', 'having', "you've", 'now', 'you', 'or', 'where', 'be', 'she', 'by', 'my', 'a', 'an', 'him', 'most', "hadn't", 'own', 'o', "couldn't", "she's", "shan't", 'if', 'myse

In [3]:
#papers.to_pickle('data/papapers_pain_EngFilter_6_5_2017_raw_preprocessed_oct17update_Mar18years')
%time papers = pd.read_pickle('data/papapers_pain_EngFilter_6_5_2017_raw_preprocessed_oct17update_Mar18years')

CPU times: user 11.6 s, sys: 4.04 s, total: 15.7 s
Wall time: 24.7 s


In [4]:
papers

Unnamed: 0,abstract,articletitle,journaltitle,pmid,pubdate,pubyear,cleaned_abstract,tokenized_c_abstract,cleaned_minimal_abstract,tokenized_r_abstract,token_count_c_abstract,token_count_r_abstract,token_count_unq_c_abstract
0,pH-sensitive nonionic surfactant vesicles (nio...,pH-sensitive niosomes: Effects on cytotoxicity...,Journal of enzyme inhibition and medicinal che...,28114822,2017 Dec,2017,ph-sensitive nonionic surfactant vesicles nios...,"[ph, sensitive, nonionic, surfactant, vesicles...",pH-sensitive nonionic surfactant vesicles #NUM...,"[pH, sensitive, nonionic, surfactant, vesicles...",114,162,90
1,BACKGROUND: The prevalence of migraine with co...,"Level of physical activity, well-being, stress...",The journal of headache and pain,28421374,2017 Dec,2017,prevalence migraine co-existing tension-type h...,"[prevalence, migraine, co, existing, tension, ...",BACKGROUND#NUM The prevalence of migraine with...,"[BACKGROUND, The, prevalence, of, migraine, wi...",228,347,80
2,BACKGROUND: Identification of subgroups of pat...,Identification of subgroups of patients with t...,The journal of headache and pain,28401498,2017 Dec,2017,identification subgroups patients different le...,"[identification, subgroups, patients, differen...",BACKGROUND#NUM Identification of subgroups of ...,"[BACKGROUND, Identification, of, subgroups, of...",183,292,110
3,BACKGROUND: Burning Mouth Syndrome (BMS) is a ...,Refractory burning mouth syndrome: clinical an...,The journal of headache and pain,28357703,2017 Dec,2017,burning mouth syndrome bms chronic pain condit...,"[burning, mouth, syndrome, bms, chronic, pain,...",BACKGROUND#NUM Burning Mouth Syndrome #NUMBMS#...,"[BACKGROUND, Burning, Mouth, Syndrome, BMS, is...",149,262,114
4,BACKGROUND: Despite their huge epidemiological...,Botulinum neurotoxin type A for the treatment ...,The journal of headache and pain,28324318,2017 Dec,2017,despite huge epidemiological impact primary he...,"[despite, huge, epidemiological, impact, prima...",BACKGROUND#NUM Despite their huge epidemiologi...,"[BACKGROUND, Despite, their, huge, epidemiolog...",113,168,85
5,BACKGROUND: Calcitonin gene-related peptide (C...,Calcitonin gene-related peptide and pain: a sy...,The journal of headache and pain,28303458,2017 Dec,2017,calcitonin gene-related peptide cgrp widely di...,"[calcitonin, gene, related, peptide, cgrp, wid...",BACKGROUND#NUM Calcitonin gene-related peptide...,"[BACKGROUND, Calcitonin, gene, related, peptid...",154,244,100
6,BACKGROUND: The aim of this study was to inves...,Increased levels of intramuscular cytokines in...,The journal of headache and pain,28243900,2017 Dec,2017,aim study investigate cytokine levels masseter...,"[aim, study, investigate, cytokine, levels, ma...",BACKGROUND#NUM The aim of this study was to in...,"[BACKGROUND, The, aim, of, this, study, was, t...",179,342,92
7,BACKGROUND: Cardiac cephalalgia (CC) is a rare...,Cardiac cephalalgia: one case with cortical hy...,The journal of headache and pain,28220375,2017 Dec,2017,cardiac cephalalgia cc rare disease occurring ...,"[cardiac, cephalalgia, cc, rare, disease, occu...",BACKGROUND#NUM Cardiac cephalalgia #NUMCC#NUM ...,"[BACKGROUND, Cardiac, cephalalgia, CC, is, a, ...",93,173,68
8,"BACKGROUND: In the emergency setting, non-trau...",Copeptin for risk stratification in non-trauma...,The journal of headache and pain,28197843,2017 Dec,2017,emergency setting non-traumatic headache benig...,"[emergency, setting, non, traumatic, headache,...",BACKGROUND#NUM In the emergency setting#NUM no...,"[BACKGROUND, In, the, emergency, setting, non,...",169,301,111
9,BACKGROUND: A better understanding of potentia...,The association of headache frequency with pai...,The journal of headache and pain,28185160,2017 Dec,2017,better understanding potential relationship mo...,"[better, understanding, potential, relationshi...",BACKGROUND#NUM A better understanding of poten...,"[BACKGROUND, A, better, understanding, of, pot...",176,278,81


In [10]:
#Test
print('Original Abstract: \n', papers.abstract[papers.pmid == test_pmid])
print('\nCleaned Abstract: \n', papers.loc[papers.pmid == test_pmid, CLEANED_ABSTRACT_COL])
print('\nTokenized Abstract: \n', papers.loc[papers.pmid == test_pmid, TOKENIZED_CLEAN_ABS_COL])
print('\nMinimally Processed Abstract: \n', papers.loc[papers.pmid == test_pmid, CLEANED_MINIMAL_ABSTRACT_COL])
papers[(papers.pmid == test_pmid)]

Original Abstract: 
 4    BACKGROUND: Despite their huge epidemiological...
Name: abstract, dtype: object

Cleaned Abstract: 
 4    despite huge epidemiological impact primary he...
Name: cleaned_abstract, dtype: object

Tokenized Abstract: 
 4    [despite, huge, epidemiological, impact, prima...
Name: tokenized_c_abstract, dtype: object

Minimally Processed Abstract: 
 4    BACKGROUND#NUM Despite their huge epidemiologi...
Name: cleaned_minimal_abstract, dtype: object


Unnamed: 0,abstract,articletitle,journaltitle,pmid,pubdate,pubyear,cleaned_abstract,tokenized_c_abstract,cleaned_minimal_abstract,tokenized_r_abstract,token_count_c_abstract,token_count_r_abstract,token_count_unq_c_abstract
4,BACKGROUND: Despite their huge epidemiological...,Botulinum neurotoxin type A for the treatment ...,The journal of headache and pain,28324318,2017 Dec,2017,despite huge epidemiological impact primary he...,"[despite, huge, epidemiological, impact, prima...",BACKGROUND#NUM Despite their huge epidemiologi...,"[BACKGROUND, Despite, their, huge, epidemiolog...",113,168,85


In [11]:
# Topic Modeling with LDA

In [12]:
## Create Corpus, Dictionary, and Related Text Constructs

In [13]:
# create a dictionary mapping words to ids
abstract_list = papers[TOKENIZED_CLEAN_ABS_COL].values
ca_gs_dictionary = corpora.Dictionary(abstract_list)

#remove extremes (similar to tf-idf)
ca_gs_dictionary.filter_extremes(no_below=1, no_above=0.8)
ca_gs_dictionary.compactify()

In [14]:
#convert the dictionary to a bag of words
ca_gs_corpus = [ca_gs_dictionary.doc2bow(words) for words in abstract_list]
# corpora.mmcorpus.MmCorpus.serialize('data/ca_gensim.mm', corpus)
ca_gs_tfidf = models.TfidfModel(ca_gs_corpus)

In [15]:
## Topic Modeling via LDA

In [16]:
### Determine optimal number of topics through coherence testing

In [19]:
def LDA_compute_coherence_values(dictionary, corpus, texts, limit):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(2, limit, 6):
        model = models.ldamulticore.LdaMulticore(corpus=corpus, num_topics=num_topics, id2word=dictionary, passes=5, 
                                                 workers=16)
        model_list.append(model)
        coherencemodel = models.CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
LDA_model_list, LDA_coherence_values = LDA_compute_coherence_values(dictionary=ca_gs_dictionary, corpus=ca_gs_corpus, texts=abstract_list, limit=50)

In [None]:
print(LDA_model_list)
print(LDA_coherence_values)

In [None]:
with open('LDA_model_list.pkl', 'wb') as f:
    pickle.dump(LDA_model_list, f)
    
with open('LDA_coherence_values.pkl', 'wb') as f:
    pickle.dump(LDA_coherence_values, f)

In [None]:
limit=50
x = range(2, limit, 6)
plt.plot(x, LDA_coherence_values)
plt.xlabel("Number of LDA Topics")
plt.ylabel("Coherence score")
# plt.legend(("LDA_coherence_values"), loc='best')
plt.show()

In [None]:
for m, cv in zip(x, LDA_coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
num_of_LDA_topics = 40 #Derived from coherence testing of topic counts from 2 to 50 for LDA

In [None]:
### Run LDA with 40 topics

In [None]:
# %time lda = models.LdaModel(ca_gs_corpus, num_topics=num_of_LDA_topics, id2word =ca_gs_dictionary, update_every=5, passes=100)
%time lda_mc = models.ldamulticore.LdaMulticore(corpus=ca_gs_corpus, num_topics=num_of_LDA_topics, id2word=ca_gs_dictionary, passes=100, workers=16)

In [None]:
lda_mc.save("/data/mc_model_40topic_bharadwaj.lda")
lda_mc=gensim.models.LdaModel.load("./data/mc_model_40topic_bharadwaj.lda")

In [None]:
## Doc-Topic distribution

In [None]:
# Get document topics
all_topics = lda_mc.get_document_topics(ca_gs_corpus, minimum_probability=0)
all_topics[0]

In [None]:
## Prepare the Input files for Tensorboard

In [None]:
import gensim
import pandas as pd
import smart_open
import random
from smart_open import smart_open

In [None]:
# create file for tensors
with smart_open('doc_lda_tensor.tsv','w') as w:
    for doc_topics in all_topics:
        for topics in doc_topics:
            w.write(str(topics[1])+ "\t")
        w.write("\n")
        
# create file for metadata
with smart_open('doc_lda_metadata.tsv','w') as w:
    w.write('articletitle\tjournaltitle\n')
    for j, k in zip(papers.articletitle, papers.journaltitle):
        w.write("%s\t%s\n" % (j, k))

In [None]:
tensors = []
for doc_topics in all_topics:
    doc_tensor = []
    for topic in doc_topics:
        if round(topic[1], 3) > 0:
            doc_tensor.append((topic[0], float(round(topic[1], 3))))
    # sort topics according to highest probabilities
    doc_tensor = sorted(doc_tensor, key=lambda x: x[1], reverse=True)
    # store vectors to add in metadata file
    tensors.append(doc_tensor[:5])

# overwrite metadata file
i=0
with smart_open('doc_lda_metadata.tsv','w') as w:
    w.write('articletitle\tjournaltitle\n')
    for j,k in zip(papers.articletitle, papers.journaltitle):
        w.write("%s\t%s\n" % (''.join((str(j), str(tensors[i]))),k))
        i+=1

In [None]:
# we upload the previous tensor file "doc_lda_tensor.tsv" and this new metadata file to http://projector.tensorflow.org/ . 

In [None]:
#Visualize using T-SNE [ Just change the tab in left ]

In [None]:
lda_mc.show_topic(topicid=0, topn=15)