In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import dtale
import pymongo
import csv
import scispacy
import spacy
from sklearn.utils import parallel_backend   
nlp = spacy.load("en_core_sci_lg")
warnings.filterwarnings('ignore')

In [11]:
import gensim.corpora as corpora

## Base functions

In [2]:
def get_data():

    """import data from MongoDB"""

    myclient = pymongo.MongoClient("mongodb+srv://lucas-deepen:DSIqP935gtFobYc2@cluster0.ixkyxa7.mongodb.net/?retryWrites=true&w=majority")
    mydb = myclient["cleanpapers"]
    mycol = mydb["cleanedf"]
    mydoc = mycol.find({}, {"_id":1,"articleTitle":1,"abstract":1,"pubDate":1,"affiliations":1})

    print('----------Data imported----------')

    return mydoc

In [3]:
def dataframe(mydoc,length=132820):

    """convert mongodb data to dataframe (full = 132820 rows)"""
    
    # data to dataframe and limit length

    df = pd.DataFrame(list(mydoc)).set_index(['_id'])

    df = df[df.abstract != '.'].iloc[:length,:]

    # extract year from the pubDate column

    df['pubDate'] = df['pubDate'].str.extract(r'(\d{4})')

    print ('----------DataFrame created----------')

    print (df.head(15))

    return df

In [4]:
def cleaning(text):

    """cleaning function for the abstract"""
    
    # extract medical terms
      
    doc = nlp(text)
    
    doc_string = " ".join(str(a) for a in doc.ents)

    # transform abtract words into lower case

    words = doc_string.lower()

    # remove punctuations

    for punctuation in string.punctuation:

        words = words.replace(punctuation,'')

    # remove digits

    words = ''.join(char for char in words if not char.isdigit())

    # tokenize sentences

    tokenized_text = word_tokenize(words)

    # remove stop words

    stop_words = set(stopwords.words('english'))


    tokenized_sentence_cleaned = [w for w in tokenized_text
                                if not w in stop_words]

    # standardize verbs

    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")
            for word in tokenized_sentence_cleaned]

    # standardize nouns

    noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
            for word in verb_lemmatized]
    
    # only words longer than 3 charachters:
    
    length_3 = [ word for word in noun_lemmatized if len(word) > 3 ]
    
    # re-join list into sentence

    cleaned_txt = " ".join(length_3)

    return cleaned_txt

In [32]:
def cleaning_ginsem(text):

    """cleaning function for the abstract"""
    
    # extract medical terms
      
    doc = nlp(text)
    
    doc_string = " ".join(str(a) for a in doc.ents)

    # transform abtract words into lower case

    words = doc_string.lower()

    # remove punctuations

    for punctuation in string.punctuation:

        words = words.replace(punctuation,'')

    # remove digits

    words = ''.join(char for char in words if not char.isdigit())

    # tokenize sentences

    tokenized_text = word_tokenize(words)

    # remove stop words

    stop_words = set(stopwords.words('english'))


    tokenized_sentence_cleaned = [w for w in tokenized_text
                                if not w in stop_words]

    # standardize verbs

    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")
            for word in tokenized_sentence_cleaned]

    # standardize nouns

    noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
            for word in verb_lemmatized]
    
    # only words longer than 3 charachters:
    
    length_3 = [ word for word in noun_lemmatized if len(word) > 3 ]
    
    return length_3

In [9]:
def clean(df):
    
    """clean abstract"""

    df_ = df.copy()

    # apply clean function to abstracts

    df_.abstract = df_.abstract.astype(str).apply(cleaning)
    
    return df_

In [31]:
def clean_ginsem(df):
    
    """clean abstract"""

    df_ = df.copy()

    # apply clean function to abstracts

    df_.abstract = df_.abstract.astype(str).apply(cleaning_ginsem)
    
    return df_

In [18]:
def tokenize(df):

    """generate tokenized dataframe"""

    # intitialize vectorizer model

    tfidf_vectorizer = TfidfVectorizer(use_idf=True,
                                analyzer='word',
                                stop_words='english',
                                max_df=0.6,min_df=0.01)#,
                                #max_features=10000)

    # fit_transform abstract

    tfidf_abstract = tfidf_vectorizer.fit_transform(df.abstract)

    # create data frame with columns names

    weighted_words = pd.DataFrame(tfidf_abstract.toarray(),
                columns = tfidf_vectorizer.get_feature_names(),index=df.index).round(2)

    print ('----------Abstract tokenized----------')

    print (weighted_words.head(15))

    return weighted_words

## Tokenize

In [6]:
data = get_data()

----------Data imported----------


In [7]:
df = dataframe(data)

----------DataFrame created----------
                                                   abstract  \
_id                                                           
34314384  Intracortical microelectrode arrays (MEA) can ...   
33996894  Medulloblastoma is the most common malignant c...   
33862118  Nod-like receptor family pyrin domain containi...   
33691255  Mice with chronic cochlear implants can signif...   
33332038  An Auditory Brainstem Implant (ABI) is a techn...   
31201186  Tinnitus may have a very severe impact on the ...   
35509538  Manufacturing of customized three-dimensional ...   
35024600  Injectable hydrogel has the advantage to fill ...   
34425566  The evaluation of the long-term stability of E...   
33762926  Mitochondria are organelles responsible for bi...   
33647494  Evolutions in cranioplasty have allowed for th...   
33431445  A 42-year-old woman presented with fever, left...   
33318954  An estimated 3.8 million traumatic brain injur...   
33025785  Modern 

In [10]:
clean_abstract = clean(df)

In [19]:
token = tokenize(clean_abstract)

----------Abstract tokenized----------
          aberrant  ability  abnormal  abnormality  absence  access  \
_id                                                                   
34314384       0.0      0.0      0.00          0.0      0.0     0.0   
33996894       0.0      0.0      0.00          0.0      0.0     0.0   
33862118       0.0      0.0      0.00          0.0      0.0     0.0   
33691255       0.0      0.0      0.00          0.0      0.0     0.0   
33332038       0.0      0.0      0.00          0.0      0.0     0.0   
31201186       0.0      0.0      0.00          0.0      0.0     0.0   
35509538       0.0      0.0      0.00          0.0      0.0     0.0   
35024600       0.0      0.0      0.00          0.0      0.0     0.0   
34425566       0.0      0.0      0.00          0.0      0.0     0.0   
33762926       0.0      0.0      0.09          0.0      0.0     0.0   
33647494       0.0      0.0      0.00          0.0      0.0     0.0   
33431445       0.0      0.0      0.00 

In [20]:
token

Unnamed: 0_level_0,aberrant,ability,abnormal,abnormality,absence,access,accumulation,accuracy,accurate,acid,...,wildtype,window,woman,work,world,worsen,xray,year,young,younger
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34314384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33996894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33862118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33691255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33332038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35519270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35519265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35511603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35510871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## LDA ginsem

In [48]:
import gensim
from gensim.models import TfidfModel, LdaMulticore

In [39]:
text = list(clean_abstract.abstract.str.split())

In [40]:
id2word = corpora.Dictionary(text)

In [43]:
corpus = [id2word.doc2bow(text) for text in text]

In [51]:
tfidf = TfidfModel(corpus)

In [52]:
tfidf_corpus = tfidf[corpus[0]]

In [53]:
lda_ginsem = LdaMulticore(tfidf_corpus, id2word=id2word, num_topics=10,workers=3)

Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/home/lbamagalhaes/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/lbamagalhaes/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/lbamagalhaes/.pyenv/versions/3.8.12/lib/python3.8/multiprocessing/pool.py", line 109, in worker
    initializer(*initargs)
  File "/home/lbamagalhaes/.pyenv/versions/3.8.12/envs/DeepSearch/lib/python3.8/site-packages/gensim/models/ldamulticore.py", line 346, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/home/lbamagalhaes/.pyenv/versions/3.8.12/envs/DeepSearch/lib/python3.8/site-packages/gensim/models/ldamodel.py", line 767, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/home/lbamagalhaes/.pyenv/versions/3.8.12/envs/DeepSearch/lib/python3.8/site-packages/gen

KeyboardInterrupt: 

## LDA

In [21]:
from sklearn.decomposition import LatentDirichletAllocation

def LDA(token,n_components,max_iter):
    
    lda_model = LatentDirichletAllocation(n_components=n_components,max_iter=max_iter,n_jobs=-1,learning_method='online')
    lda_model.fit(token)
    
    return lda_model  

In [22]:
def topics(model,token,topwords):
    
    topic_mixture = pd.DataFrame(model.components_,columns = token.columns)
    
    n_components = topic_mixture.shape[0]
    
    for topic in range(n_components):
        print('-'*10)
        print(f"For topic {topic}, here are the top {topwords} words with weights:")
        
        topic_df = topic_mixture.iloc[topic].sort_values(ascending = False).head(topwords)
        
        print(round(topic_df,5))

## LDA 15 

In [60]:
lda_15 = ""

with parallel_backend("threading"):
    lda_15 = LDA(token,15,100)
    
lda_t_15 = lda_15.transform(token)

lda_s_15 = pd.DataFrame(lda_15.components_,columns = token.columns)

lda_s_15

Unnamed: 0,aberrant,ability,abnormal,abnormality,absence,access,accumulation,accuracy,accurate,acid,...,wildtype,window,woman,work,world,worsen,xray,year,young,younger
0,1.579128,46.793485,24.774873,17.391955,53.058212,45.174439,47.69771,425.580068,202.353139,55.084264,...,0.066677,138.807112,36.047599,47.34346,14.033748,0.066669,0.06667,80.502293,0.066679,0.066671
1,0.066667,0.06671,0.066667,0.066667,0.066671,0.066667,0.066668,0.066671,0.066667,0.066667,...,0.066668,0.06667,0.066668,0.066667,0.066669,0.06667,0.066669,0.066673,0.066668,0.066673
2,0.066671,47.413858,2.243876,0.066667,0.0667,106.807829,0.066668,0.066684,63.429375,2.491925,...,0.066667,0.06667,0.066677,166.093861,145.250736,0.066671,0.066667,176.986564,38.232805,0.066674
3,0.066667,0.066675,5.417731,0.066667,0.066672,0.066667,0.066667,0.066667,0.066731,0.066667,...,0.066668,0.066668,0.066674,0.066667,0.066683,0.066676,300.94676,0.066667,0.066674,0.066676
4,0.066667,0.06668,0.066667,0.066667,0.066672,0.066682,29.585945,0.066671,0.066668,0.066667,...,0.066669,0.066669,0.066686,0.066667,0.066671,0.066683,0.066668,0.066667,0.066669,0.06667
5,5.698169,28.88421,28.141408,36.003787,0.0667,45.187561,6.447842,0.06668,0.066676,61.01705,...,0.066667,9.256079,777.571024,87.78896,45.414033,134.388885,0.066667,816.992366,142.947102,0.066701
6,0.066668,12.157016,0.066668,0.066674,18.781942,0.06668,0.066667,0.066672,0.066685,0.066667,...,0.066667,0.0667,23.629183,0.066667,0.066668,0.066678,0.066667,0.066667,87.820877,113.207261
7,125.109771,0.066728,168.760372,152.847711,120.069622,0.066678,347.301542,0.066674,19.937791,283.803508,...,207.770893,39.219138,28.923273,22.498716,0.066683,0.06668,0.066667,97.491312,66.941091,0.066672
8,0.066667,0.06667,0.066667,0.066667,0.066669,0.066673,0.066667,0.066667,0.066667,0.066667,...,0.066669,0.066667,0.066667,0.066667,0.066676,0.066669,0.066669,0.066667,0.066674,0.066678
9,0.066667,0.066667,0.066672,0.066668,47.234031,0.066673,0.066667,23.838605,0.066672,0.066667,...,0.066668,0.066667,0.066676,0.066667,0.066675,0.066669,0.066667,8.764892,0.066667,0.066693


In [62]:
topics(lda_15,token,20)

----------
For topic 0, here are the top 20 words with weights:
image           2246.80257
method           969.25154
voltage          755.99619
tissue           745.12686
optical          739.62439
technique        655.54177
microscopy       634.78872
device           562.24732
measurement      560.25703
fluorescence     550.18901
scan             516.68934
model            515.95894
multiphoton      500.58428
resolution       497.62451
application      489.44397
detection        475.35125
study            466.74101
structure        455.12617
cell             452.80362
probe            451.82260
Name: 0, dtype: float64
----------
For topic 1, here are the top 20 words with weights:
tumor           971.91400
cancer          964.64123
dose            499.22471
radiation       421.14811
thickness       255.52472
professional    250.02916
survival        200.10479
prognosis       129.50119
surgical        125.92055
patient         109.62059
therapy          88.74994
overall          73.24

In [63]:
score_15 = pd.DataFrame(lda_t_15,index=df.index)
score_15[[0]].sort_values(by=0,ascending=False)

Unnamed: 0_level_0,0
_id,Unnamed: 1_level_1
20058907,0.871795
28426053,0.871264
28528636,0.869464
25117276,0.869464
23760022,0.869280
...,...
35353173,0.007318
33551785,0.007153
34215338,0.007092
33480172,0.007010


In [81]:
lda_topic_15 = list(score_15.columns)
lda_topic_15

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]

In [66]:
topic_15 = pd.DataFrame(score_15.idxmax(axis=1),columns=['Topic'])
topic_15

Unnamed: 0_level_0,Topic
_id,Unnamed: 1_level_1
34314384,0
33996894,12
33862118,7
33691255,7
33332038,14
...,...
35519270,0
35519265,0
35511603,0
35510871,0


In [64]:
df.loc['20058907']['abstract']

'In microelectronics and biology, many fundamental processes involve the exchange of charges between small objects, such as nanocrystals in photovoltaic blends or individual proteins in photosynthetic reactions. Because these nanoscale electronic processes strongly depend on the structure of the electroactive assemblies, a detailed understanding of these phenomena requires unraveling the relationship between the structure of the nano-object and its electronic function. Because of the fragility of the structures involved and the dynamic variance of the electric potential of each nanostructure during the charge generation and transport processes, understanding this structure-function relationship represents a great challenge. This Account discusses how our group and others have exploited scanning probe microscopy based approaches beyond imaging, particularly Kelvin probe force microscopy (KPFM), to map the potential of different nanostructures with a spatial and voltage resolution of a f

## LDA 10

In [20]:
lda_10 = ""
with parallel_backend("threading"):
    lda_10 = LDA(token)

In [21]:
lda_t_10 = lda_10.transform(token)

In [22]:
lda_s_10 = pd.DataFrame(lda_10.components_,columns = token.columns)

In [23]:
lda_s_10

Unnamed: 0,aberrant,ability,abnormal,abnormality,absence,access,accumulation,accuracy,accurate,acid,...,wildtype,window,woman,work,world,worsen,xray,year,young,younger
0,0.100165,0.100151,0.100107,0.100108,7.794707,0.1005,48.952836,13.746343,5.109564,180.089281,...,0.100197,0.100245,243.777249,58.206646,0.100715,0.100637,0.100124,51.237124,112.494963,199.742963
1,61.021342,0.100757,78.214553,42.024183,109.383683,0.100301,314.89415,0.100098,0.100287,302.31611,...,205.916771,25.751668,0.10004,0.100539,25.08407,0.100629,0.100049,13.089628,0.100216,0.10004
2,0.100462,226.010703,21.427191,0.1002,60.827819,71.457449,23.589436,531.499876,166.694922,3.88517,...,0.10004,94.128858,0.100434,351.772642,140.64791,0.100217,0.100024,233.120213,135.639649,0.100385
3,8.408496,16.255438,62.505999,90.868001,75.518001,80.448731,18.464976,109.98634,0.10052,0.100063,...,0.100042,32.586065,573.919004,0.100141,119.876136,159.135406,6.792173,1146.240432,157.915165,0.101154
4,69.878441,59.891688,84.902301,102.461807,59.036636,28.064186,0.100784,0.100312,58.351118,217.97385,...,69.23501,64.973507,0.100244,34.52196,0.101978,0.100317,0.100066,159.186975,29.693265,21.913696
5,0.100072,21.215623,17.103392,0.100202,22.190028,0.100276,54.682615,0.100643,118.853311,49.797346,...,0.100245,0.101253,0.100098,0.100186,0.100205,2.663579,294.287601,40.835298,0.100194,0.100062
6,39.498107,12.332745,39.597062,26.029622,35.09231,0.100415,0.100106,0.100068,0.100336,25.287756,...,28.336467,34.335293,0.100038,29.200851,0.100548,0.100068,0.100003,9.144176,32.214517,4.490262
7,0.100041,0.100017,0.100036,0.100073,0.100021,0.100044,7.889017,0.100057,0.100116,5.85466,...,0.100055,0.100413,0.100058,0.100235,0.100294,9.477345,0.100027,0.100023,0.100017,0.100503
8,30.280322,41.236915,122.404009,187.427484,48.506932,0.100047,20.507408,0.100088,0.100619,8.31379,...,0.100105,0.100332,60.174917,70.934322,2.914646,44.437661,0.100018,127.864046,55.822428,0.10115
9,0.100002,0.100099,0.100017,0.100066,0.10002,38.401275,0.100085,0.100262,4.772711,0.100017,...,0.100002,0.100293,0.100016,20.036217,0.100073,0.100011,0.100019,0.100029,0.100055,0.100057


In [113]:
topics(lda_10,token,100)

----------
For topic 0, here are the top 100 words with weights:
dose         848.60236
stress       804.78039
effect       685.24840
exposure     590.01471
level        579.64520
               ...    
lipid        141.09413
total        139.99795
disorder     138.31209
condition    137.52919
product      136.85991
Name: 0, Length: 100, dtype: float64
----------
For topic 1, here are the top 100 words with weights:
cell           1664.41664
mouse          1261.91419
protein        1213.26656
receptor       1115.15123
channel         970.73607
                  ...    
assay           233.09467
damage          230.84626
molecule        230.12977
subunit         229.97462
dysfunction     229.07785
Name: 1, Length: 100, dtype: float64
----------
For topic 2, here are the top 100 words with weights:
model          1015.52723
study           971.24864
social          949.54015
task            946.71033
process         876.74326
                  ...    
design          309.88487
potential 

In [36]:
score_10 = pd.DataFrame(lda_t_10,index=df.index)

In [82]:
lda_topic_10 = list(score_10.columns)
lda_topic_10

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [45]:
score_10[[5]].sort_values(by=5,ascending=False)

Unnamed: 0_level_0,5
_id,Unnamed: 1_level_1
33834437,0.862577
35665220,0.858674
28528636,0.856231
26238063,0.854759
34331281,0.853837
...,...
33551785,0.010742
34215338,0.010650
33480172,0.010615
35730515,0.010524


In [55]:
df.loc['33834437']['abstract']

'Intravital microscopy has emerged as a powerful technique for the fluorescent visualization of cellular- and subcellular-level biological processes in vivo. However, the size of objective lenses used in standard microscopes currently makes it difficult to access internal organs with minimal invasiveness in small animal models, such as mice. Here we describe front- and side-view designs for small-diameter endoscopes based on gradient-index lenses, their construction, their integration into laser scanning confocal microscopy platforms, and their applications for in vivo imaging of fluorescent cells and microvasculature in various organs, including the kidney, bladder, heart, brain, and gastrointestinal tracts, with a focus on the new techniques developed for each imaging application. The combination of novel fluorescence techniques with these powerful imaging methods promises to continue providing novel insights into a variety of diseases.'

In [41]:
topic = pd.DataFrame(score_10.idxmax(axis=1),columns=['Topic'])
topic

Unnamed: 0_level_0,Topic
_id,Unnamed: 1_level_1
34314384,8
33996894,4
33862118,8
33691255,1
33332038,8
...,...
35519270,2
35519265,5
35511603,2
35510871,2


## List of topics per LDA

In [127]:
def topic_list(model,token,topwords):
    
    topic_mixture = pd.DataFrame(model.components_,columns = token.columns)
    
    n_components = topic_mixture.shape[0]
    
    topics = []
    
    for topic in range(n_components):
        
        topic_df = topic_mixture.iloc[topic].sort_values(ascending = False).head(topwords)
        
        topics.append(list(topic_df.index))
        
    return topics

## Similarity

In [132]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora

topic_10_100 = topic_list(lda,token,100)
topic_15_100 = topic_list(lda_15,token,100)
num_topics = [10,15]
LDA_models = {10:lda,15:lda_15}
LDA_topics = {10:topic_10_100,15:topic_15_100}

def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = (A ∩ B)/(A ∪ B)
    - Goal is low Jaccard scores for coverage of the diverse elements
    """
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))
                    
    return float(len(intersection))/float(len(union))


In [133]:
LDA_stability = {}

for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        print (topic1)
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            print (topic2)
            print (jaccard_similarity(topic1, topic2))
            sims.append(jaccard_similarity(topic1, topic2))    
            
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

['dose', 'stress', 'effect', 'exposure', 'level', 'male', 'group', 'increase', 'alcohol', 'female', 'anxiety', 'radiation', 'endothelial', 'food', 'glucose', 'diet', 'liver', 'concentration', 'exercise', 'healthcare', 'weight', 'study', 'behavior', 'plasma', 'decrease', 'oxidative', 'serum', 'obesity', 'oxygen', 'parent', 'intake', 'treatment', 'social', 'gender', 'control', 'metabolic', 'antioxidant', 'mouse', 'week', 'blood', 'drug', 'consumption', 'professional', 'diabetes', 'test', 'body', 'older', 'chronic', 'reduce', 'woman', 'kidney', 'administration', 'adolescent', 'associate', 'inflammation', 'water', 'reactive', 'adult', 'baseline', 'change', 'relapse', 'animal', 'reduction', 'younger', 'administer', 'measure', 'factor', 'response', 'energy', 'hormone', 'temperature', 'activity', 'protective', 'investigate', 'acid', 'risk', 'difference', 'depression', 'inflammatory', 'acute', 'evaluate', 'metabolism', 'longterm', 'metabolite', 'daily', 'model', 'mass', 'load', 'expose', 'beha

In [149]:
dirichlet_dict = corpora.Dictionary([token])
dirichlet_dict

<gensim.corpora.dictionary.Dictionary at 0x7fd7ca98eca0>

In [147]:
coherences = [CoherenceModel(model=LDA_models[i], texts=corpus, dictionary=dirichlet_dict, coherence='c_v').get_coherence()\
              for i in num_topics[:-1]]

ValueError: This topic model is not currently supported. Supported topic models should implement the `get_topics` method.