In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import dtale
import pymongo
import csv
import scispacy
import spacy
from sklearn.utils import parallel_backend   
nlp = spacy.load("en_core_sci_lg")
warnings.filterwarnings('ignore')

In [2]:
import gensim.corpora as corpora

## Base functions

In [3]:
def get_data():

    """import data from MongoDB"""

    myclient = pymongo.MongoClient("mongodb+srv://lucas-deepen:DSIqP935gtFobYc2@cluster0.ixkyxa7.mongodb.net/?retryWrites=true&w=majority")
    mydb = myclient["cleanpapers"]
    mycol = mydb["cleanedf"]
    mydoc = mycol.find({}, {"_id":1,"articleTitle":1,"abstract":1,"pubDate":1,"affiliations":1})

    print('----------Data imported----------')

    return mydoc

In [4]:
def dataframe(mydoc,length=132820):

    """convert mongodb data to dataframe (full = 132820 rows)"""
    
    # data to dataframe and limit length

    df = pd.DataFrame(list(mydoc)).set_index(['_id'])

    df = df[df.abstract != '.'].iloc[:length,:]

    # extract year from the pubDate column

    df['pubDate'] = df['pubDate'].str.extract(r'(\d{4})')

    print ('----------DataFrame created----------')

    print (df.head(15))

    return df

In [5]:
def cleaning(text):

    """cleaning function for the abstract"""
    
    # extract medical terms
      
    doc = nlp(text)
    
    doc_string = " ".join(str(a) for a in doc.ents)

    # transform abtract words into lower case

    words = doc_string.lower()

    # remove punctuations

    for punctuation in string.punctuation:

        words = words.replace(punctuation,'')

    # remove digits

    words = ''.join(char for char in words if not char.isdigit())

    # tokenize sentences

    tokenized_text = word_tokenize(words)

    # remove stop words

    stop_words = set(stopwords.words('english'))


    tokenized_sentence_cleaned = [w for w in tokenized_text
                                if not w in stop_words]

    # standardize verbs

    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")
            for word in tokenized_sentence_cleaned]

    # standardize nouns

    noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
            for word in verb_lemmatized]
    
    # only words longer than 3 charachters:
    
    length_3 = [ word for word in noun_lemmatized if len(word) > 3 ]
    
    # re-join list into sentence

    cleaned_txt = " ".join(length_3)

    return cleaned_txt

In [6]:
def cleaning_ginsem(text):

    """cleaning function for the abstract"""
    
    # extract medical terms
      
    doc = nlp(text)
    
    doc_string = " ".join(str(a) for a in doc.ents)

    # transform abtract words into lower case

    words = doc_string.lower()

    # remove punctuations

    for punctuation in string.punctuation:

        words = words.replace(punctuation,'')

    # remove digits

    words = ''.join(char for char in words if not char.isdigit())

    # tokenize sentences

    tokenized_text = word_tokenize(words)

    # remove stop words

    stop_words = set(stopwords.words('english'))


    tokenized_sentence_cleaned = [w for w in tokenized_text
                                if not w in stop_words]

    # standardize verbs

    verb_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "v")
            for word in tokenized_sentence_cleaned]

    # standardize nouns

    noun_lemmatized = [WordNetLemmatizer().lemmatize(word, pos = "n")  # n --> nouns
            for word in verb_lemmatized]
    
    # only words longer than 3 charachters:
    
    length_3 = [ word for word in noun_lemmatized if len(word) > 3 ]
    
    return length_3

In [7]:
def clean(df):
    
    """clean abstract"""

    df_ = df.copy()

    # apply clean function to abstracts

    df_.abstract = df_.abstract.astype(str).apply(cleaning)
    
    return df_

In [8]:
def clean_ginsem(df):
    
    """clean abstract"""

    df_ = df.copy()

    # apply clean function to abstracts

    df_.abstract = df_.abstract.astype(str).apply(cleaning_ginsem)
    
    return df_

In [9]:
def tokenize(df):

    """generate tokenized dataframe"""

    # intitialize vectorizer model

    tfidf_vectorizer = TfidfVectorizer(use_idf=True,
                                analyzer='word',
                                stop_words='english',
                                max_df=0.6,min_df=0.01)#,
                                #max_features=10000)

    # fit_transform abstract

    tfidf_abstract = tfidf_vectorizer.fit_transform(df.abstract)

    # create data frame with columns names

    weighted_words = pd.DataFrame(tfidf_abstract.toarray(),
                columns = tfidf_vectorizer.get_feature_names(),index=df.index).round(2)

    print ('----------Abstract tokenized----------')

    print (weighted_words.head(15))

    return weighted_words

## Tokenize

In [10]:
data = get_data()

----------Data imported----------


In [11]:
df = dataframe(data)

----------DataFrame created----------
                                                   abstract  \
_id                                                           
34314384  Intracortical microelectrode arrays (MEA) can ...   
33996894  Medulloblastoma is the most common malignant c...   
33862118  Nod-like receptor family pyrin domain containi...   
33691255  Mice with chronic cochlear implants can signif...   
33332038  An Auditory Brainstem Implant (ABI) is a techn...   
31201186  Tinnitus may have a very severe impact on the ...   
35509538  Manufacturing of customized three-dimensional ...   
35024600  Injectable hydrogel has the advantage to fill ...   
34425566  The evaluation of the long-term stability of E...   
33762926  Mitochondria are organelles responsible for bi...   
33647494  Evolutions in cranioplasty have allowed for th...   
33431445  A 42-year-old woman presented with fever, left...   
33318954  An estimated 3.8 million traumatic brain injur...   
33025785  Modern 

In [12]:
clean_abstract = clean(df)

In [13]:
token = tokenize(clean_abstract)

----------Abstract tokenized----------
          aberrant  ability  abnormal  abnormality  absence  access  \
_id                                                                   
34314384       0.0      0.0      0.00          0.0      0.0     0.0   
33996894       0.0      0.0      0.00          0.0      0.0     0.0   
33862118       0.0      0.0      0.00          0.0      0.0     0.0   
33691255       0.0      0.0      0.00          0.0      0.0     0.0   
33332038       0.0      0.0      0.00          0.0      0.0     0.0   
31201186       0.0      0.0      0.00          0.0      0.0     0.0   
35509538       0.0      0.0      0.00          0.0      0.0     0.0   
35024600       0.0      0.0      0.00          0.0      0.0     0.0   
34425566       0.0      0.0      0.00          0.0      0.0     0.0   
33762926       0.0      0.0      0.09          0.0      0.0     0.0   
33647494       0.0      0.0      0.00          0.0      0.0     0.0   
33431445       0.0      0.0      0.00 

In [14]:
token

Unnamed: 0_level_0,aberrant,ability,abnormal,abnormality,absence,access,accumulation,accuracy,accurate,acid,...,wildtype,window,woman,work,world,worsen,xray,year,young,younger
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
34314384,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33996894,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33862118,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33691255,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33332038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35519270,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35519265,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35511603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
35510871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## LDA ginsem

In [15]:
import gensim
from gensim.models import TfidfModel, LdaMulticore

In [16]:
text = list(clean_abstract.abstract.str.split())

In [17]:
id2word = corpora.Dictionary(text)

In [18]:
corpus = [id2word.doc2bow(text) for text in text]

In [19]:
tfidf = TfidfModel(corpus)

In [20]:
tfidf_corpus = tfidf[corpus[0]]

In [None]:
lda_ginsem = LdaMulticore(tfidf_corpus, id2word=id2word, num_topics=10,workers=3)

Process SpawnPoolWorker-1:
Traceback (most recent call last):
  File "/Users/patrickwestermann/.pyenv/versions/3.8.13/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/Users/patrickwestermann/.pyenv/versions/3.8.13/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/patrickwestermann/.pyenv/versions/3.8.13/lib/python3.8/multiprocessing/pool.py", line 109, in worker
    initializer(*initargs)
  File "/Users/patrickwestermann/.pyenv/versions/3.8.13/envs/DeepSearch/lib/python3.8/site-packages/gensim/models/ldamulticore.py", line 346, in worker_e_step
    worker_lda.do_estep(chunk)  # TODO: auto-tune alpha?
  File "/Users/patrickwestermann/.pyenv/versions/3.8.13/envs/DeepSearch/lib/python3.8/site-packages/gensim/models/ldamodel.py", line 767, in do_estep
    gamma, sstats = self.inference(chunk, collect_sstats=True)
  File "/Users/patrickwestermann/.pyenv/versions/3.8.13/envs/DeepS

## LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation

def LDA(token,n_components,max_iter):
    
    lda_model = LatentDirichletAllocation(n_components=n_components,max_iter=max_iter,n_jobs=-1,learning_method='online')
    lda_model.fit(token)
    
    return lda_model  

In [None]:
def topics(model,token,topwords):
    
    topic_mixture = pd.DataFrame(model.components_,columns = token.columns)
    
    n_components = topic_mixture.shape[0]
    
    for topic in range(n_components):
        print('-'*10)
        print(f"For topic {topic}, here are the top {topwords} words with weights:")
        
        topic_df = topic_mixture.iloc[topic].sort_values(ascending = False).head(topwords)
        
        print(round(topic_df,5))

## LDA 15 

In [None]:
lda_15 = ""

with parallel_backend("threading"):
    lda_15 = LDA(token,15,100)
    
lda_t_15 = lda_15.transform(token)

lda_s_15 = pd.DataFrame(lda_15.components_,columns = token.columns)

lda_s_15

In [None]:
topics(lda_15,token,20)

In [None]:
score_15 = pd.DataFrame(lda_t_15,index=df.index)
score_15[[0]].sort_values(by=0,ascending=False)

In [None]:
lda_topic_15 = list(score_15.columns)
lda_topic_15

In [None]:
topic_15 = pd.DataFrame(score_15.idxmax(axis=1),columns=['Topic'])
topic_15

In [None]:
df.loc['20058907']['abstract']

## LDA 10

In [None]:
lda_10 = ""
with parallel_backend("threading"):
    lda_10 = LDA(token)

In [None]:
lda_t_10 = lda_10.transform(token)

In [None]:
lda_s_10 = pd.DataFrame(lda_10.components_,columns = token.columns)

In [None]:
lda_s_10

In [None]:
topics(lda_10,token,100)

In [None]:
score_10 = pd.DataFrame(lda_t_10,index=df.index)

In [None]:
lda_topic_10 = list(score_10.columns)
lda_topic_10

In [None]:
score_10[[5]].sort_values(by=5,ascending=False)

In [None]:
df.loc['33834437']['abstract']

In [None]:
topic = pd.DataFrame(score_10.idxmax(axis=1),columns=['Topic'])
topic

## List of topics per LDA

In [None]:
def topic_list(model,token,topwords):
    
    topic_mixture = pd.DataFrame(model.components_,columns = token.columns)
    
    n_components = topic_mixture.shape[0]
    
    topics = []
    
    for topic in range(n_components):
        
        topic_df = topic_mixture.iloc[topic].sort_values(ascending = False).head(topwords)
        
        topics.append(list(topic_df.index))
        
    return topics

## Similarity

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from gensim.models import LdaModel, CoherenceModel
from gensim import corpora

topic_10_100 = topic_list(lda,token,100)
topic_15_100 = topic_list(lda_15,token,100)
num_topics = [10,15]
LDA_models = {10:lda,15:lda_15}
LDA_topics = {10:topic_10_100,15:topic_15_100}

def jaccard_similarity(topic_1, topic_2):
    """
    Derives the Jaccard similarity of two topics

    Jaccard similarity:
    - A statistic used for comparing the similarity and diversity of sample sets
    - J(A,B) = (A ∩ B)/(A ∪ B)
    - Goal is low Jaccard scores for coverage of the diverse elements
    """
    intersection = set(topic_1).intersection(set(topic_2))
    union = set(topic_1).union(set(topic_2))
                    
    return float(len(intersection))/float(len(union))


In [None]:
LDA_stability = {}

for i in range(0, len(num_topics)-1):
    jaccard_sims = []
    for t1, topic1 in enumerate(LDA_topics[num_topics[i]]): # pylint: disable=unused-variable
        sims = []
        print (topic1)
        for t2, topic2 in enumerate(LDA_topics[num_topics[i+1]]): # pylint: disable=unused-variable
            print (topic2)
            print (jaccard_similarity(topic1, topic2))
            sims.append(jaccard_similarity(topic1, topic2))    
            
        jaccard_sims.append(sims)    
    
    LDA_stability[num_topics[i]] = jaccard_sims
                
mean_stabilities = [np.array(LDA_stability[i]).mean() for i in num_topics[:-1]]

In [None]:
dirichlet_dict = corpora.Dictionary([token])
dirichlet_dict

In [None]:
coherences = [CoherenceModel(model=LDA_models[i], texts=corpus, dictionary=dirichlet_dict, coherence='c_v').get_coherence()\
              for i in num_topics[:-1]]