In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\Siddhanth\anaconda3\python.exe -m pip install --upgrade pip' command.


In [2]:
import pandas as pd
import numpy as np
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [3]:
# Load the papers dataset from the CSV file in the working directory.
data = pd.read_csv('papers.csv')
# Preview the first rows to check which columns were read.
data.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [4]:
# Keep only the full paper text, with a fresh 0..n-1 row index.
# Selecting the column directly avoids the DataFrame constructor's label
# alignment on `index=`/`columns=`, which would silently fill rows with NaN if
# `data` ever carried a non-default index (e.g. after filtering rows).
df = data[['paper_text']].reset_index(drop = True)
# Explicit positional id for every document.
df['index'] = range(len(df))
df.head()

Unnamed: 0,paper_text,index
0,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,0
1,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,1
2,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,2
3,Bayesian Query Construction for Neural\nNetwor...,3
4,"Neural Network Ensembles, Cross\nValidation, a...",4


In [5]:
# (rows, columns) of the working frame.
df.shape

(7241, 2)

In [6]:
# Count missing values per column.
df.isna().sum()

paper_text    0
index         0
dtype: int64

**Preprocessing -**

In [7]:
# Seed NumPy's global random number generator so that code drawing from it
# gives repeatable results.
np.random.seed(12345)

In [8]:
import nltk

In [9]:
# Fetch the WordNet corpus through NLTK's downloader.
# NOTE(review): presumably required by the WordNetLemmatizer used during
# preprocessing — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Siddhanth\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
# English Snowball stemmer, created once and reused by lemmatize_stemming.
stemmer = SnowballStemmer('english')

In [11]:
def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then stem the lemma.

    The stemming step uses the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos = 'v')
    return stemmer.stem(verb_lemma)

def preprocess_text(text):
    """Tokenize ``text`` and return the normalized tokens that pass filtering.

    Tokens come from ``simple_preprocess``. A token is kept only when it is
    longer than three characters and is not in ``STOPWORDS``; each kept token
    is passed through ``lemmatize_stemming``.
    """
    return [
        lemmatize_stemming(word)
        for word in simple_preprocess(text)
        if len(word) > 3 and word not in STOPWORDS
    ]

In [12]:
# Run the preprocessing on every paper; each entry becomes a list of tokens.
processed_docs = df.paper_text.map(preprocess_text)
# Show the first 15 processed documents.
processed_docs.head(15)

0     [self, organ, associ, databas, applic, hisashi...
1     [mean, field, theori, layer, visual, cortex, a...
2     [store, covari, associ, long, term, potenti, d...
3     [bayesian, queri, construct, neural, network, ...
4     [neural, network, ensembl, cross, valid, activ...
5     [sing, neural, instanti, deform, model, christ...
6     [plastic, mediat, competit, learn, terrenc, se...
7     [iceg, morpholog, classif, analogu, vlsi, neur...
8     [real, time, control, tokamak, plasma, neural,...
9     [real, time, control, tokamak, plasma, neural,...
10    [learn, play, game, chess, sebastian, thrun, u...
11    [scale, data, cluster, thoma, hofmann, joachim...
12    [experiment, comparison, recurr, neural, netwo...
13    [train, multilay, perceptron, extend, kalman, ...
14    [interfer, learn, intern, model, invers, dynam...
Name: paper_text, dtype: object

**Feature extraction through tf-idf-**

In [13]:
# Build the gensim dictionary from the tokenized documents.
# NOTE(review): no vocabulary pruning is applied here; consider whether very
# rare or very common tokens should be filtered (e.g. Dictionary.filter_extremes)
# — TODO confirm.
dictionary = gensim.corpora.Dictionary(processed_docs)

In [14]:
# Convert every tokenized document to its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

In [15]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = gensim.models.TfidfModel(bow_corpus)

In [16]:
# Apply the fitted TF-IDF model to the bag-of-words corpus.
corpus_tfidf = tfidf[bow_corpus]

**Running LDA using tf-idf model -**

In [17]:
# Number of topics requested from the LDA models trained below.
num_topics = 10

In [18]:
# Train LDA on the TF-IDF corpus (2 passes, 4 worker processes).
# random_state is passed explicitly so the model does not depend on how much of
# NumPy's global random stream earlier cells (or a previous run of this cell)
# have already consumed. Per the gensim documentation, results may still vary
# slightly because of the scheduling of the worker processes.
lda_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = num_topics, id2word = dictionary, passes = 2, workers = 4,
                                       random_state = 12345)

In [19]:
# Print every topic (-1 requests all of them) with its top weighted words.
for topic_id, topic_terms in lda_tfidf.print_topics(num_topics = -1):
    print(f"Topic : {topic_id} \n Words {topic_terms} \n")

Topic : 0 
 Words 0.001*"imag" + 0.001*"rank" + 0.001*"kernel" + 0.001*"cluster" + 0.001*"polici" + 0.001*"matrix" + 0.001*"latent" + 0.001*"posterior" + 0.001*"label" + 0.001*"train" 

Topic : 1 
 Words 0.001*"polici" + 0.001*"action" + 0.001*"reward" + 0.001*"neuron" + 0.001*"spike" + 0.001*"imag" + 0.001*"network" + 0.001*"agent" + 0.001*"kernel" + 0.001*"train" 

Topic : 2 
 Words 0.001*"kernel" + 0.001*"imag" + 0.000*"layer" + 0.000*"convex" + 0.000*"network" + 0.000*"classifi" + 0.000*"lasso" + 0.000*"theorem" + 0.000*"train" + 0.000*"featur" 

Topic : 3 
 Words 0.001*"regret" + 0.001*"imag" + 0.001*"cluster" + 0.001*"label" + 0.001*"bandit" + 0.001*"polici" + 0.001*"queri" + 0.001*"rank" + 0.001*"reward" + 0.001*"kernel" 

Topic : 4 
 Words 0.001*"kernel" + 0.001*"imag" + 0.001*"cluster" + 0.001*"polici" + 0.001*"graph" + 0.001*"label" + 0.001*"network" + 0.001*"manifold" + 0.001*"layer" + 0.001*"train" 

Topic : 5 
 Words 0.002*"neuron" + 0.001*"spike" + 0.001*"imag" + 0.001*"l

**Common words between each pair of topics-**

In [20]:
# Map each topic id to its formatted "weight*word + ..." string.
topic_dict = dict(lda_tfidf.print_topics(-1))

topic_dict

{0: '0.001*"imag" + 0.001*"rank" + 0.001*"kernel" + 0.001*"cluster" + 0.001*"polici" + 0.001*"matrix" + 0.001*"latent" + 0.001*"posterior" + 0.001*"label" + 0.001*"train"',
 1: '0.001*"polici" + 0.001*"action" + 0.001*"reward" + 0.001*"neuron" + 0.001*"spike" + 0.001*"imag" + 0.001*"network" + 0.001*"agent" + 0.001*"kernel" + 0.001*"train"',
 2: '0.001*"kernel" + 0.001*"imag" + 0.000*"layer" + 0.000*"convex" + 0.000*"network" + 0.000*"classifi" + 0.000*"lasso" + 0.000*"theorem" + 0.000*"train" + 0.000*"featur"',
 3: '0.001*"regret" + 0.001*"imag" + 0.001*"cluster" + 0.001*"label" + 0.001*"bandit" + 0.001*"polici" + 0.001*"queri" + 0.001*"rank" + 0.001*"reward" + 0.001*"kernel"',
 4: '0.001*"kernel" + 0.001*"imag" + 0.001*"cluster" + 0.001*"polici" + 0.001*"graph" + 0.001*"label" + 0.001*"network" + 0.001*"manifold" + 0.001*"layer" + 0.001*"train"',
 5: '0.002*"neuron" + 0.001*"spike" + 0.001*"imag" + 0.001*"layer" + 0.001*"cluster" + 0.001*"network" + 0.001*"kernel" + 0.001*"synaps" + 

In [21]:
from itertools import combinations
import regex as re

In [22]:
def topic_words(topic):
    """Return the words of a formatted topic string, in order of appearance.

    ``topic`` is expected in the format printed in this notebook, e.g.
    ``'0.001*"imag" + 0.001*"rank"'``. Every double-quoted token is returned as
    one word; an input without quoted tokens yields an empty list.
    """
    # Capture the text between each pair of double quotes. This replaces the
    # earlier approach of splitting on runs of non-digits and slicing by quote
    # position, which truncated any word containing a digit, and it uses a raw
    # string so the pattern contains no invalid escape sequence.
    return re.findall(r'"([^"]+)"', topic)

In [23]:
# Parse each topic's top words once instead of re-parsing both topic strings
# for every pair.
topic_word_sets = {idx: set(topic_words(topic)) for idx, topic in topic_dict.items()}

# Compare every unordered pair of topics. The ids come from topic_dict itself,
# so the loop follows the number of topics in the model rather than a
# hard-coded 10.
for first, second in combinations(sorted(topic_word_sets), 2):
    common_words = topic_word_sets[first].intersection(topic_word_sets[second])
    print(f"Topic {first} and {second} : {len(common_words)}")
    print(f"Common words - {common_words}")
    print("\n")

Topic 0 and 1 : 4
Common words - {'train', 'imag', 'kernel', 'polici'}


Topic 0 and 2 : 3
Common words - {'kernel', 'train', 'imag'}


Topic 0 and 3 : 6
Common words - {'label', 'rank', 'cluster', 'imag', 'kernel', 'polici'}


Topic 0 and 4 : 6
Common words - {'label', 'cluster', 'train', 'imag', 'kernel', 'polici'}


Topic 0 and 5 : 3
Common words - {'cluster', 'imag', 'kernel'}


Topic 0 and 6 : 5
Common words - {'rank', 'cluster', 'train', 'imag', 'kernel'}


Topic 0 and 7 : 6
Common words - {'rank', 'matrix', 'train', 'imag', 'kernel', 'polici'}


Topic 0 and 8 : 4
Common words - {'cluster', 'imag', 'kernel', 'polici'}


Topic 0 and 9 : 4
Common words - {'kernel', 'cluster', 'train', 'imag'}


Topic 1 and 2 : 4
Common words - {'network', 'train', 'imag', 'kernel'}


Topic 1 and 3 : 4
Common words - {'imag', 'reward', 'kernel', 'polici'}


Topic 1 and 4 : 5
Common words - {'train', 'network', 'imag', 'kernel', 'polici'}


Topic 1 and 5 : 5
Common words - {'spike', 'neuron', 'networ

**Evaluating the topic model -**

In [24]:
from gensim.models import CoherenceModel
# Score the current LDA model with the 'c_v' coherence measure.
coherence_model_lda = CoherenceModel(
    model = lda_tfidf,
    texts = processed_docs,
    dictionary = dictionary,
    coherence = 'c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.21785028975304832


The higher the coherence score, the better.

**Making a different model to get a higher coherence score -**

Increasing the number of passes as well as the chunksize - 

In [25]:
# Retrain LDA on the TF-IDF corpus with more passes (15) and a chunksize of
# 3000; this rebinds lda_tfidf, replacing the earlier model.
# random_state is passed explicitly so the model does not depend on how much of
# NumPy's global random stream earlier cells (or a previous run of this cell)
# have already consumed. Per the gensim documentation, results may still vary
# slightly because of the scheduling of the worker processes.
lda_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = num_topics, id2word = dictionary, passes = 15, workers = 4,
                                       per_word_topics = True, chunksize = 3000, random_state = 12345)

In [26]:
# Print every topic (-1 requests all of them) of the retrained model.
for topic_id, topic_terms in lda_tfidf.print_topics(num_topics = -1):
    print(f"Topic : {topic_id} \n Words {topic_terms} \n")

Topic : 0 
 Words 0.000*"minwis" + 0.000*"eld" + 0.000*"gamp" + 0.000*"wedg" + 0.000*"abstent" + 0.000*"nnbm" + 0.000*"romma" + 0.000*"acnn" + 0.000*"diabolo" + 0.000*"recollect" 

Topic : 1 
 Words 0.000*"mwis" + 0.000*"ltsa" + 0.000*"miso" + 0.000*"mcboost" + 0.000*"cann" + 0.000*"oasi" + 0.000*"epitom" + 0.000*"ggood" + 0.000*"adex" + 0.000*"lpboost" 

Topic : 2 
 Words 0.000*"mimo" + 0.000*"lissom" + 0.000*"pointset" + 0.000*"epic" + 0.000*"gnkr" + 0.000*"omdp" + 0.000*"srcut" + 0.000*"lsdd" + 0.000*"gibbsnet" + 0.000*"adclus" 

Topic : 3 
 Words 0.000*"churn" + 0.000*"mirna" + 0.000*"forgeri" + 0.000*"diskmean" + 0.000*"isomer" + 0.000*"glas" + 0.000*"vamp" + 0.000*"href" + 0.000*"gape" + 0.000*"sgns" 

Topic : 4 
 Words 0.000*"flanker" + 0.000*"mmsb" + 0.000*"zeta" + 0.000*"kcca" + 0.000*"gpfa" + 0.000*"lgssm" + 0.000*"vine" + 0.000*"conv_" + 0.000*"decontamin" + 0.000*"dapt" 

Topic : 5 
 Words 0.002*"kernel" + 0.002*"imag" + 0.002*"neuron" + 0.002*"cluster" + 0.002*"polici" + 0

In [27]:
# Rebuild the topic id -> formatted topic string mapping for the current model.
topic_dict = dict(lda_tfidf.print_topics(-1))

In [28]:
# Parse each topic's top words once instead of re-parsing both topic strings
# for every pair.
topic_word_sets = {idx: set(topic_words(topic)) for idx, topic in topic_dict.items()}

# Compare every unordered pair of topics. The ids come from topic_dict itself,
# so the loop follows the number of topics in the model rather than a
# hard-coded 10.
for first, second in combinations(sorted(topic_word_sets), 2):
    common_words = topic_word_sets[first].intersection(topic_word_sets[second])
    print(f"Topic {first} and {second} : {len(common_words)}")
    print(f"Common words - {common_words}")
    print("\n")

Topic 0 and 1 : 0
Common words - set()


Topic 0 and 2 : 0
Common words - set()


Topic 0 and 3 : 0
Common words - set()


Topic 0 and 4 : 0
Common words - set()


Topic 0 and 5 : 0
Common words - set()


Topic 0 and 6 : 0
Common words - set()


Topic 0 and 7 : 0
Common words - set()


Topic 0 and 8 : 0
Common words - set()


Topic 0 and 9 : 0
Common words - set()


Topic 1 and 2 : 0
Common words - set()


Topic 1 and 3 : 0
Common words - set()


Topic 1 and 4 : 0
Common words - set()


Topic 1 and 5 : 0
Common words - set()


Topic 1 and 6 : 0
Common words - set()


Topic 1 and 7 : 0
Common words - set()


Topic 1 and 8 : 0
Common words - set()


Topic 1 and 9 : 0
Common words - set()


Topic 2 and 3 : 0
Common words - set()


Topic 2 and 4 : 0
Common words - set()


Topic 2 and 5 : 0
Common words - set()


Topic 2 and 6 : 0
Common words - set()


Topic 2 and 7 : 0
Common words - set()


Topic 2 and 8 : 0
Common words - set()


Topic 2 and 9 : 0
Common words - set()


Topic 3 and 4 : 

In [29]:
# Score the retrained LDA model with the 'c_v' coherence measure.
coherence_model_lda = CoherenceModel(
    model = lda_tfidf,
    texts = processed_docs,
    dictionary = dictionary,
    coherence = 'c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.6227777987406713


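## Fewer common words and a higher coherence score suggest that the topics have very little in common

Note, however, that several topics in the listing above consist of words whose weights round to 0.000, so the absence of shared words should be interpreted with caution.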