In [1]:
# Importing modules
import pandas as pd

# Read data into papers
papers = pd.read_csv('papers.csv')
# Print head
papers.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."


In [2]:
# Data Cleaning

# Remove the columns
papers = papers.drop(columns=['id', 'title', 'abstract', 
                              'event_type', 'pdf_name', 'year'], axis=1)
# sample only 10 papers - for demonstration purposes
papers = papers.sample(10)

# Print out the first rows of papers
papers.head()

Unnamed: 0,paper_text
6450,Fast Black-box Variational Inference\nthrough ...
6255,Adversarial Surrogate Losses for Ordinal Regre...
1837,Maximal Margin Labeling for Multi-Topic Text\n...
4220,Application of Neural Network Methodology to\n...
1610,When Does Non-Negative Matrix Factorization\nG...


In [3]:
# Remove punctuation/lower casing

# Load the regular expression library
import re

# Remove punctuation
papers['paper_text_processed'] = papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
papers['paper_text_processed'] = papers['paper_text_processed'].map(lambda x: x.lower())

# Print out the first rows of papers
papers['paper_text_processed'].head()

6450    fast black-box variational inference\nthrough ...
6255    adversarial surrogate losses for ordinal regre...
1837    maximal margin labeling for multi-topic text\n...
4220    application of neural network methodology to\n...
1610    when does non-negative matrix factorization\ng...
Name: paper_text_processed, dtype: object

In [5]:
!pip install gensim

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-win_amd64.whl (24.2 MB)
Collecting Cython==0.29.14
  Downloading Cython-0.29.14-cp37-cp37m-win_amd64.whl (1.7 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-2.1.0.tar.gz (116 kB)
Collecting boto3
  Downloading boto3-1.14.29-py2.py3-none-any.whl (128 kB)
Collecting s3transfer<0.4.0,>=0.3.0
  Downloading s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
Collecting jmespath<1.0.0,>=0.7.1
  Downloading jmespath-0.10.0-py2.py3-none-any.whl (24 kB)
Collecting botocore<1.18.0,>=1.17.29
  Downloading botocore-1.17.29-py2.py3-none-any.whl (6.4 MB)
Collecting docutils<0.16,>=0.10
  Downloading docutils-0.15.2-py3-none-any.whl (547 kB)
Building wheels for collected packages: smart-open
  Building wheel for smart-open (setup.py): started
  Building wheel for smart-open (setup.py): finished with status 'done'
  Created wheel for smart-open: filename=smart_open-2.1.0-py3-none-any.whl size=110324 sha256=8ddf10ccb3634bc93d42cc6a2756a581dde

In [4]:
# Tokenize words and further clean-up text
# tokenize each sentence into a list of words, removing punctuations and unnecessary characters altogether.
import gensim
from gensim.utils import simple_preprocess

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations
        
data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))
print(data_words[:1])

[['fast', 'black', 'box', 'variational', 'inference', 'through', 'stochastic', 'trust', 'region', 'optimization', 'jeffrey', 'regier', 'jregier', 'csberkeleyedu', 'michael', 'jordan', 'jordan', 'csberkeleyedu', 'jon', 'mcauliffe', 'jon', 'statberkeleyedu', 'abstract', 'we', 'introduce', 'trustvi', 'fast', 'second', 'order', 'algorithm', 'for', 'black', 'box', 'variational', 'inference', 'based', 'on', 'trust', 'region', 'optimization', 'and', 'the', 'trick', 'at', 'each', 'iteration', 'trustvi', 'proposes', 'and', 'assesses', 'step', 'based', 'on', 'minibatches', 'of', 'draws', 'from', 'the', 'variational', 'distribution', 'the', 'algorithm', 'provably', 'converges', 'to', 'stationary', 'point', 'we', 'implemented', 'trustvi', 'in', 'the', 'stan', 'framework', 'and', 'compared', 'it', 'to', 'two', 'alternatives', 'automatic', 'differentiation', 'variational', 'inference', 'advi', 'and', 'hessianfree', 'stochastic', 'gradient', 'variational', 'inference', 'hfsgvi', 'the', 'former', 'is'

In [5]:
# Build the bigram and trigram models

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [6]:
# Remove Stopwords, Make Bigrams and Lemmatize

# NLTK Stop words
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

# Define functions for stopwords, bigrams, trigrams and lemmatization

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AYAN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import spacy
from nltk.stem import WordNetLemmatizer 
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
#nlp = en_core_web_sm.load()
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Do lemmatization 
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
print(data_lemmatized[:1])


[['fast', 'stochastic', 'optimization', 'csberkeleyedu', 'abstract', 'introduce', 'trustvi', 'fast', 'second', 'order', 'base', 'trust_region', 'optimization', 'trick', 'iteration', 'trustvi', 'propose', 'assesse', 'step', 'base', 'draw', 'variational', 'distribution', 'algorithm', 'provably', 'converge', 'implement', 'trustvi', 'framework', 'compare', 'alternative', 'advi', 'hessianfree', 'former', 'base', 'stochastic', 'first', 'order', 'optimization', 'latter', 'use', 'second', 'order', 'information', 'lack', 'convergence', 'guarantee', 'trustvi', 'typically', 'converge', 'least', 'order', 'magnitude', 'fast', 'advi', 'demonstrating', 'value', 'stochastic', 'second', 'order', 'information', 'trustvi', 'often', 'find', 'substantially', 'well', 'variational', 'distribution', 'demonstrate', 'convergence', 'theory', 'matter', 'practice', 'introduction', 'trick', 'lead', 'resurgence', 'interest', 'make', 'applicable', 'essentially', 'differentiable', 'model', 'new', 'approach', 'however'

In [10]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 14), (2, 5), (3, 1), (4, 2), (5, 1), (6, 6), (7, 1), (8, 1), (9, 1), (10, 1), (11, 2), (12, 2), (13, 18), (14, 2), (15, 9), (16, 2), (17, 1), (18, 6), (19, 1), (20, 1), (21, 7), (22, 3), (23, 5), (24, 1), (25, 1), (26, 3), (27, 2), (28, 1), (29, 1), (30, 5), (31, 3), (32, 1), (33, 7), (34, 6), (35, 1), (36, 1), (37, 2), (38, 1), (39, 7), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 2), (48, 1), (49, 1), (50, 1), (51, 2), (52, 10), (53, 17), (54, 2), (55, 3), (56, 4), (57, 1), (58, 1), (59, 1), (60, 7), (61, 1), (62, 1), (63, 1), (64, 1), (65, 6), (66, 1), (67, 1), (68, 1), (69, 1), (70, 4), (71, 1), (72, 2), (73, 1), (74, 2), (75, 1), (76, 2), (77, 2), (78, 7), (79, 2), (80, 4), (81, 2), (82, 1), (83, 2), (84, 1), (85, 1), (86, 1), (87, 2), (88, 1), (89, 6), (90, 4), (91, 2), (92, 2), (93, 1), (94, 4), (95, 2), (96, 3), (97, 2), (98, 1), (99, 25), (100, 2), (101, 1), (102, 1), (103, 2), (104, 2), (105, 7), (106, 1), (107, 4), (108, 1), (109, 1), (11

In [11]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=10, 
                                       random_state=100,
                                       chunksize=100,
                                       passes=10,
                                       per_word_topics=True)

In [12]:
# We can see the keywords for each topic and the weightage(importance) of each keyword using lda_model.print_topics()

from pprint import pprint

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.023*"contain" + 0.019*"image" + 0.015*"generator" + 0.014*"part" + '
  '0.013*"point" + 0.012*"articulation" + 0.011*"dual" + 0.010*"datum" + '
  '0.009*"also" + 0.008*"set"'),
 (1,
  '0.023*"model" + 0.021*"trustvi" + 0.017*"iteration" + 0.013*"optimization" '
  '+ 0.012*"step" + 0.011*"objective" + 0.010*"condition" + 0.010*"sample" + '
  '0.010*"stochastic" + 0.009*"distribution"'),
 (2,
  '0.043*"model" + 0.025*"datum" + 0.016*"temperature" + 0.015*"set" + '
  '0.013*"method" + 0.012*"give" + 0.010*"base" + 0.010*"use" + '
  '0.010*"prediction" + 0.009*"well"'),
 (3,
  '0.019*"learn" + 0.015*"feedback" + 0.015*"module" + 0.013*"time" + '
  '0.011*"hand" + 0.011*"movement" + 0.010*"motor" + 0.009*"delay" + '
  '0.009*"target" + 0.009*"example"'),
 (4,
  '0.001*"model" + 0.001*"learn" + 0.001*"use" + 0.001*"function" + '
  '0.001*"set" + 0.001*"loss" + 0.001*"time" + 0.001*"value" + 0.001*"task" + '
  '0.001*"example"'),
 (5,
  '0.019*"function" + 0.015*"value" + 0.012*"sta

In [13]:
from gensim.models import CoherenceModel

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.4556192879079294


In [14]:
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=8, 
                                           random_state=100,
                                           chunksize=100,
                                           passes=10,
                                           alpha=0.01,
                                           eta=0.9)

In [72]:
!pip install pyLDAvis

Collecting pyLDAvis
  Downloading pyLDAvis-2.1.2.tar.gz (1.6 MB)
Collecting funcy
  Downloading funcy-1.14.tar.gz (548 kB)
Building wheels for collected packages: pyLDAvis, funcy
  Building wheel for pyLDAvis (setup.py): started
  Building wheel for pyLDAvis (setup.py): finished with status 'done'
  Created wheel for pyLDAvis: filename=pyLDAvis-2.1.2-py2.py3-none-any.whl size=97716 sha256=4ddbff9bc0761cdb5cac8692fb7c1c47a47a0a3ffafafd7595056f4c93bfd852
  Stored in directory: c:\users\ayan\appdata\local\pip\cache\wheels\3b\fb\41\e32e5312da9f440d34c4eff0d2207b46dc9332a7b931ef1e89
  Building wheel for funcy (setup.py): started
  Building wheel for funcy (setup.py): finished with status 'done'
  Created wheel for funcy: filename=funcy-1.14-py2.py3-none-any.whl size=32045 sha256=588671cf986db8999ed1b2877e9445feb8d447f583856b3cbc68741e81c8679f
  Stored in directory: c:\users\ayan\appdata\local\pip\cache\wheels\3c\33\97\805b282e129f60bb4e87cea622338f30b65f21eaf65219971f
Successfully built pyL

In [73]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis

# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
LDAvis_prepared