# Concept Extraction and Topic Modeling

In [1]:
import pandas as pd
import re
import gensim
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import gensim.corpora as corpora
from pprint import pprint

## Text Preprocessing Using the NIPS Papers Dataset

In [2]:
# Load the NIPS papers and drop the metadata columns that the rest of the
# notebook does not use.
papers = pd.read_csv('./data/NIPSPapers/papers.csv')
papers = papers.drop(columns=['id', 'event_type', 'pdf_name'])

# Strip commas, periods, '!' and '?' from the paper body and lowercase it in a
# single pass. The pattern is a raw string: '\.' inside a plain string literal
# is an invalid escape sequence that newer Python versions warn about.
papers['text'] = papers['paper_text'].map(
    lambda raw_text: re.sub(r'[,\.!?]', '', raw_text).lower()
)
papers.head()

Unnamed: 0,year,title,abstract,paper_text,text
0,1987,Self-Organization of Associative Database and ...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...,767\n\nself-organization of associative databa...
1,1987,A Mean Field Theory of Layer IV of Visual Cort...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...,683\n\na mean field theory of layer iv of visu...
2,1988,Storing Covariance by the Associative Long-Ter...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...,394\n\nstoring covariance by the associative\n...
3,1994,Bayesian Query Construction for Neural Network...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...,bayesian query construction for neural\nnetwor...
4,1994,"Neural Network Ensembles, Cross Validation, an...",Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a...",neural network ensembles cross\nvalidation and...


In [3]:
# NLTK's English stop-word list plus a few extra tokens to filter out.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and tokenized by gensim's
    ``simple_preprocess`` with accent removal enabled (``deacc=True``).
    """
    yield from (
        gensim.utils.simple_preprocess(str(item), deacc=True)
        for item in sentences
    )
        
def remove_stopwords(texts):
    """Return one list of tokens per document with stop words removed.

    Parameters
    ----------
    texts : iterable
        Documents, each either a raw string or an already tokenized
        list/tuple of tokens.

    Returns
    -------
    list of list of str
        Tokens produced by ``simple_preprocess`` for each document, minus
        every token found in the module-level ``stop_words``.
    """
    # Build the lookup set once: membership tests against the list would be
    # linear in the number of stop words for every single token.
    stop_set = set(stop_words)

    cleaned = []
    for doc in texts:
        if isinstance(doc, (list, tuple)):
            # Join tokens with spaces instead of calling str() on the list:
            # the list repr escapes characters (e.g. a newline becomes the two
            # characters '\' and 'n'), which would leak into the tokens.
            raw_text = ' '.join(map(str, doc))
        else:
            raw_text = str(doc)
        cleaned.append(
            [word for word in simple_preprocess(raw_text) if word not in stop_set]
        )
    return cleaned

# Tokenize every paper, drop stop words, and print the first 30 tokens of the
# first paper as a sanity check.
data = papers.text.values.tolist()
data_words = remove_stopwords(list(sent_to_words(data)))
print(data_words[0][:30])

['self', 'organization', 'associative', 'database', 'applications', 'hisashi', 'suzuki', 'suguru', 'arimoto', 'osaka', 'university', 'toyonaka', 'osaka', 'japan', 'abstract', 'efficient', 'method', 'self', 'organizing', 'associative', 'databases', 'proposed', 'together', 'applications', 'robot', 'eyesight', 'systems', 'proposed', 'databases', 'associate']


In [4]:
# Build the token dictionary from the cleaned token lists, then encode every
# paper with Dictionary.doc2bow. The printed sample is the first 30 entries
# of the first paper's encoding.
texts = data_words
id2word = corpora.Dictionary(texts)
corpus = list(map(id2word.doc2bow, texts))
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 6), (6, 1), (7, 1), (8, 3), (9, 1), (10, 2), (11, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 4), (18, 8), (19, 1), (20, 1), (21, 2), (22, 2), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1)]


## Latent Dirichlet Allocation (LDA)

In [5]:
# Number of topics the LDA model is asked to find.
num_topics = 5

# Seed the model so the topics are reproducible across kernel restarts.
# NOTE(review): LdaMulticore trains with worker processes, so results may
# still vary slightly between runs -- confirm if exact reproducibility matters.
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    random_state=42,
)

# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]

In [6]:
# Pretty-print the summary of each LDA topic.
lda_topics = lda_model.print_topics()
pprint(lda_topics)

[(0,
  '0.004*"set" + 0.004*"problem" + 0.004*"using" + 0.004*"model" + '
  '0.004*"learning" + 0.004*"two" + 0.003*"algorithm" + 0.003*"function" + '
  '0.003*"one" + 0.003*"neural"'),
 (1,
  '0.007*"learning" + 0.007*"data" + 0.007*"model" + 0.005*"using" + '
  '0.004*"set" + 0.004*"one" + 0.004*"algorithm" + 0.004*"function" + '
  '0.003*"models" + 0.003*"distribution"'),
 (2,
  '0.007*"model" + 0.006*"data" + 0.006*"algorithm" + 0.005*"learning" + '
  '0.005*"set" + 0.004*"function" + 0.004*"using" + 0.004*"one" + '
  '0.004*"figure" + 0.003*"also"'),
 (3,
  '0.006*"learning" + 0.005*"data" + 0.005*"network" + 0.005*"time" + '
  '0.004*"model" + 0.004*"algorithm" + 0.004*"two" + 0.004*"number" + '
  '0.004*"using" + 0.004*"set"'),
 (4,
  '0.008*"model" + 0.008*"learning" + 0.005*"data" + 0.005*"function" + '
  '0.005*"algorithm" + 0.004*"set" + 0.004*"time" + 0.004*"one" + 0.004*"log" '
  '+ 0.004*"two"')]


In [7]:
import pyLDAvis.gensim
import pickle 
import pyLDAvis
import os

In [8]:
# Visualize the topics
pyLDAvis.enable_notebook()

# Pass the directory and file name as separate components; joining a single
# pre-concatenated string makes os.path.join a no-op.
LDAvis_data_filepath = os.path.join('./data/LDA_results', 'lda_' + str(num_topics))

# Create the results directory up front so that writing the file later does
# not fail when the directory does not exist yet.
os.makedirs(os.path.dirname(LDAvis_data_filepath), exist_ok=True)

  and should_run_async(code)


In [9]:
# Build the pyLDAvis data for the LDA model (this is a bit time consuming)
# and pickle it to LDAvis_data_filepath.
LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
with open(LDAvis_data_filepath, 'wb') as cache_file:
    pickle.dump(LDAvis_prepared, cache_file)

  and should_run_async(code)


In [10]:
# Reload the pickled pyLDAvis data from disk and export it as an HTML page.
with open(LDAvis_data_filepath, 'rb') as cache_file:
    LDAvis_prepared = pickle.load(cache_file)

html_path = './data/LDA_results/lda_' + str(num_topics) + '.html'
pyLDAvis.save_html(LDAvis_prepared, html_path)

  and should_run_async(code)


In [11]:
# Bare last expression: the notebook displays the prepared pyLDAvis object
# inline (pyLDAvis.enable_notebook() was called in an earlier cell).
LDAvis_prepared

  and should_run_async(code)


## Hierarchical Dirichlet Process (HDP)

In [12]:
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models import HdpModel

  and should_run_async(code)


In [21]:
# Train the HDP model on the same corpus and dictionary as the LDA model.
# The seed is fixed so the topics are reproducible across kernel restarts.
hdp = gensim.models.HdpModel(corpus=corpus, id2word=id2word, random_state=42)

  and should_run_async(code)


In [22]:
# Show as many HDP topics as the LDA model was trained with, so the two
# listings can be compared side by side (reuses num_topics instead of a
# hard-coded 5).
hdp.print_topics(num_topics=num_topics, num_words=10)

  and should_run_async(code)


[(0,
  '0.006*model + 0.006*learning + 0.006*data + 0.005*algorithm + 0.004*set + 0.004*function + 0.004*using + 0.004*one + 0.003*time + 0.003*two'),
 (1,
  '0.006*learning + 0.006*model + 0.005*data + 0.005*algorithm + 0.004*one + 0.004*function + 0.004*set + 0.004*using + 0.004*time + 0.003*training'),
 (2,
  '0.006*learning + 0.005*network + 0.005*data + 0.005*figure + 0.004*model + 0.004*time + 0.004*set + 0.004*one + 0.004*neural + 0.004*algorithm'),
 (3,
  '0.005*learning + 0.005*model + 0.004*data + 0.004*figure + 0.004*using + 0.004*function + 0.004*one + 0.004*set + 0.004*network + 0.004*time'),
 (4,
  '0.005*model + 0.004*figure + 0.004*input + 0.004*neural + 0.004*learning + 0.003*networks + 0.003*data + 0.003*time + 0.003*wavelet + 0.003*set')]

In [23]:
# Visualize the topics
# NOTE: this rebinds LDAvis_data_filepath, so from here on it points at the
# HDP output file rather than the LDA one.
# Pass the directory and file name as separate components; joining a single
# pre-concatenated string makes os.path.join a no-op.
LDAvis_data_filepath = os.path.join('./data/LDA_results', 'hlda_' + str(num_topics))

# Create the results directory up front so that writing the file later does
# not fail when the directory does not exist yet.
os.makedirs(os.path.dirname(LDAvis_data_filepath), exist_ok=True)

  and should_run_async(code)


In [25]:
# Build the pyLDAvis data for the HDP model (this is a bit time consuming)
# and pickle it to LDAvis_data_filepath.
#
# Passing the full corpus made pyLDAvis raise
# "Not all rows (distributions) in doc_topic_dists sum to 1", preceded by a
# warning on the `gamma / gamma.sum(axis=1)` normalization.
# NOTE(review): presumably documents with an empty bag-of-words get an
# all-zero gamma row from the HDP model, which cannot be normalized --
# confirm. Such documents carry no topic information, so they are skipped
# for the visualization only; `corpus` itself is left untouched.
hdp_vis_corpus = [bow for bow in corpus if len(bow) > 0]

LDAvis_prepared = pyLDAvis.gensim.prepare(hdp, hdp_vis_corpus, id2word)
with open(LDAvis_data_filepath, 'wb') as f:
    pickle.dump(LDAvis_prepared, f)

  and should_run_async(code)
  doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]


ValidationError: 
 * Not all rows (distributions) in doc_topic_dists sum to 1.