## Topic Modeling (LDA) ##

### LDA :- Latent Dirichlet Allocation Model/Algorithm ###

#### By Vedant Lotia ####

In [1]:
import re
import pandas as pd
import genesis
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis



In [2]:
papers = pd.read_csv('C:\\Users\\HP\\Downloads\\papers\\papers.csv')
papers.head(10)

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,100,1988,Storing Covariance by the Associative Long-Ter...,,100-storing-covariance-by-the-associative-long...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1000,1994,Bayesian Query Construction for Neural Network...,,1000-bayesian-query-construction-for-neural-ne...,Abstract Missing,Bayesian Query Construction for Neural\nNetwor...
4,1001,1994,"Neural Network Ensembles, Cross Validation, an...",,1001-neural-network-ensembles-cross-validation...,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, a..."
5,1002,1994,Using a neural net to instantiate a deformable...,,1002-using-a-neural-net-to-instantiate-a-defor...,Abstract Missing,U sing a neural net to instantiate a\ndeformab...
6,1003,1994,Plasticity-Mediated Competitive Learning,,1003-plasticity-mediated-competitive-learning.pdf,Abstract Missing,Plasticity-Mediated Competitive Learning\n\nTe...
7,1004,1994,ICEG Morphology Classification using an Analog...,,1004-iceg-morphology-classification-using-an-a...,Abstract Missing,ICEG Morphology Classification using an\nAnalo...
8,1005,1994,Real-Time Control of a Tokamak Plasma Using Ne...,,1005-real-time-control-of-a-tokamak-plasma-usi...,Abstract Missing,Real-Time Control of a Tokamak Plasma\nUsing N...
9,1006,1994,Pulsestream Synapses with Non-Volatile Analogu...,,1006-pulsestream-synapses-with-non-volatile-an...,Abstract Missing,Real-Time Control of a Tokamak Plasma\nUsing N...


In [3]:
papers = papers.drop(columns=['id', 'event_type', 'pdf_name'], axis=1).sample(100)

# Print out the first rows of papers
papers.head()

Unnamed: 0,year,title,abstract,paper_text
635,1998,Classification in Non-Metric Spaces,Abstract Missing,Classification in Non-Metric Spaces\n\nDaphna ...
869,2000,A Gradient-Based Boosting Algorithm for Regres...,Abstract Missing,A Gradient-Based Boosting Algorithm for\nRegre...
2290,2006,TrueSkill?: A Bayesian Skill Rating System,Abstract Missing,TrueSkill\n\nTM :\n\nA Bayesian Skill Rating S...
2438,2007,Using Deep Belief Nets to Learn Covariance Ker...,We show how to use unlabeled data and a deep b...,Using Deep Belief Nets to Learn Covariance Ker...
442,1997,The Rectified Gaussian Distribution,Abstract Missing,The Rectified Gaussian Distribution\nN. D. Soc...


In [11]:


# Remove punctuation
papers['paper_text_processed'] = \
papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))

# Convert the titles to lowercase
papers['paper_text_processed'] = \
papers['paper_text_processed'].map(lambda x: x.lower())

# Print out the first rows of papers
papers['paper_text_processed'].head()

  papers['paper_text'].map(lambda x: re.sub('[,\.!?]', '', x))


635     classification in non-metric spaces\n\ndaphna ...
869     a gradient-based boosting algorithm for\nregre...
2290    trueskill\n\ntm :\n\na bayesian skill rating s...
2438    using deep belief nets to learn covariance ker...
442     the rectified gaussian distribution\nn d socci...
Name: paper_text_processed, dtype: object

In [12]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def sent_to_words(sentences):
    for sentence in sentences:
        # deacc=True removes punctuations
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) 
             if word not in stop_words] for doc in texts]


data = papers.paper_text_processed.values.tolist()
data_words = list(sent_to_words(data))

# remove stop words
data_words = remove_stopwords(data_words)

print(data_words[:1][0][:30])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['classification', 'non', 'metric', 'spaces', 'daphna', 'weinshall', 'david', 'jacobs', 'yoram', 'gdalyahu', 'nec', 'research', 'institute', 'independence', 'way', 'princeton', 'nj', 'usa', 'inst', 'computer', 'science', 'hebrew', 'university', 'jerusalem', 'jerusalem', 'israel', 'abstract', 'key', 'question', 'vision']


In [13]:
import gensim.corpora as corpora

# Create Dictionary
id2word = corpora.Dictionary(data_words)

# Create Corpus
texts = data_words

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1][0][:30])

[(0, 1), (1, 1), (2, 4), (3, 1), (4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 6), (11, 11), (12, 1), (13, 1), (14, 4), (15, 1), (16, 1), (17, 2), (18, 2), (19, 1), (20, 1), (21, 1), (22, 6), (23, 1), (24, 2), (25, 2), (26, 4), (27, 1), (28, 1), (29, 1)]


In [14]:
from pprint import pprint

# number of topics
num_topics = 10

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics)

# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.007*"model" + 0.005*"time" + 0.004*"algorithm" + 0.004*"using" + '
  '0.004*"data" + 0.004*"learning" + 0.004*"function" + 0.003*"number" + '
  '0.003*"set" + 0.003*"state"'),
 (1,
  '0.008*"algorithm" + 0.006*"learning" + 0.005*"function" + 0.005*"model" + '
  '0.004*"set" + 0.004*"data" + 0.004*"time" + 0.004*"one" + 0.004*"problem" + '
  '0.004*"distribution"'),
 (2,
  '0.006*"learning" + 0.006*"algorithm" + 0.005*"time" + 0.005*"set" + '
  '0.005*"data" + 0.005*"model" + 0.004*"function" + 0.004*"one" + '
  '0.004*"using" + 0.004*"two"'),
 (3,
  '0.006*"model" + 0.005*"learning" + 0.004*"algorithm" + 0.004*"data" + '
  '0.004*"time" + 0.004*"set" + 0.004*"function" + 0.004*"number" + '
  '0.004*"state" + 0.003*"using"'),
 (4,
  '0.005*"model" + 0.005*"algorithm" + 0.005*"learning" + 0.004*"set" + '
  '0.004*"function" + 0.004*"data" + 0.004*"one" + 0.004*"problem" + '
  '0.004*"time" + 0.003*"matrix"'),
 (5,
  '0.006*"data" + 0.006*"model" + 0.006*"algorithm" + 0.005*"lea

In [15]:
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, dictionary=lda_model.id2word)
vis

  default_term_info = default_term_info.sort_values(
