In [1]:
import pandas as pd
import gensim
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string
from gensim import corpora
import pickle
import pyLDAvis.gensim

# Loading the Data

In [2]:
text_data = pd.read_csv('Topic_dataset.csv')
doc_complete = []
for i in range(0,len(text_data)):
    doc_complete.append(text_data.iloc[i,0])
doc_complete    

['High performance prime field multiplication for GPU.',
 'enchanted scissors: a scissor interface for support in cutting and interactive fabrication.',
 'Detection of channel degradation attack by Intermediary Node in Linear Networks.',
 'Pinning a Complex Network through the Betweenness Centrality Strategy.',
 'Analysis and Design of Memoryless Interconnect Encoding Scheme.',
 'Dynamic bluescreens.',
 'A Quantitative Assured Forwarding Service.',
 'Automatic sanitization of social network data to prevent inference attacks.',
 'A &#916;&#931; IR-UWB radar with sub-mm ranging capability for human body monitoring systems.',
 'Architecture of a multi-slot main memory system for 3.2 Gbps operation.',
 'Rule-based Service Customization via Houdini.',
 'Business Policy Modeling and Enforcement in Databases.',
 'A high speed and high linearity OTA in 1-V power supply voltage.',
 'PREDIcT: Towards Predicting the Runtime of Large Scale Iterative Analytics.',
 'SocialSensor: sensing user genera

In [3]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
doc_clean = [clean(doc).split() for doc in doc_complete]   

# LDA with Gensim

In [4]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(doc_clean)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
corpus = [dictionary.doc2bow(doc) for doc in doc_clean]

In [5]:
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

# LDA to find 5 topics in the data

In [6]:
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.020*"system" + 0.012*"based" + 0.011*"using" + 0.008*"design"')
(1, '0.012*"web" + 0.009*"data" + 0.008*"database" + 0.007*"approach"')
(2, '0.028*"network" + 0.024*"data" + 0.019*"web" + 0.014*"wireless"')
(3, '0.018*"network" + 0.014*"wireless" + 0.012*"sensor" + 0.011*"using"')
(4, '0.016*"search" + 0.012*"design" + 0.010*"algorithm" + 0.009*"web"')


# Visualizing 5 topics

__Saliency:__ a measure of how much the term tells you about the topic.
    
__Relevance:__  a weighted average of the probability of the word given the topic and the word given
the topic normalized by the probability of the topic.

__The size of the bubble measures the importance of the topics, relative to the data.__

In [7]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# LDA to find 3 topics in the data

In [8]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.034*"network" + 0.015*"wireless" + 0.011*"sensor" + 0.011*"system"')
(1, '0.010*"query" + 0.008*"using" + 0.008*"system" + 0.008*"database"')
(2, '0.026*"web" + 0.020*"data" + 0.011*"system" + 0.010*"service"')


# Visualizing 3 topics

In [9]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


# LDA to find 10 topics in the data

In [10]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.027*"data" + 0.025*"database" + 0.017*"system" + 0.013*"graph"')
(1, '0.060*"network" + 0.022*"wireless" + 0.018*"data" + 0.016*"sensor"')
(2, '0.034*"network" + 0.024*"control" + 0.020*"wireless" + 0.009*"bandwidth"')
(3, '0.023*"algorithm" + 0.015*"data" + 0.014*"query" + 0.011*"using"')
(4, '0.015*"architecture" + 0.010*"application" + 0.009*"sigmadelta" + 0.009*"noise"')
(5, '0.022*"application" + 0.019*"power" + 0.016*"design" + 0.016*"sensor"')
(6, '0.023*"system" + 0.019*"query" + 0.013*"image" + 0.011*"data"')
(7, '0.016*"filter" + 0.013*"web" + 0.010*"design" + 0.010*"using"')
(8, '0.049*"web" + 0.021*"system" + 0.013*"service" + 0.009*"network"')
(9, '0.015*"new" + 0.014*"using" + 0.013*"adaptive" + 0.011*"topology"')


# Visualizing 10 topics

In [11]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
