In [1]:
import spacy
from spacy.lang.en import English
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import random
from gensim import corpora
import pickle
import gensim
import pyLDAvis.gensim
import warnings

# Suppress every warning category for the rest of the session to keep cell
# output short. NOTE(review): this also hides deprecation warnings raised by
# the imported libraries -- consider narrowing the filter.
warnings.filterwarnings('ignore')
# Shared spaCy English language object; tokenize() calls it on each input text.
parser = English()

In [2]:
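# One-time setup: uncomment and run once on a fresh environment, then re-comment.
# The WordNet and stopwords corpora are read by the lemmatisation and stop-word cells below.
# nltk.download('wordnet')
# nltk.download('stopwords')
# spacy.load('en')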


In [3]:
def tokenize(text):
    """Turn ``text`` into a list of normalised token strings.

    The text is run through the module-level ``parser``. For each token:

    * whitespace-only tokens are dropped;
    * tokens flagged as URL-like become the placeholder ``'URL'``;
    * tokens starting with ``'@'`` become the placeholder ``'SCREEN_NAME'``;
    * every other token is replaced by its lower-cased form.

    Returns the resulting list of strings in document order.
    """
    def normalise(token):
        # URL check comes first, matching the precedence of the placeholders.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [normalise(token) for token in parser(text) if not token.orth_.isspace()]

In [4]:
def get_lemma(word):
    """Return the base form of ``word`` as reported by ``wn.morphy``.

    When ``wn.morphy`` returns ``None``, ``word`` is returned unchanged.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

In [5]:
def get_lemma2(word):
    """Lemmatise ``word`` with a newly created ``WordNetLemmatizer``.

    Alternative to ``get_lemma``; a fresh lemmatiser instance is built on every call.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

In [6]:
# English stop words from NLTK, held in a set so that the membership test in
# prepare_text_for_lda() is a hash lookup rather than a list scan.
english_stopword_list = nltk.corpus.stopwords.words('english')
en_stop = set(english_stopword_list)


In [7]:
def prepare_text_for_lda(text):
    """Convert raw ``text`` into the token list used to build the LDA corpus.

    The text is tokenised with ``tokenize``; tokens of four characters or
    fewer and tokens found in ``en_stop`` are discarded; each surviving token
    is passed through ``get_lemma``.

    Returns the list of processed tokens in their original order.
    """
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > 4 and token not in en_stop
    ]

In [8]:
# A line is kept when its random draw exceeds this threshold, i.e. roughly 1%
# of the input lines end up in the corpus.
SAMPLE_THRESHOLD = .99
# Private, seeded generator: the same lines are selected on every
# Restart & Run All, and the global `random` state is left untouched.
sample_rng = random.Random(42)

text_data = []
with open('data/dataset.csv') as f:
    for line in f:
        # Draw before preprocessing so the ~99% of lines that are discarded
        # never pay for tokenisation and lemmatisation.
        if sample_rng.random() > SAMPLE_THRESHOLD:
            tokens = prepare_text_for_lda(line)
            print(tokens)
            # NOTE(review): only the sampled lines are appended, so the model
            # is trained on ~1% of the file. If the sampling was meant only to
            # limit what gets printed, move this append out of the `if`.
            text_data.append(tokens)

['rigorous', 'integration', 'piece', 'linear', 'continuous', 'system']
['challenge', 'recent', 'advance', 'system', 'design']
['efficient', 'information', 'collection', 'protocol', 'large', 'scale', 'system']
['jitter', 'clock', 'synthesizer', 'locking', 'range']
['sup>2</sup', 'parallel', 'pipelined', 'efficient', 'architecture', 'discrete', 'wavelet', 'transform']
['generic', 'sensing', 'hardware', 'reconstruction', 'structure', 'analog', 'signal']
['power', 'filter', 'wireless', 'receiver', 'application', 'automatic', 'tuning', 'system']
['scalable', 'query', 'services', 'fuzzy', 'ontology']
['b+-tree', 'index', 'optimization', 'exploit', 'internal', 'parallelism', 'flash', 'base', 'solid', 'state', 'drive']
['transaction', 'base', 'application', 'error', 'recovery', 'point', 'query']
['secure', 'pseudo', 'random', 'sequence', 'generation', 'using', 'couple', 'linear', 'congruential', 'generator']
['manage', 'semantic', 'heterogeneity', 'production', 'rule', 'persistent', 'queue']
[

In [9]:
# Assign an integer id to every distinct token, then encode each document as a
# bag-of-words list of (token_id, count) pairs.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

# Persist both artefacts so the visualisation cells can reload them from disk.
# The context manager guarantees the pickle file is flushed and closed.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [10]:
# Train the 5-topic model, save it for the visualisation cell, and print the
# four highest-weighted words of every topic.
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=42,  # fixed seed so re-running on the same corpus reproduces the topics
)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.027*"base" + 0.027*"network" + 0.015*"design" + 0.015*"system"')
(1, '0.025*"base" + 0.025*"using" + 0.025*"policy" + 0.014*"algorithm"')
(2, '0.029*"system" + 0.016*"application" + 0.016*"filter" + 0.016*"wireless"')
(3, '0.014*"hardware" + 0.014*"reconstruction" + 0.014*"generic" + 0.014*"electronic"')
(4, '0.042*"efficient" + 0.029*"system" + 0.016*"query" + 0.016*"sup>2</sup"')


In [11]:
# Score a title that was not part of the training sample against whichever
# model `ldamodel` currently refers to.
new_doc = prepare_text_for_lda('Practical Bayesian Optimization of Machine Learning Algorithms')

# Encode the title with the training dictionary and show the (token_id, count) pairs.
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)

# Per-topic probabilities assigned to the title.
new_doc_topics = ldamodel.get_document_topics(new_doc_bow)
print(new_doc_topics)

[(54, 1), (114, 1), (127, 1)]
[(0, 0.4325488), (1, 0.41683236), (2, 0.050020725), (3, 0.05057791), (4, 0.050020214)]


In [12]:
# Train the 3-topic model, save it for the visualisation cell, and print the
# four highest-weighted words of every topic. This rebinds `ldamodel`.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=3,
    id2word=dictionary,
    passes=15,
    random_state=42,  # fixed seed so re-running on the same corpus reproduces the topics
)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.027*"system" + 0.021*"base" + 0.021*"using" + 0.015*"algorithm"')
(1, '0.026*"system" + 0.019*"large" + 0.019*"network" + 0.019*"efficient"')
(2, '0.011*"generation" + 0.011*"linear" + 0.011*"query" + 0.011*"object"')


In [13]:
# Train the 10-topic model, save it for the visualisation cell, and print the
# four highest-weighted words of every topic. This rebinds `ldamodel`.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=15,
    random_state=42,  # fixed seed so re-running on the same corpus reproduces the topics
)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.034*"base" + 0.018*"algorithm" + 0.018*"exploit" + 0.018*"parallelism"')
(1, '0.049*"using" + 0.026*"active" + 0.026*"oximetry" + 0.026*"pulse"')
(2, '0.005*"method" + 0.005*"system" + 0.005*"spatial" + 0.005*"phishing"')
(3, '0.022*"algorithm" + 0.022*"query" + 0.022*"recovery" + 0.022*"point"')
(4, '0.046*"large" + 0.046*"network" + 0.024*"efficient" + 0.024*"scale"')
(5, '0.047*"system" + 0.032*"base" + 0.032*"design" + 0.017*"control"')
(6, '0.036*"generation" + 0.036*"using" + 0.036*"linear" + 0.036*"secure"')
(7, '0.061*"policy" + 0.032*"system" + 0.032*"application" + 0.032*"filter"')
(8, '0.045*"socialfilter" + 0.045*"mitigation" + 0.045*"trust" + 0.045*"introduce"')
(9, '0.037*"system" + 0.037*"piece" + 0.037*"rigorous" + 0.037*"continuous"')


In [14]:
# Reload the persisted artefacts so this cell works without re-running training.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# SECURITY: unpickling can execute arbitrary code -- only load a corpus.pkl
# that this notebook wrote itself. The context manager closes the file handle.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

# Build the interactive view of the 5-topic model; sort_topics=False is passed
# through to pyLDAvis unchanged.
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
# Kept as the last expression so the notebook renders the visualisation.
pyLDAvis.display(lda_display)

In [15]:
# Interactive view of the saved 3-topic model, built from the `corpus` and
# `dictionary` already loaded in the kernel.
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(
    topic_model=lda3,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
# Kept as the last expression so the notebook renders the visualisation.
pyLDAvis.display(lda_display3)

In [16]:
# Interactive view of the saved 10-topic model, built from the `corpus` and
# `dictionary` already loaded in the kernel.
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(
    topic_model=lda10,
    corpus=corpus,
    dictionary=dictionary,
    sort_topics=False,
)
# Kept as the last expression so the notebook renders the visualisation.
pyLDAvis.display(lda_display10)