In [7]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()

def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [8]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rothschild/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [4]:
for w in ['dogs','ran','discouraged']:
    print(w, get_lemma(w), get_lemma2(w))

dogs dog dog
ran run ran
discouraged discourage discouraged


In [5]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rothschild\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [14]:
import random
text_data = []
with open('Data\dataset.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['scalable', 'algorithm', 'maximize', 'range', 'spatial', 'database']
['multiuser', 'detection', 'base', 'grover', 'algorithm']
['efficient', 'implementation', 'relational']
['scalable', 'secret', 'generation', 'exploit', 'channel', 'phase', 'randomness', 'wireless', 'network']
['novel', 'feature', 'extraction', 'algorithm', 'classification', 'flight', 'call']
['similarity']
['analysis', 'design', 'lump', 'element', 'quadrature', 'coupler', 'lossy', 'passive', 'elements']
['potential', 'structure', 'aggregation', 'sensor', 'network']
['object', 'database', 'morphology']
['network', 'correlate', 'gathering']
['economic', 'effect', 'prefix', 'deaggregation']
['performance', 'cavlc', 'encoder', 'design', 'mpeg-4', 'h.264', 'video', 'coding', 'application']
['overview', 'management', 'platform', 'digital', 'advertising']
['mapping', 'optimize', 'conjunction', 'base', 'scalable', 'transcoder']
['universal', 'embed', 'compression', 'engine', 'system', 'expansion', 'progressive', 'wavelet', '

In [15]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [16]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [17]:
import pickle
pickle.dump(corpus, open('corpus.pkl','wb'))
dictionary.save('dictionary.gensim')

## Try 5 topics

In [18]:
import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [19]:
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.039*"scalable" + 0.039*"network" + 0.021*"design" + 0.021*"coupler"')
(1, '0.062*"algorithm" + 0.042*"base" + 0.023*"performance" + 0.023*"analysis"')
(2, '0.039*"management" + 0.039*"digital" + 0.039*"overview" + 0.039*"advertising"')
(3, '0.039*"search" + 0.039*"system" + 0.039*"coding" + 0.039*"progressive"')
(4, '0.026*"database" + 0.026*"spatial" + 0.026*"scalable" + 0.026*"supporting"')


In [20]:
new_doc = 'Practical Bayesian Optimization of Machine Learning Algorithms'
new_doc = prepare_text_for_lda(new_doc)
new_doc_bow = dictionary.doc2bow(new_doc)
print(new_doc_bow)
print(ldamodel.get_document_topics(new_doc_bow))

[(0, 1)]
[(0, 0.10000595), (1, 0.59888273), (2, 0.10001081), (3, 0.10000595), (4, 0.10109453)]


In [21]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 3, id2word=dictionary, passes=15)
ldamodel.save('model3.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.030*"system" + 0.030*"scalable" + 0.030*"spatial" + 0.030*"network"')
(1, '0.037*"algorithm" + 0.026*"base" + 0.026*"design" + 0.026*"analysis"')
(2, '0.044*"search" + 0.031*"scalable" + 0.018*"management" + 0.018*"network"')


In [22]:
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=15)
ldamodel.save('model10.gensim')
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.054*"using" + 0.028*"application" + 0.028*"video" + 0.028*"cavlc"')
(1, '0.058*"molecule" + 0.058*"mining" + 0.058*"information" + 0.058*"chemical"')
(2, '0.034*"progressive" + 0.034*"system" + 0.034*"coding" + 0.034*"compression"')
(3, '0.034*"algorithm" + 0.034*"performance" + 0.034*"base" + 0.034*"optical"')
(4, '0.091*"search" + 0.048*"multi" + 0.048*"result" + 0.048*"interactive"')
(5, '0.078*"management" + 0.041*"database" + 0.041*"spatial" + 0.041*"algorithm"')
(6, '0.079*"network" + 0.079*"correlate" + 0.079*"gathering" + 0.007*"transcoder"')
(7, '0.041*"algorithm" + 0.041*"design" + 0.041*"analysis" + 0.041*"passive"')
(8, '0.055*"scalable" + 0.055*"network" + 0.055*"secret" + 0.055*"channel"')
(9, '0.009*"similarity" + 0.009*"scalable" + 0.009*"network" + 0.009*"database"')


## pyLDAvis

In [23]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

In [24]:
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [25]:
lda3 = gensim.models.ldamodel.LdaModel.load('model3.gensim')
lda_display3 = pyLDAvis.gensim.prepare(lda3, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [26]:
lda10 = gensim.models.ldamodel.LdaModel.load('model10.gensim')
lda_display10 = pyLDAvis.gensim.prepare(lda10, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
