In [1]:
import spacy
from spacy.lang.en import English

# Blank English pipeline: tokenize() only needs the rule-based tokenizer, so
# no trained model is loaded. The earlier `spacy.load('en')` call discarded
# its result, and the 'en' shortcut name does not resolve in spaCy v3.
parser = English()
def tokenize(text):
    """Split ``text`` into normalized tokens using the module-level ``parser``.

    Whitespace-only tokens are dropped. URL-like tokens become the
    placeholder 'URL', tokens starting with '@' become 'SCREEN_NAME', and
    every other token is returned in its lower-cased form.

    Returns:
        list of str, in the order the tokens appear in ``text``.
    """
    def _normalize(tok):
        # The URL check runs before the '@' check, as in the branch order
        # this helper encodes.
        if tok.like_url:
            return 'URL'
        if tok.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return tok.lower_

    return [_normalize(tok) for tok in parser(text) if not tok.orth_.isspace()]

In [2]:
import nltk
# Download the WordNet corpus data that the `wn` reader imported below (and
# the lemma helpers that use it) depend on.
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    """Return the WordNet base form of ``word``.

    Falls back to ``word`` itself when ``wn.morphy`` returns None.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    """Lemmatize ``word`` with a freshly constructed ``WordNetLemmatizer``."""
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

[nltk_data] Downloading package wordnet to /home/hanl9/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
# Download NLTK's stopword lists, then keep the English list as a set so the
# `token not in en_stop` filter in prepare_text_for_lda is a hash lookup.
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/hanl9/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text, min_length=4):
    """Turn raw text into the list of lemmas used as one LDA document.

    Args:
        text: Raw document text; it is passed to ``tokenize``.
        min_length: A token is kept only if it is strictly longer than this
            many characters. Defaults to 4, the cutoff that used to be
            hard-coded, so existing one-argument calls behave the same.

    Returns:
        list of str: ``get_lemma`` applied to every token that passes the
        length filter and is not in ``en_stop``, in original order.
    """
    # Filtering happens on the raw token, before lemmatization. With the
    # default cutoff the 3-character 'URL' placeholder from tokenize() is
    # dropped, while 'SCREEN_NAME' is kept.
    return [
        get_lemma(token)
        for token in tokenize(text)
        if len(token) > min_length and token not in en_stop
    ]

In [6]:
import random

# Fraction of input lines kept as documents; all other lines are skipped.
SAMPLE_RATE = 0.01
# Fixed seed so that re-running this cell selects the same lines and
# therefore rebuilds the same dictionary and corpus downstream.
SAMPLE_SEED = 42

sampler = random.Random(SAMPLE_SEED)
text_data = []
with open('cloud_computing2.csv') as f:
    for line in f:
        # Decide first, so the tokenizer only runs on lines that are kept.
        if sampler.random() >= SAMPLE_RATE:
            continue
        tokens = prepare_text_for_lda(line)
        print(tokens)
        text_data.append(tokens)

['crosstalk', 'aware', 'route', 'spectrum', 'assignment', 'using', 'partial', 'equalization', 'flexible', 'datacenter', 'network', 'recently', 'datacenters', 'build', 'enhance', 'reliability', 'expand', 'processing', 'ability', 'computing', 'service', 'cloud', 'computing', 'furthermore', 'spatial', 'division', 'multiplexing', 'technology', 'elastic', 'optical', 'network', 'consider', 'candidate', 'solution', 'solve', 'requirement', 'bandwidth', 'flexible', 'connection', 'however', 'difficult', 'fiber', 'base', 'exist', 'modal', 'crosstalk', 'among', 'different', 'mode', 'face', 'challenge', 'propose', 'route', 'spectrum', 'assignment', 'algorithm', 'decrease', 'blocking', 'probability', 'increase', 'spectrum', 'utilization', 'problem', 'efficiently', 'propose', 'method', 'call', 'flexible', 'slide', 'three', 'dimensional', 'block', 'block', 'represent', 'dimension', 'spectrum', 'space', 'algorithm', 'block', 'slide', 'resource', 'space', 'proper', 'resource', 'simulation', 'conduct', '

['toward', 'ultra', 'power', 'remote', 'health', 'monitoring', 'optimal', 'adaptive', 'compress', 'sensing', 'framework', 'activity', 'recognition', 'activity', 'recognition', 'important', 'component', 'behavioral', 'monitoring', 'intervention', 'attract', 'enormous', 'attention', 'especially', 'mobile', 'cloud', 'computing', 'remote', 'health', 'monitoring', 'paradigm', 'recently', 'resource', 'constrain', 'wearable', 'devices', 'gain', 'popularity', 'battery', 'limited', 'constrain', 'frequent', 'wireless', 'transmission', 'computationally', 'powerful', 'paper', 'propose', 'ultra', 'power', 'activity', 'recognition', 'system', 'using', 'novel', 'adaptive', 'compress', 'sensing', 'technique', 'minimize', 'transmission', 'costs', 'coarse', 'grain', 'sensor', 'localization', 'unsupervised', 'clustering', 'module', 'devise', 'autonomously', 'reconfigure', 'compress', 'sensing', 'module', 'power', 'saving', 'perform', 'thorough', 'heuristic', 'optimization', 'using', 'grammatical', 'evolu

['federate', 'hybrid', 'cloud', 'service', 'level', 'agreement', 'legal', 'issue', 'cloud', 'computing', 'adapt', 'various', 'sector', 'diversify', 'benefit', 'terms', 'application', 'services', 'different', 'cloud', 'delivery', 'model', 'architecture', 'performance', 'customize', 'offering', 'cloud', 'vendor', 'bound', 'provide', 'quality', 'services', 'cloud', 'tenant', 'promise', 'service', 'level', 'agreement', 'mutually', 'agree', 'party', 'despite', 'contract', 'range', 'legal', 'issue', 'surface', 'majority', 'exist', 'cloud', 'tenant', 'understand', 'federate', 'cloud', 'functionality', 'exclusion', 'newly', 'form', 'federate', 'cloud', 'model', 'vendor', 'standardization', 'performance', 'reliability', 'trust', 'author', 'highlight', 'various', 'legal', 'base', 'issue', 'example', 'federate', 'hybrid', 'multi', 'cloud', 'ecosystem', 'cloud', 'tenant', 'vendor', 'operational', 'complexity', 'security', 'breach', 'springer', 'nature', 'singapore']


In [8]:
from gensim import corpora
# Build the gensim vocabulary from the sampled, tokenized documents; the
# following cell calls its doc2bow method on each document.
dictionary = corpora.Dictionary(text_data)

In [9]:
# Convert every tokenized document with dictionary.doc2bow, keeping the
# documents in the same order as text_data.
corpus = list(map(dictionary.doc2bow, text_data))

In [10]:
import pickle

# Persist the corpus and the dictionary so later cells can reload them.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way.
with open('corpus.pkl', 'wb') as corpus_file:
    pickle.dump(corpus, corpus_file)
dictionary.save('dictionary.gensim')

In [11]:

import gensim

NUM_TOPICS = 5
# Fixed seed so repeated training runs on the same corpus give the same
# topics.
LDA_RANDOM_STATE = 42

# Train the LDA topic model on the bag-of-words corpus and save it to disk.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=NUM_TOPICS,
    id2word=dictionary,
    passes=15,
    random_state=LDA_RANDOM_STATE,
)
ldamodel.save('model5.gensim')

In [12]:

# Show each topic with its 4 highest-weighted words; every entry prints as a
# (topic_id, 'weight*"word" + ...') tuple.
topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.022*"cloud" + 0.022*"computing" + 0.019*"spectrum" + 0.013*"technology"')
(1, '0.023*"drone" + 0.017*"system" + 0.017*"application" + 0.013*"base"')
(2, '0.022*"system" + 0.017*"cloud" + 0.013*"model" + 0.012*"information"')
(3, '0.019*"power" + 0.015*"sensing" + 0.015*"percent" + 0.015*"recognition"')
(4, '0.039*"cloud" + 0.017*"computing" + 0.017*"level" + 0.014*"propose"')


In [13]:
# NOTE(review): on pyLDAvis 3.x this module appears to have been renamed to
# `pyLDAvis.gensim_models` — confirm against the installed version.
import pyLDAvis.gensim

# Reload the persisted artefacts so the visualisation is built from what is
# on disk, not from whatever happens to be in kernel memory.
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
# pickle.load can execute arbitrary code; only load a corpus.pkl that this
# notebook wrote. The context manager closes the file handle after loading.
with open('corpus.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')

lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [14]:
from gensim.models import CoherenceModel


In [15]:
# Print the value returned by ldamodel.log_perplexity for the current corpus.
# NOTE(review): the recorded run of this cell raised
# "IndexError: index 759 is out of bounds for axis 1 with size 759".
# Presumably `corpus` contains a term id that `ldamodel` was not trained
# with, e.g. the model and the corpus came from different random samples
# after cells were re-run out of order. Confirm with Restart Kernel -> Run All.
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  

IndexError: index 759 is out of bounds for axis 1 with size 759