In [1]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

[nltk_data] Downloading package wordnet to /home/hanl9/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to /home/hanl9/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

In [5]:
import random
text_data = []
with open('heart_attack4.csv') as f:
    for line in f:
        tokens = prepare_text_for_lda(line)
        if random.random() > .99:
            print(tokens)
            text_data.append(tokens)

['medical', 'tooth', 'mount', 'sensor', 'monitoring', 'teeth', 'level', 'using', 'bacterial', 'optimization', 'along', 'adaptive', 'learning', 'neural', 'network', 'development', 'technology', 'create', 'change', 'processing', 'lead', 'alter', 'human', 'style', 'change', 'habit', 'attention', 'dietary', 'habit', 'increase', 'prevalence', 'disease', 'obesity', 'heart', 'attack', 'diabetic', 'hence', 'important', 'monitor', 'consumption', 'avoid', 'health', 'issue', 'associate', 'purpose', 'medical', 'tooth', 'place', 'teeth', 'filling', 'bonding', 'process', 'electrochemical', 'sensor', 'collect', 'information', 'consume', 'sugar', 'gather', 'information', 'analyze', 'ass', 'quality', 'intake', 'collect', 'information', 'process', 'using', 'bacterial', 'optimization', 'along', 'adaptive', 'learning', 'neural', 'network', 'examine', 'collect', 'information', 'using', 'learning', 'process', 'device', 'embed', 'teeth', 'reduce', 'difficulty', 'mastication', 'excellence', 'device', 'base', 

['novel', 'wearable', 'electrode', 'base', 'conductive', 'chitosan', 'fabric', 'application', 'smart', 'garment', 'smart', 'garment', 'capture', 'electrocardiogram', 'signal', 'location', 'alert', 'others', 'heart', 'attack', 'prevent', 'sudden', 'cardiac', 'death', 'people', 'sleeping', 'walking', 'running', 'novel', 'wearable', 'electrode', 'smart', 'garment', 'base', 'conductive', 'chitosan', 'fabric', 'fabricate', 'electroless', 'plating', 'silver', 'nanoparticles', 'surface', 'fiber', 'electrical', 'resistance', 'relate', 'silver', 'content', 'composite', 'fabric', '0.0332', '0.0041', 'strong', 'reactivity', 'amine', 'group', 'silver', 'washing', 'fabric', 'eight', 'times', 'electrical', 'resistance', 'remain', 'conductive', 'chitosan', 'fabric', 'apply', 'smart', 'garment', 'wearable', 'electrode', 'capture', 'electrocardiogram', 'signal', 'human', 'static', 'state', 'jogging', 'state', 'running', 'state', 'show', 'acquisition', 'ability', 'sensitivity', 'author']
['study', 'acqu

['efficacy', 'community', 'base', 'physical', 'activity', 'program', 'km2h2', 'stroke', 'heart', 'attack', 'prevention', 'among', 'senior', 'hypertensive', 'patient', 'cluster', 'randomize', 'control', 'phase', 'trial', 'objective', 'evaluate', 'efficacy', 'program', 'move', 'toward', 'healthy', 'heart', 'healthy', 'brain', 'km2h2', 'encourage', 'physical', 'activity', 'prevention', 'heart', 'attack', 'stroke', 'among', 'hypertensive', 'patient', 'enrol', 'community', 'base', 'hypertension', 'control', 'program', 'cbhcp', 'design', 'cluster', 'randomize', 'control', 'trial', 'three', 'wave', 'longitudinal', 'assessment', 'baseline', 'month', 'intervention', 'setting', 'community', 'base', 'patient', 'center', 'behavioral', 'intervention', 'urban', 'setting', 'china', 'participant', 'total', 'participant', 'diagnose', 'hypertension', 'community', 'health', 'center', 'wuhan', 'china', 'recruit', 'randomly', 'assign', 'center', 'receive', 'either', 'km2h2', 'standard', 'cbhcp', 'center', 

In [6]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)

In [7]:
corpus = [dictionary.doc2bow(text) for text in text_data]

In [8]:
import pickle
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [9]:

import gensim
NUM_TOPICS = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')

In [10]:

topics = ldamodel.print_topics(num_words=4)
for topic in topics:
    print(topic)

(0, '0.017*"model" + 0.015*"heart" + 0.015*"intervention" + 0.014*"attack"')
(1, '0.016*"million" + 0.014*"hearts" + 0.014*"american" + 0.014*"cardiovascular"')
(2, '0.017*"health" + 0.017*"doctor" + 0.015*"coordination" + 0.013*"among"')
(3, '0.018*"search" + 0.013*"promotion" + 0.011*"aspirin" + 0.011*"solar"')
(4, '0.024*"evidence" + 0.023*"system" + 0.020*"group" + 0.019*"study"')


In [11]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=False)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [12]:
from gensim.models import CoherenceModel


In [13]:
print('\nPerplexity: ', ldamodel.log_perplexity(corpus))  


Perplexity:  -6.4665916928298905
