# Websites that I basically just copied/followed walkthrough from:
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [2]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from nltk.corpus import stopwords
import traitlets
from IPython.display import display
from ipywidgets import widgets
from tkinter import Tk, filedialog
import en_core_web_sm
nlp = en_core_web_sm.load()

In [3]:
class SelectFilesButton(widgets.Button):
    """A file widget that leverages tkinter.filedialog."""

    def __init__(self, *args, **kwargs):
        """Initialize the SelectFilesButton class."""
        super(SelectFilesButton, self).__init__(*args, **kwargs)
        # Add the selected_files trait
        self.add_traits(files=traitlets.traitlets.List())
        # Create the button.
        self.description = "Select Files"
        self.icon = "square-o"
        self.style.button_color = "orange"
        # Set on click behavior.
        self.on_click(self.select_files)

    @staticmethod
    def select_files(b):
        """Generate instance of tkinter.filedialog.
        Parameters
        ----------
        b : obj:
            An instance of ipywidgets.widgets.Button
        """
        # Create Tk root
        root = Tk()
        # Hide the main window
        root.withdraw()
        # Raise the root to the top of all windows.
        root.call('wm', 'attributes', '.', '-topmost', True)
        # List of selected fileswill be set to b.value
        b.files = filedialog.askopenfilename(multiple=True)

        b.description = "Files Selected"
        b.icon = "check-square-o"
        b.style.button_color = "lightgreen"
my_button = SelectFilesButton()
my_button

SelectFilesButton(description='Select Files', icon='square-o', style=ButtonStyle(button_color='orange'))

In [4]:
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [5]:
sentence_list = pd.read_csv(my_button.files[0])['Input.text'].tolist()

In [6]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(sentence_list))

In [7]:
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [8]:
trigram[data_words[125]]

['can_seem',
 'to',
 'get',
 'caught',
 'up',
 'on',
 'anything',
 'around',
 'the',
 'house']

In [9]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['bumper', 'bumper', 'traffic', 'drive', 'nut', 'ever', 'move']]


In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1)]]


In [12]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('bumper', 2),
  ('drive', 1),
  ('ever', 1),
  ('move', 1),
  ('nut', 1),
  ('traffic', 1)]]

In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=9, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.426*"work" + 0.052*"go" + 0.036*"school" + 0.033*"try" + 0.020*"start" + '
  '0.019*"thing" + 0.017*"kid" + 0.016*"seem" + 0.016*"give" + 0.015*"be"'),
 (1,
  '0.209*"stress" + 0.080*"time" + 0.033*"find" + 0.032*"crazy" + '
  '0.025*"worry" + 0.024*"recent" + 0.020*"today" + 0.019*"take" + 0.019*"may" '
  '+ 0.018*"husband"'),
 (2,
  '0.116*"money" + 0.092*"boss" + 0.078*"stressful" + 0.062*"much" + '
  '0.038*"week" + 0.032*"deadline" + 0.031*"family" + 0.028*"stressed" + '
  '0.027*"life" + 0.026*"hour"'),
 (3,
  '0.160*"job" + 0.076*"lately" + 0.044*"worried" + 0.044*"make" + '
  '0.038*"situation" + 0.037*"day" + 0.028*"busy" + 0.024*"big" + 0.019*"help" '
  '+ 0.017*"enough"'),
 (4,
  '0.141*"get" + 0.087*"really" + 0.034*"want" + 0.033*"recently" + '
  '0.032*"right" + 0.031*"keep" + 0.025*"stuff" + 0.024*"new_job" + '
  '0.019*"break" + 0.017*"talk"'),
 (5,
  '0.040*"bad" + 0.032*"coworker" + 0.031*"buy" + 0.028*"year" + '
  '0.026*"figure" + 0.025*"enough_time" + 0.0

In [15]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -7.076014654506841

Coherence Score:  0.6182452580128681


In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [33]:
def do_it_all(sentence_list):
    data_words = list(sent_to_words(sentence_list))
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    
        # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=9, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    perplexity = lda_model.log_perplexity(corpus)
    return coherence_lda, perplexity

In [24]:
data_frame[:0]

Unnamed: 0,Input.text,is_stressor,is_stressor_conf,top_label,second_label,avg_severity,median_severity,SD_severity,Votes,Other,Everyday Decision Making,Work,Social Relationships,Financial Problem,"Health, Fatigue, or Physical Pain",Emotional Turmoil,Family Issues,School,Source


In [28]:
data_frame = pd.read_csv(my_button.files[0])
sentence_dict = {}
for i, row in data_frame.iterrows():
    if row['top_label'] not in sentence_dict:
        sentence_dict[row['top_label']] = []
    sentence_dict[row['top_label']].append(row['Input.text'])
sentence_dict

{'Everyday Decision Making': ['this bumper to bumper traffic is driving me nuts. will it ever move?',
  'packing for my trips',
  'getting dinner ready for christmas was an incredible challenge!',
  'well, you know how busy the christmas season has been... so much to do!!',
  'i just burned dinner and i have no time to make another.',
  'not having enough time in my day to get everything done and just being forgetful.',
  'literally trying to figure out my life in like three weeks lol',
  'hi friend. give me some idea preparing a meal. this makes me streeful in deciding. give',
  'rushed for time and frustrated trying to fix dinner meal',
  "i'm not sure what i can make other than pasta for dinner",
  "the washing machine is broken and i'm really behind on laundry.",
  "i can't decide what to cook",
  'i dont know what i want for dinner!',
  'having trouble choosing meals for the week=getting everyone to agree!',
  "i can't figure out what to cook for dinner tonight",
  'i had a stress

In [34]:
perp_coher_dict = {}
for key in sentence_dict:
    print(key)
    perplexity, coherence = do_it_all(sentence_dict[key])
    print(perplexity)
    print(coherence)
    perp_coher_dict[key] = (perplexity, coherence)

Everyday Decision Making
0.5174985290681113
-6.692375377518451
Work
0.621771529270715
-6.362156521523999
Other
0.6288803228187319
-6.871340214920805
Financial Problem
0.5560582210030645
-6.019651740617766
Health, Fatigue, or Physical Pain
0.5232453446318873
-6.411790882606828
Emotional Turmoil
0.4562448301898699
-6.121067112292385
Family Issues
0.5621992013745302
-6.5610873569118775
Social Relationships
0.5477731193303987
-6.4972188039317
School
0.5759366775501313
-5.764681655714909


In [35]:
perp_coher_dict

{'Everyday Decision Making': (0.5174985290681113, -6.692375377518451),
 'Work': (0.621771529270715, -6.362156521523999),
 'Other': (0.6288803228187319, -6.871340214920805),
 'Financial Problem': (0.5560582210030645, -6.019651740617766),
 'Health, Fatigue, or Physical Pain': (0.5232453446318873, -6.411790882606828),
 'Emotional Turmoil': (0.4562448301898699, -6.121067112292385),
 'Family Issues': (0.5621992013745302, -6.5610873569118775),
 'Social Relationships': (0.5477731193303987, -6.4972188039317),
 'School': (0.5759366775501313, -5.764681655714909)}