# Websites whose walkthroughs I basically copied/followed:
https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import LsiModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)
from nltk.corpus import stopwords
import traitlets
from IPython.display import display
from ipywidgets import widgets
from tkinter import Tk, filedialog
import en_core_web_sm
nlp = en_core_web_sm.load()

In [2]:
class SelectFilesButton(widgets.Button):
    """A button widget that opens a tkinter file dialog when clicked.

    The paths chosen in the dialog are stored on the instance's ``files``
    trait (a list of path strings).
    """

    def __init__(self, *args, **kwargs):
        """Initialize the SelectFilesButton class."""
        super(SelectFilesButton, self).__init__(*args, **kwargs)
        # Add the trait that will hold the selected file paths.
        self.add_traits(files=traitlets.List())
        # Create the button.
        self.description = "Select Files"
        self.icon = "square-o"
        self.style.button_color = "orange"
        # Set on click behavior.
        self.on_click(self.select_files)

    @staticmethod
    def select_files(b):
        """Open a tkinter file dialog and store the chosen paths on ``b.files``.

        If the dialog is cancelled (nothing chosen), ``b.files`` and the
        button's appearance are left untouched.

        Parameters
        ----------
        b : obj:
            An instance of ipywidgets.widgets.Button
        """
        # Create Tk root
        root = Tk()
        try:
            # Hide the main window
            root.withdraw()
            # Raise the root to the top of all windows.
            root.call('wm', 'attributes', '.', '-topmost', True)
            selected = filedialog.askopenfilenames()
        finally:
            # Always tear the hidden root down; otherwise every click leaves
            # another Tk root alive in the kernel process.
            root.destroy()

        # A cancelled dialog yields an empty result: do not claim success.
        if not selected:
            return

        # The list of selected files is stored on b.files.
        b.files = list(selected)

        b.description = "Files Selected"
        b.icon = "check-square-o"
        b.style.button_color = "lightgreen"
# Create the file-picker button; the bare expression on the last line makes it the cell's output.
my_button = SelectFilesButton()
my_button

SelectFilesButton(description='Select Files', icon='square-o', style=ButtonStyle(button_color='orange'))

In [3]:
# NLTK's English stopword list followed by a few extra words to filter out.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

In [4]:
# The CSV path comes from the file-picker widget, so fail with a clear message
# if this cell is run before a file has been chosen.
if not my_button.files:
    raise RuntimeError("No file selected: click the 'Select Files' button and choose a CSV first.")

data_frame = pd.read_csv(my_button.files[0])

# Map each 'top_label' value to the list of its 'Input.text' entries.
# sort=False keeps labels in order of first appearance, as the row-by-row loop did.
# Rows whose 'top_label' is missing are left out by groupby.
sentence_dict = (
    data_frame
    .groupby('top_label', sort=False)['Input.text']
    .apply(list)
    .to_dict()
)

In [4]:
def sent_to_words(sentences):
    """Lazily tokenize each sentence with gensim's ``simple_preprocess``.

    Every item is converted with ``str`` first; ``deacc=True`` is passed so
    accent marks are stripped from the tokens. Yields one token list per
    input sentence.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        for raw_sentence in sentences
    )

In [11]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` re-tokenized with every stopword removed.

    Each document is converted with ``str`` and passed through
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    are dropped. Returns a list with one token list per document.
    """
    # Build the set once per call: membership tests against the raw list are
    # linear in its length and would be repeated for every token.
    stopword_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stopword_set]
        for doc in texts
    ]

def make_bigrams(texts, bigram_mod):
    """Apply ``bigram_mod`` to every document and collect the results in a list.

    ``bigram_mod`` only needs to support indexing with a document
    (``bigram_mod[doc]``).
    """
    transformed = []
    for tokens in texts:
        transformed.append(bigram_mod[tokens])
    return transformed

def make_trigrams(texts, bigram_mod=None, trigram_mod=None):
    """Apply the bigram model and then the trigram model to every document.

    Parameters
    ----------
    texts : iterable
        Tokenized documents.
    bigram_mod, trigram_mod : optional
        Objects supporting ``model[doc]``. When omitted, the module-level
        names ``bigram_mod`` / ``trigram_mod`` are used, which is what this
        function relied on before the parameters were added.

    Returns
    -------
    list
        ``trigram_mod[bigram_mod[doc]]`` for each document.

    Raises
    ------
    NameError
        If a model is neither passed in nor defined at module level.
    """
    docs = list(texts)
    if not docs:
        # Nothing to transform, so no model is needed.
        return []

    if bigram_mod is None or trigram_mod is None:
        module_scope = globals()
        try:
            if trigram_mod is None:
                trigram_mod = module_scope['trigram_mod']
            if bigram_mod is None:
                bigram_mod = module_scope['bigram_mod']
        except KeyError as missing:
            raise NameError(
                "name %s is not defined; pass bigram_mod and trigram_mod explicitly" % missing
            ) from None

    return [trigram_mod[bigram_mod[doc]] for doc in docs]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize tokenized texts, keeping only tokens with an allowed POS tag.

    Each token list is joined with spaces and run through the module-level
    ``nlp`` pipeline; the lemma of every token whose ``pos_`` is in
    ``allowed_postags`` is kept. Returns one list of lemmas per input text.

    See https://spacy.io/api/annotation
    """
    def _allowed_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [_allowed_lemmas(tokens) for tokens in texts]

In [17]:
def do_it_all(sentence_list):
    """Fit a single-topic LDA model on ``sentence_list`` and score it.

    Steps: tokenize the sentences, learn a bigram phrase model from those
    same tokens, remove stopwords, merge bigrams, lemmatize, build the
    dictionary and bag-of-words corpus, train the LDA model, then compute
    its scores.

    Parameters
    ----------
    sentence_list : iterable
        The raw sentences to model.

    Returns
    -------
    tuple
        ``(coherence_lda, perplexity)`` in that order: the ``c_v`` value from
        ``CoherenceModel.get_coherence()`` followed by the value of
        ``lda_model.log_perplexity(corpus)``.
    """
    data_words = list(sent_to_words(sentence_list))

    # The phrase model is trained only on the sentences passed in.
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=10)  # higher threshold fewer phrases.
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    # No trigram model is built here: one used to be trained but was never applied.

    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

    # Do lemmatization keeping only noun, adj, vb, adv.
    # lemmatization() runs the module-level `nlp` pipeline, so no spaCy model is
    # loaded here; a model loaded into a local variable would never be used by it.
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=1,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    perplexity = lda_model.log_perplexity(corpus)
    return coherence_lda, perplexity

In [19]:
# do_it_all returns (coherence, perplexity) in that order, so unpack it that way.
# The stored tuple keeps the same order as before: coherence first, then perplexity.
perp_coher_dict = {}
for key in sentence_dict:
    coherence, perplexity = do_it_all(sentence_dict[key])
    perp_coher_dict[key] = (coherence, perplexity)

In [20]:
# Each value is do_it_all's return tuple: the c_v coherence first, then the lda_model.log_perplexity value.
perp_coher_dict

{'Everyday Decision Making': (0.6746817714435754, -5.7390281318703575),
 'Work': (0.4910513589427044, -5.78980511883024),
 'Other': (0.6646450695488907, -6.1177688476188194),
 'Financial Problem': (0.4249728412042283, -5.377070625782013),
 'Health, Fatigue, or Physical Pain': (0.6063830948312298, -5.606957616768484),
 'Emotional Turmoil': (0.6733387787741211, -5.311907352650002),
 'Family Issues': (0.521829905760556, -5.8794804003364165),
 'Social Relationships': (0.6661313537669303, -5.611232774202214),
 'School': (0.5562748114676086, -5.001007715600436)}

### Below are the original numbers I sent you. Their bigrams were incorrect because they were built from the whole data set rather than from just the set of sentences being analyzed. The numbers in the dictionary above come from the function doing what it should do.

In [35]:
# Output of an earlier run, kept for comparison; presumably the same (coherence, log-perplexity) tuple order -- TODO confirm.
perp_coher_dict

{'Everyday Decision Making': (0.5174985290681113, -6.692375377518451),
 'Work': (0.621771529270715, -6.362156521523999),
 'Other': (0.6288803228187319, -6.871340214920805),
 'Financial Problem': (0.5560582210030645, -6.019651740617766),
 'Health, Fatigue, or Physical Pain': (0.5232453446318873, -6.411790882606828),
 'Emotional Turmoil': (0.4562448301898699, -6.121067112292385),
 'Family Issues': (0.5621992013745302, -6.5610873569118775),
 'Social Relationships': (0.5477731193303987, -6.4972188039317),
 'School': (0.5759366775501313, -5.764681655714909)}