In [1]:
import re
import numpy as np

In [2]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel



In [3]:
# for plotting
import pyLDAvis

In [5]:
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt

In [8]:
# prep NLTK Stop words
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

In [9]:
import pandas as pd
email_df = pd.read_csv ('data.csv')

In [10]:
# Convert email body to list
data = email_df.body.values.tolist()

In [11]:
# tokenize - break down each sentence into a list of words
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [12]:
data_words = list(sent_to_words(data))

In [13]:
data_words[0:2]

[['plan',
  'to',
  'attend',
  'the',
  'quarterly',
  'managing',
  'director',
  'meeting',
  'scheduled',
  'for',
  'monday',
  'october',
  'an',
  'agenda',
  'will',
  'be',
  'distributed',
  'at',
  'the',
  'meeting',
  'monday',
  'october',
  'hyatt',
  'regencydogwood',
  'room',
  'located',
  'on',
  'the',
  'rd',
  'floor',
  'video',
  'connection',
  'will',
  'be',
  'made',
  'from',
  'the',
  'london',
  'office',
  'please',
  'call',
  'if',
  'you',
  'have',
  'any',
  'questions',
  'joannie'],
 ['for',
  'is',
  'drop',
  'down',
  'box',
  'to',
  'storage',
  'in',
  'house',
  'analysismy',
  'files',
  'are',
  'the',
  'last',
  'three',
  'files',
  'when',
  'the',
  'dialog',
  'box',
  'asks',
  'to',
  'upadate',
  'links',
  'click',
  'no']]

In [14]:
from gensim.models.phrases import Phrases, Phraser


In [15]:
# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=100)

In [16]:
# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

In [17]:
# See trigram example
print(trigram_mod[bigram_mod[data_words[200]]])

['tax', 'are', 'continuing', 'work', 'to', 'ensure', 'our', 'new', 'online', 'trading', 'system', 'ubswenergy', 'is', 'ready', 'for', 'launch', 'on', 'the', 'week', 'of', 'feb', 'which', 'is', 'the', 'week', 'following', 'the', 'anticipated', 'enron', 'ubs_closing', 'date', 'of', 'friday', 'feb', 'to', 'assist', 'in', 'this', 'process', 'we', 'are', 'planning', 'an', 'online', 'trading', 'simulation', 'for', 'thursday', 'february', 'from', 'to', 'cst', 'it', 'is', 'planned', 'to', 'include', 'all', 'traders_originators', 'mid', 'back', 'office', 'staff', 'and', 'it', 'teams', 'who', 'would', 'normally', 'be', 'involved', 'with', 'electronic', 'trading', 'test', 'the', 'integrity', 'and', 'functionality', 'of', 'the', 'system', 'and', 'related_processes', 'including', 'credit', 'risk', 'legal', 'operations', 'etc', 'provide', 'an', 'opportunity', 'for', 'traders', 'to', 'verify', 'their', 'products', 'and', 'product', 'how', 'the', 'trading', 'system', 'and', 'processes', 'work', 'to', 

In [18]:
# remove stop_words, make bigrams and lemmatize
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]



In [19]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [20]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [21]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kaustubh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [24]:
pos_tagged_data_words_bigrams = []
for email in data_words_bigrams:
    pos_tagged_data_words_bigrams.append(nltk.pos_tag(email))

In [25]:
pos_tagged_data_words_bigrams

[[('plan', 'NN'),
  ('attend', 'VBP'),
  ('quarterly_managing', 'VBG'),
  ('director', 'NN'),
  ('meeting', 'NN'),
  ('scheduled', 'VBN'),
  ('monday', 'JJ'),
  ('october', 'NNP'),
  ('agenda', 'NN'),
  ('distributed', 'VBD'),
  ('meeting', 'NN'),
  ('monday', 'JJ'),
  ('october', 'NNP'),
  ('hyatt_regencydogwood', 'NN'),
  ('room', 'NN'),
  ('located', 'VBN'),
  ('rd_floor', 'JJ'),
  ('video_connection', 'NN'),
  ('made', 'VBD'),
  ('london', 'JJ'),
  ('office', 'NN'),
  ('please', 'NN'),
  ('call', 'NN'),
  ('questions', 'NNS'),
  ('joannie', 'NN')],
 [('drop', 'NN'),
  ('box', 'NN'),
  ('storage', 'NN'),
  ('house_analysismy', 'NN'),
  ('files', 'NNS'),
  ('last', 'JJ'),
  ('three', 'CD'),
  ('files', 'NNS'),
  ('dialog_box', 'VBP'),
  ('asks', 'JJ'),
  ('upadate_links', 'NNS'),
  ('click', 'VBP')],
 [('see', 'VB'),
  ('following', 'VBG'),
  ('instructions', 'NNS'),
  ('new', 'JJ'),
  ('messenging_board', 'NN'),
  ('youhave', 'VBP'),
  ('trouble_entering', 'VBG'),
  ('site', 'NN'),


In [27]:
#Above tags are a little bit confusing so we convert them to simpler tags with custom function 
#customized tagger function to convert tags to simpler form
def pos_tagger(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None


wordnet_pos_tagged = []

for p_tag in pos_tagged_data_words_bigrams:
    wordnet_pos_tagged.append(list(map(lambda x: (x[0], pos_tagger(x[1])), p_tag)))

wordnet_pos_tagged

[[('plan', 'n'),
  ('attend', 'v'),
  ('quarterly_managing', 'v'),
  ('director', 'n'),
  ('meeting', 'n'),
  ('scheduled', 'v'),
  ('monday', 'a'),
  ('october', 'n'),
  ('agenda', 'n'),
  ('distributed', 'v'),
  ('meeting', 'n'),
  ('monday', 'a'),
  ('october', 'n'),
  ('hyatt_regencydogwood', 'n'),
  ('room', 'n'),
  ('located', 'v'),
  ('rd_floor', 'a'),
  ('video_connection', 'n'),
  ('made', 'v'),
  ('london', 'a'),
  ('office', 'n'),
  ('please', 'n'),
  ('call', 'n'),
  ('questions', 'n'),
  ('joannie', 'n')],
 [('drop', 'n'),
  ('box', 'n'),
  ('storage', 'n'),
  ('house_analysismy', 'n'),
  ('files', 'n'),
  ('last', 'a'),
  ('three', None),
  ('files', 'n'),
  ('dialog_box', 'v'),
  ('asks', 'a'),
  ('upadate_links', 'n'),
  ('click', 'v')],
 [('see', 'v'),
  ('following', 'v'),
  ('instructions', 'n'),
  ('new', 'a'),
  ('messenging_board', 'n'),
  ('youhave', 'v'),
  ('trouble_entering', 'v'),
  ('site', 'n'),
  ('please', 'n'),
  ('let', 'v'),
  ('know_everyoneshould', '

In [28]:
def lemmatization(wordnet_pos_tagged):
    lemmatized_list = []
    for email_tokenized_list in wordnet_pos_tagged:
        temp = []
        for word, tag in email_tokenized_list:
            #if not tagged
            if tag is None:
                temp.append(word)
            #if tagged then lemmatize
            else:        
                temp.append(lemmatizer.lemmatize(word, tag))
        lemmatized_list.append(temp)
    return lemmatized_list


In [29]:
data_lemmatized = lemmatization(wordnet_pos_tagged)

In [30]:
data_lemmatized

[['plan',
  'attend',
  'quarterly_managing',
  'director',
  'meeting',
  'schedule',
  'monday',
  'october',
  'agenda',
  'distribute',
  'meeting',
  'monday',
  'october',
  'hyatt_regencydogwood',
  'room',
  'locate',
  'rd_floor',
  'video_connection',
  'make',
  'london',
  'office',
  'please',
  'call',
  'question',
  'joannie'],
 ['drop',
  'box',
  'storage',
  'house_analysismy',
  'file',
  'last',
  'three',
  'file',
  'dialog_box',
  'asks',
  'upadate_links',
  'click'],
 ['see',
  'follow',
  'instruction',
  'new',
  'messenging_board',
  'youhave',
  'trouble_entering',
  'site',
  'please',
  'let',
  'know_everyoneshould',
  'setup',
  'ready',
  'go',
  'add',
  'site',
  'one',
  'ofyour_favorites',
  'gas',
  'message',
  'board',
  'application',
  'run',
  'ie',
  'ie',
  'attach',
  'url',
  'user',
  'run',
  'application',
  'please',
  'feel_free',
  'forward',
  'url',
  'people',
  'like',
  'test_theapplication',
  'launch',
  'internet_explorercl

In [32]:
id2word = corpora.Dictionary(data_lemmatized)

In [33]:
texts = data_lemmatized

In [34]:
corpus = [id2word.doc2bow(text) for text in texts]

In [35]:
#Downloaded and unzipped mallet zip from the link: http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip

'wget' is not recognized as an internal or external command,
operable program or batch file.


In [36]:
mallet_path = 'mallet-2.0.8/bin/mallet'

In [37]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [38]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [39]:
# topic modeling
# corpus, dictionary and number of topics required for LDA
# alpha and eta are hyperparameters that affect sparsity of the topics
# chunksize is the number of documents to be used in each training chunk
# update_every determines how often the model parameters should be updated
# passes is the total number of training passes
# Print the Keyword in the 10 topics

In [40]:
print(lda_model.print_topics())# The weights reflect how important a keyword is to that topic.

[(0, '0.330*"cost" + 0.103*"collect" + 0.066*"ofthe" + 0.042*"upcoming" + 0.004*"tour" + 0.000*"rate" + 0.000*"power" + 0.000*"utility" + 0.000*"bill" + 0.000*"increase"'), (1, '0.395*"file" + 0.091*"box" + 0.085*"anything" + 0.067*"storage" + 0.050*"pending" + 0.015*"attendance" + 0.001*"asks" + 0.000*"doc" + 0.000*"pg" + 0.000*"commission"'), (2, '0.246*"find" + 0.165*"next" + 0.093*"tuesday" + 0.054*"jeffrey" + 0.051*"david" + 0.048*"west" + 0.034*"structure" + 0.022*"course" + 0.020*"learn" + 0.016*"season"'), (3, '0.582*"mail" + 0.082*"weather" + 0.021*"congratulation" + 0.017*"lisa" + 0.014*"whatever" + 0.005*"age" + 0.005*"promote" + 0.004*"carefully" + 0.004*"junk" + 0.002*"scroll"'), (4, '0.285*"thanks" + 0.133*"march" + 0.111*"september" + 0.079*"already" + 0.033*"drop" + 0.017*"eric" + 0.000*"doc" + 0.000*"pg" + 0.000*"energy" + 0.000*"issue"'), (5, '0.216*"future" + 0.177*"tomorrow" + 0.055*"brown" + 0.028*"cancel" + 0.018*"corner" + 0.013*"shortly" + 0.010*"stacey" + 0.003

In [41]:
doc_lda = lda_model[corpus]

In [42]:

# Model perplexity and topic coherence provide a convenient
# measure to judge how good a given topic model is.
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -23.06783847207837


In [43]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.37838199112045456


In [46]:
# Visualize the topics
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

In [47]:
pyLDAvis.display(vis)

In [53]:
# now using mallet
from gensim.test.utils import common_corpus, common_dictionary
from gensim.models.wrappers import LdaMallet
ldamallet = gensim.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)


ModuleNotFoundError: No module named 'gensim.models.wrappers'

In [None]:
# Show Topics
print(ldamallet.show_topics(formatted=False))

In [None]:
# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

In [None]:
# # Visualize the topics with mallet model
# pyLDAvis.enable_notebook(sort=True)
# vis = pyLDAvis.gensim.prepare(ldamallet, corpus, id2word)
# pyLDAvis.display(vis)

In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """
    Compute c_v coherence for various number of topics

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=num_topics, id2word=id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# run
model_list, coherence_values = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=data_lemmatized, start=2, limit=40, step=6)

In [None]:

# Show graph
limit=40; start=2; step=6;
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Print the coherence scores
for m, cv in zip(x, coherence_values):
    print("Num Topics =", m, " has Coherence Value of", round(cv, 4))

In [None]:
# Select the model and print the topics
optimal_model = model_list[4]
model_topics = optimal_model.show_topics(formatted=False)
print(optimal_model.print_topics(num_words=10))

In [None]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [None]:

df_topic_sents_keywords = format_topics_sentences(ldamodel=optimal_model, corpus=corpus, texts=data)

In [None]:

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

In [None]:
# Show
df_dominant_topic.head(10)

In [None]:
df_dominant_topic.Keywords.iloc[1]

In [None]:
df_dominant_topic.Text.iloc[1]

In [None]:
# Group top 5 sentences under each topic
sent_topics_sorteddf_mallet = pd.DataFrame()

sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')

for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, 
                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], 
                                            axis=0)

In [None]:
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)

In [None]:
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]

In [45]:
# Show
# sent_topics_sorteddf_mallet

NameError: name 'sent_topics_sorteddf_mallet' is not defined