# Topic modelling LDA part
<br> Just to check what variables are needed to save. This jupyter notebook contains all the code for LDA and emails only


In [2]:
import pandas as pd
import pickle
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models  # the module 'gensim' has renamed to gensim_models
import dill

In [2]:
# Create a small set of emails (10 000)
"""emails_read_all = pd.read_csv('./csv/emails_df.csv')
enron = emails_read_all.head[:10000]
enron.to_csv('./csv/emails_df_10000.csv', index=False)"""

"emails_read_all = pd.read_csv('./csv/emails_df.csv')\nenron = emails_read_all.head[:10000]\nenron.to_csv('./csv/emails_df_10000.csv', index=False)"

In [4]:
#enron = pd.read_csv('./csv/emails_df.csv')
enron = pd.read_csv('./emails.csv')

#enron = pd.read_csv('./csv/emails_df.csv')

## Load all the necessary files


In [5]:

with open("./calc_data/data_lemmatized.pkl", 'rb') as my_pickle:
    data_lemmatized = pickle.load(my_pickle)

with open("./calc_data/data.pkl", 'rb') as my_pickle:
    data = pickle.load(my_pickle)

#corpora is a dill file (module cannot be pickled)
with open("./calc_data/corpora.dill", 'rb') as f:
            corpora = dill.load(f)


In [6]:
#TAKE A SAMPLE OF DATA

enron = enron[:1000]
data_lemmatized = data_lemmatized[:1000]
data = data[:1000]


In [7]:
len(data_lemmatized)

1000

In [8]:
#emails_lemmatized = pd.DataFrame(data_lemmatized, columns=['lemmatized_subject'])
len(data_lemmatized)
new_column = pd.Series(data_lemmatized)
enron['body_lem'] = new_column

In [9]:
print(data_lemmatized[1])

['travel', 'business', 'meeting', 'take', 'fun', 'trip', 'especially', 'prepare', 'presentation', 'suggest', 'hold', 'business', 'plan', 'meeting', 'take', 'trip', 'formal', 'business', 'meeting', 'even', 'try', 'get', 'honest', 'opinion', 'trip', 'even', 'desire', 'necessary', 'far', 'business', 'meeting', 'think', 'productive', 'try', 'stimulate', 'discussion', 'different', 'group', 'work', 'often', 'presenter', 'speak', 'other', 'quiet', 'wait', 'turn', 'meeting', 'well', 'hold', 'round', 'table', 'discussion', 'format', 'suggestion', 'go', 'play', 'golf', 'rent', 'boat', 'jet', 'ski', 'fly', 'somewhere', 'take', 'much', 'time']


In [10]:
# create dictionary and corpus both are needed for (LDA) topic modeling

# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

In [11]:
mallet_path = './mallet-2.0.8/bin/mallet'

In [12]:
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=3,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
print((lda_model))

LdaModel(num_terms=7610, num_topics=3, decay=0.5, chunksize=100)


In [15]:
# topic modeling
# corpus, dictionary and number of topics required for LDA
# alpha and eta are hyperparameters that affect sparsity of the topics
# chunksize is the number of documents to be used in each training chunk
# update_every determines how often the model parameters should be updated
# passes is the total number of training passes
# Print the Keyword in the 10 topics

In [16]:
print(lda_model.print_topics())# The weights reflect how important a keyword is to that topic.

[(0, '0.019*"buy" + 0.013*"http" + 0.008*"service" + 0.007*"downgrade" + 0.007*"gas" + 0.007*"free" + 0.007*"zdnet" + 0.007*"market" + 0.007*"cgi_zdnet" + 0.007*"make"'), (1, '0.132*"pm" + 0.045*"ect" + 0.019*"hou" + 0.018*"request" + 0.017*"ee" + 0.013*"d" + 0.013*"i" + 0.012*"image" + 0.011*"mail" + 0.010*"showtime"'), (2, '0.045*"image" + 0.009*"email" + 0.009*"phillip" + 0.008*"receive" + 0.007*"message" + 0.007*"get" + 0.006*"thank" + 0.006*"need" + 0.006*"week" + 0.006*"send"')]


In [17]:
doc_lda = lda_model[corpus]

In [18]:
# Model perplexity and topic coherence provide a convenient
# measure to judge how good a given topic model is.
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.


Perplexity:  -7.4032980835624445


In [19]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.3988310412532785


In [20]:
# Visualize the topics
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

  default_term_info = default_term_info.sort_values(


In [21]:
print(lda_model.show_topic(0))

[('buy', 0.019218009), ('http', 0.013064621), ('service', 0.007883714), ('downgrade', 0.0073966244), ('gas', 0.007349918), ('free', 0.006783965), ('zdnet', 0.0067439065), ('market', 0.006661999), ('cgi_zdnet', 0.0066446518), ('make', 0.0066061183)]


In [22]:
pyLDAvis.display(vis)

## How to get top emails per topic?
There are actually two cases:
<br> a. Best email to given topic
<br> b. Dominant topic for a given email


In [23]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    # Init output
    sent_topics_df = pd.DataFrame()


    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        #print("i: ", i)
        #print("row: ", row)
        #print("row[1]: ", row[1])
        row = sorted(
            row[0], key=lambda x: (x[1]), 
            reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

In [24]:
print(type(lda_model[corpus]))
print(len(lda_model[corpus]))
print("corpus type:", type(corpus))
print("corpus len", len(corpus))

<class 'gensim.interfaces.TransformedCorpus'>
1000
corpus type: <class 'list'>
corpus len 1000


In [61]:
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

df_topic_sents_keywords

Unnamed: 0,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,0.0,0.7212,"buy, http, service, downgrade, gas, free, zdne...",Here is our forecast\n\n
1,2.0,0.5724,"image, email, phillip, receive, message, get, ...",Traveling to have a business meeting takes the...
2,2.0,0.9468,"image, email, phillip, receive, message, get, ...",test successful. way to go!!!
3,2.0,0.9789,"image, email, phillip, receive, message, get, ...","Randy,\n\n Can you send me a schedule of the s..."
4,2.0,0.9057,"image, email, phillip, receive, message, get, ...",Let's shoot for Tuesday at 11:45.
...,...,...,...,...
995,2.0,0.9869,"image, email, phillip, receive, message, get, ...",There are three other deals that I will fax to...
996,2.0,0.9531,"image, email, phillip, receive, message, get, ...","\nPhillip,\n \nCould you please do me a favor?..."
997,2.0,0.8394,"image, email, phillip, receive, message, get, ...","Phillip,\n\nPursuant to your request, please s..."
998,2.0,0.9892,"image, email, phillip, receive, message, get, ...","<<3MMP10!.DOC>>\nPhillip,\n\nEnclosed please ..."


In [26]:
with open(f'./csv/df_topic_sents_keywords.pkl', 'wb') as my_pickle:
                pickle.dump(data_lemmatized, my_pickle)

FileNotFoundError: [Errno 2] No such file or directory: './csv/df_topic_sents_keywords.pkl'

In [33]:

df_topic_sents_keywords.to_csv("./sample.csv")

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


In [62]:
df_topic_sents_keywords['Dominant_Topic'].value_counts()

2.0    656
0.0    179
1.0    165
Name: Dominant_Topic, dtype: int64

In [68]:
topic_9 = df_topic_sents_keywords[df_topic_sents_keywords['Dominant_Topic'] == 0].sort_values(by=['Perc_Contribution'], ascending=False)
topic_9.head()
topic_9.rename(columns={0: 'Text'}, inplace=True)
topic_9["Topic_Keywords"].unique()

array(['buy, http, service, downgrade, gas, free, zdnet, market, cgi_zdnet, make'],
      dtype=object)

In [60]:
messages = topic_9[0:10]['Text'].to_list()
messages[3]


'Dear Salomon Smith Barney Client:\n\nYour Salomon Smith Barney trade confirmation(s) has been delivered to Salomon Smith Barney Access for online viewing. To view your trade confirmation(s) online, click on the link below. You will be required to enter your Salomon Smith Barney Access User Name and Password.\n\nhttps://www.salomonsmithbarney.com/cgi-bin/edelivery/econfirm.pl?47d10745b585f445e58545143323030313\n\nNote: If you cannot access your confirmation through the link provided in this e-mail, "cut and paste" or type the full URL into your browser. You can also choose to view your confirmations directly from your Salomon Smith Barney Access Portfolio page by clicking the Portfolio tab and selecting "Confirms".\n\nAny prospectuses related to these trade confirmations will be sent under separate cover. If you opted to receive your prospectuses online, you will receive an e-mail notice when they are available for online viewing.\n\nIf you are experiencing difficulty when viewing your