# Topic modelling LDA part
<br> Scratch notebook used to check which variables need to be saved. This Jupyter notebook contains all the code for the LDA model and works on the e-mails only.


In [1]:
import pickle
import warnings

import dill
import gensim
import gensim.corpora as corpora
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models  # the module 'gensim' has renamed to gensim_models
from gensim.models import CoherenceModel



In [None]:
# Create a small set of emails (10 000).
# Disabled on purpose; uncomment the three lines below to regenerate the reduced CSV.
# (Real comments are used instead of a bare triple-quoted string, which the cell would
# otherwise echo back as its output.)
# emails_read_all = pd.read_csv('./csv/emails_df.csv')
# enron = emails_read_all.head(10000)  # was `.head[:10000]`, which subscripts the method and raises TypeError
# enron.to_csv('./csv/emails_df_10000.csv', index=False)

In [7]:
# Read the e-mail table that the rest of the notebook works on.
# (Alternative input kept for reference: './csv/emails_df.csv')
emails_csv = './emails.csv'
enron = pd.read_csv(emails_csv)

## Load all the necessary files


In [8]:

# NOTE: pickle and dill can execute arbitrary code while loading;
# only open files that were produced by a trusted run of this project.
with open("./calc_data/data_lemmatized.pkl", 'rb') as lemma_file:
    data_lemmatized = pickle.load(lemma_file)

with open("./calc_data/data.pkl", 'rb') as data_file:
    data = pickle.load(data_file)

# corpora is stored as a dill file (a module cannot be pickled).
# This assignment rebinds the name `corpora` that the import cell bound to gensim.corpora.
with open("./calc_data/corpora.dill", 'rb') as corpora_file:
    corpora = dill.load(corpora_file)


In [9]:
# TAKE A SAMPLE OF DATA
# Keep only the first SAMPLE_SIZE records of each input.
# Assumes the three inputs list the same e-mails in the same order — TODO confirm.
SAMPLE_SIZE = 1000

# .copy() makes the sample an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
enron = enron.iloc[:SAMPLE_SIZE].copy()
data_lemmatized = data_lemmatized[:SAMPLE_SIZE]
data = data[:SAMPLE_SIZE]


In [10]:
# Number of lemmatized documents currently held in memory.
len(data_lemmatized)

1000

In [11]:
# emails_lemmatized = pd.DataFrame(data_lemmatized, columns=['lemmatized_subject'])
# Attach each e-mail's lemmatized tokens as the column 'body_lem'.
# The Series gets a default 0..n-1 index and pandas aligns it with `enron` by index
# label, so this assumes `enron` still carries its default integer index — TODO confirm.
# assign() returns a new frame rather than writing into a slice of another frame,
# which avoids pandas' SettingWithCopyWarning.
new_column = pd.Series(data_lemmatized)
enron = enron.assign(body_lem=new_column)

In [12]:
# Show the lemmatized tokens of the document at index 1 as a quick sanity check.
print(data_lemmatized[1])

['travel', 'business', 'meeting', 'take', 'fun', 'trip', 'especially', 'prepare', 'presentation', 'suggest', 'hold', 'business', 'plan', 'meeting', 'take', 'trip', 'formal', 'business', 'meeting', 'even', 'try', 'get', 'honest', 'opinion', 'trip', 'even', 'desire', 'necessary', 'far', 'business', 'meeting', 'think', 'productive', 'try', 'stimulate', 'discussion', 'different', 'group', 'work', 'often', 'presenter', 'speak', 'other', 'quiet', 'wait', 'turn', 'meeting', 'well', 'hold', 'round', 'table', 'discussion', 'format', 'suggestion', 'go', 'play', 'golf', 'rent', 'boat', 'jet', 'ski', 'fly', 'somewhere', 'take', 'much', 'time']


In [13]:
# Build the two inputs that (LDA) topic modeling needs: a dictionary and a corpus.

# The lemmatized documents, kept under the name `texts` as well
texts = data_lemmatized

# Dictionary built from the lemmatized documents
id2word = corpora.Dictionary(texts)

# Corpus: one doc2bow() result (term/document frequency) per document
corpus = list(map(id2word.doc2bow, texts))

In [None]:
# Location of the MALLET launcher, relative to the notebook's working directory.
# NOTE(review): no other cell shown here reads this variable — presumably left over
# from a MALLET-based LDA experiment; confirm before removing.
mallet_path = './mallet-2.0.8/bin/mallet'

In [None]:
# Silence DeprecationWarning messages from here on.
# (`warnings` is imported together with the other modules in the import cell.)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [14]:
# Train the LDA model on the bag-of-words corpus.
# The settings are gathered in one dict so they can be read and tuned in a single place.
lda_settings = dict(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    random_state=100,  # fixed seed
    update_every=1,
    chunksize=100,
    passes=10,
    alpha='auto',
    per_word_topics=True,
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_settings)

In [15]:
# One-line summary of the trained model.
print(lda_model)

LdaModel(num_terms=7610, num_topics=3, decay=0.5, chunksize=100)


In [None]:
# topic modeling
# corpus, dictionary and number of topics are required for LDA
# alpha and eta are hyperparameters that affect the sparsity of the topics
# chunksize is the number of documents to be used in each training chunk
# update_every determines how often the model parameters should be updated
# passes is the total number of training passes
# Print the keywords of the topics (the model in this notebook is built with num_topics=3, not 10)

In [16]:
# Each entry pairs a topic id with its weighted top keywords.
# The weights reflect how important a keyword is to that topic.
topic_summaries = lda_model.print_topics()
print(topic_summaries)

[(0, '0.019*"buy" + 0.013*"http" + 0.008*"service" + 0.007*"downgrade" + 0.007*"gas" + 0.007*"free" + 0.007*"zdnet" + 0.007*"market" + 0.007*"cgi_zdnet" + 0.007*"make"'), (1, '0.132*"pm" + 0.045*"ect" + 0.019*"hou" + 0.018*"request" + 0.017*"ee" + 0.013*"d" + 0.013*"i" + 0.012*"image" + 0.011*"mail" + 0.010*"showtime"'), (2, '0.045*"image" + 0.009*"email" + 0.009*"phillip" + 0.008*"receive" + 0.007*"message" + 0.007*"get" + 0.006*"thank" + 0.006*"need" + 0.006*"week" + 0.006*"send"')]


In [17]:
# Apply the trained model to the whole corpus. Indexing the model with a corpus gives a
# gensim TransformedCorpus wrapper (its type is printed in a later cell).
# NOTE(review): `doc_lda` is not read by any other cell shown here.
doc_lda = lda_model[corpus]

In [None]:
# Model perplexity and topic coherence provide a convenient
# measure to judge how good a given topic model is.
# Compute Perplexity
# NOTE(review): gensim documents log_perplexity() as returning the per-word likelihood
# bound rather than the perplexity itself (perplexity = 2 ** -bound). If so, the printed
# value is negative and a value closer to zero is the better one — confirm against the
# gensim documentation before comparing models on this number.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # per-word likelihood bound; higher bound = lower perplexity (see note above)

In [None]:
# Topic coherence ('c_v' measure) of the trained model, scored against the lemmatized texts.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [None]:
# Visualize the topics
# enable_notebook() switches pyLDAvis to inline notebook rendering; prepare() builds the
# visualisation data from the model, the corpus and the dictionary.
# NOTE(review): confirm that `sort` is a keyword enable_notebook() actually accepts —
# topic ordering is normally controlled via prepare(..., sort_topics=...).
pyLDAvis.enable_notebook(sort=True)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)

In [None]:
# Print what show_topic() returns for topic 0 (the same call is used later to build keyword strings).
print(lda_model.show_topic(0))

In [None]:
# Render the prepared pyLDAvis visualisation inside the notebook.
pyLDAvis.display(vis)

## How to get top emails per topic?
There are actually two cases:
<br> a. The most representative email for a given topic
<br> b. The dominant topic for a given email


In [18]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data):
    """Build a table holding the dominant topic of every document.

    Parameters
    ----------
    ldamodel
        Trained topic model. It must support ``ldamodel[corpus]`` (iterating over
        the per-document results) and ``ldamodel.show_topic(topic_id)``.
    corpus
        Corpus the model is applied to.
    texts
        The original texts, one per document; appended as the last column.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic`` (integer topic
        id), ``Perc_Contribution`` (that topic's weight rounded to 4 decimals),
        ``Topic_Keywords`` (comma-separated words from ``show_topic``) and a
        final column labelled ``0`` that holds ``texts``.

    Notes
    -----
    The default arguments are bound to the notebook globals when this cell runs.
    A document for which the model reports no topic gets an empty (missing-value)
    row, so the rows stay aligned with ``texts``.
    """
    column_names = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']
    keywords_by_topic = {}  # topic id -> keyword string; avoids repeated show_topic() calls
    records = []

    for row in ldamodel[corpus]:
        # A model built with per_word_topics=True yields a tuple whose first element is
        # the (topic_id, probability) list; otherwise the row already is that list.
        topic_dist = row[0] if isinstance(row, tuple) else row

        if not topic_dist:
            records.append((None, None, None))
            continue

        # max() returns the first entry with the highest probability, which is the same
        # entry a stable descending sort would put first.
        topic_num, prop_topic = max(topic_dist, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append((int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num]))

    # Build the frame once; DataFrame.append was removed in pandas 2.0 and growing a
    # frame row by row is quadratic.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

In [19]:
# Compare the model output with the input corpus: type and number of entries of each.
transformed_corpus = lda_model[corpus]
print(type(transformed_corpus))
print(len(transformed_corpus))
print("corpus type:", type(corpus))
print("corpus len", len(corpus))

<class 'gensim.interfaces.TransformedCorpus'>
1000
corpus type: <class 'list'>
corpus len 1000


In [20]:
# Build the dominant-topic table for the sampled corpus; `data` supplies the original texts.
df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data)

In [45]:
# Tag rows with a 3-character code by position: four consecutive blocks of 250 rows
# receive "000", "001", "010" and "011".
# The column is created first when it is missing, so the cell also runs on a fresh kernel.
if "binary" not in df_topic_sents_keywords.columns:
    df_topic_sents_keywords["binary"] = None

# Write through a single .iloc call on the frame itself; chained assignment
# (df[col].iloc[...] = value) may end up modifying a temporary copy.
binary_pos = df_topic_sents_keywords.columns.get_loc("binary")
for block_no, code in enumerate(["000", "001", "010", "011"]):
    df_topic_sents_keywords.iloc[block_no * 250:(block_no + 1) * 250, binary_pos] = code

# Distinct keyword strings among the rows tagged "000".
df_topic_sents_keywords.loc[df_topic_sents_keywords["binary"] == "000", "Topic_Keywords"].unique()

array(['buy, http, service, downgrade, gas, free, zdnet, market, cgi_zdnet, make',
       'image, email, phillip, receive, message, get, thank, need, week, send',
       'pm, ect, hou, request, ee, d, i, image, mail, showtime'],
      dtype=object)

In [None]:
# Persist the dominant-topic table.
# The previous version of this cell dumped `data_lemmatized` into this file, which did
# not match the file name; the frame the file is named after is written instead.
with open('./csv/df_topic_sents_keywords.pkl', 'wb') as my_pickle:
    pickle.dump(df_topic_sents_keywords, my_pickle)

In [29]:

# Export the dominant-topic table as CSV. index=False is not passed, so the row index
# is written as the first column.
df_topic_sents_keywords.to_csv("./sample.csv")

In [None]:
# How many documents have each topic as their dominant topic.
df_topic_sents_keywords['Dominant_Topic'].value_counts()

In [None]:
# NOTE: the variable is called topic_9, but the filter selects Dominant_Topic == 0.
# Rows of that topic ordered by descending Perc_Contribution; the text column
# (labelled 0) is renamed to 'Text'.
is_topic_zero = df_topic_sents_keywords['Dominant_Topic'] == 0
topic_9 = (
    df_topic_sents_keywords[is_topic_zero]
    .sort_values(by=['Perc_Contribution'], ascending=False)
    .rename(columns={0: 'Text'})
)
topic_9["Topic_Keywords"].unique()

In [None]:
# Texts of the first ten rows of topic_9; display the one at position 3.
messages = topic_9['Text'].iloc[:10].tolist()
messages[3]


In [83]:
# Load a previously exported result table and preview its first rows.
# NOTE(review): './result.csv' is not written by any cell shown here — confirm where it comes from.
sample = pd.read_csv('./result.csv')
sample.head()

Unnamed: 0,Lemmatized_Text,ID,Subject,enron_content,From,To,Cc,Bcc,Topic_hier,Perc_list,Dominant_Topic,Perc_Contribution,Topic_Keywords,0
0,"['forward', 'shockwave', 'team', 'net', 'respo...",<20615179.1075855692774.JavaMail.evans@thyme>,Stick it in your Shockmachine!,---------------------- Forwarded by Phillip K ...,frozenset({'phillip.allen@enron.com'}),frozenset({'pallen70@hotmail.com'}),,,"[0, 0, 0, 0, 0, 0, 1]","[0.5615000128746033, 0.5432999730110168, 0.856...",1,0.7768,"heraldnet, man, say, get, new, story, year, go...","['forward', 'shockwave', 'team', 'net', 'respo..."
1,"['money', 'silent', 'second', 'regular', 'mail...",<1199774.1075855724555.JavaMail.evans@thyme>,Re: FW: 2nd lien info. and private lien info -...,How am I to send them the money for the silent...,frozenset({'phillip.allen@enron.com'}),frozenset({'jsmith@austintx.com'}),,,"[0, 0, 0, 0, 0, 0, 1]","[0.9959999918937683, 0.6647999882698059, 0.573...",1,0.5495,"heraldnet, man, say, get, new, story, year, go...","['money', 'silent', 'second', 'regular', 'mail..."
2,"['cary', 'picture', 'house', 'mind', 'go', 'si...",<8520345.1075855725198.JavaMail.evans@thyme>,,"Cary,\n\nHere is the picture of the house I ha...",frozenset({'phillip.allen@enron.com'}),frozenset({'scfatkfa@caprock.net'}),,,"[0, 0, 0, 0, 0, 0, 0]","[0.892799973487854, 0.5792999863624573, 0.5759...",0,0.5081,"say, go, people, get, make, year, think, time,...","['cary', 'picture', 'house', 'mind', 'go', 'si..."
3,"['cary', 'picture', 'house', 'mind', 'go', 'si...",<28589107.1075855725265.JavaMail.evans@thyme>,,"Cary,\n\nHere is the picture of the house I ha...",frozenset({'phillip.allen@enron.com'}),frozenset({'scsatkfa@caprock.net'}),,,"[0, 0, 0, 0, 0, 0, 0]","[0.892799973487854, 0.5792999863624573, 0.5759...",0,0.5081,"say, go, people, get, make, year, think, time,...","['cary', 'picture', 'house', 'mind', 'go', 'si..."
4,"['thank', 'quick', 'response', 'bid', 'residen...",<24048786.1075855725309.JavaMail.evans@thyme>,,"Reagan,\n\nThank you for the quick response on...",frozenset({'phillip.allen@enron.com'}),frozenset({'rlehmann@yahoo.com'}),,,"[0, 0, 0, 0, 0, 0, 1]","[0.7971000075340271, 0.6256999969482422, 0.550...",1,0.9988,"heraldnet, man, say, get, new, story, year, go...","['thank', 'quick', 'response', 'bid', 'residen..."


In [91]:
# Keyword strings of the rows whose Topic_hier text begins with '[0';
# show the distinct value at position 10.
# na=False makes rows without a Topic_hier value count as non-matching.
starts_with_zero = sample["Topic_hier"].str.startswith('[0', na=False)
sample.loc[starts_with_zero, "Topic_Keywords"].unique()[10]

'image, say, company, stock, year, market, earning, last, time, price'

In [85]:
 # New frame with every column of `sample` except 'Dominant_Topic'; `sample` itself is unchanged.
 df2 = sample.drop(columns=['Dominant_Topic'])

In [86]:
# Display the frame (the rendered output below is abbreviated with "..." rows).
df2

Unnamed: 0,Lemmatized_Text,ID,Subject,enron_content,From,To,Cc,Bcc,Topic_hier,Perc_list,Perc_Contribution,Topic_Keywords,0
0,"['forward', 'shockwave', 'team', 'net', 'respo...",<20615179.1075855692774.JavaMail.evans@thyme>,Stick it in your Shockmachine!,---------------------- Forwarded by Phillip K ...,frozenset({'phillip.allen@enron.com'}),frozenset({'pallen70@hotmail.com'}),,,"[0, 0, 0, 0, 0, 0, 1]","[0.5615000128746033, 0.5432999730110168, 0.856...",0.7768,"heraldnet, man, say, get, new, story, year, go...","['forward', 'shockwave', 'team', 'net', 'respo..."
1,"['money', 'silent', 'second', 'regular', 'mail...",<1199774.1075855724555.JavaMail.evans@thyme>,Re: FW: 2nd lien info. and private lien info -...,How am I to send them the money for the silent...,frozenset({'phillip.allen@enron.com'}),frozenset({'jsmith@austintx.com'}),,,"[0, 0, 0, 0, 0, 0, 1]","[0.9959999918937683, 0.6647999882698059, 0.573...",0.5495,"heraldnet, man, say, get, new, story, year, go...","['money', 'silent', 'second', 'regular', 'mail..."
2,"['cary', 'picture', 'house', 'mind', 'go', 'si...",<8520345.1075855725198.JavaMail.evans@thyme>,,"Cary,\n\nHere is the picture of the house I ha...",frozenset({'phillip.allen@enron.com'}),frozenset({'scfatkfa@caprock.net'}),,,"[0, 0, 0, 0, 0, 0, 0]","[0.892799973487854, 0.5792999863624573, 0.5759...",0.5081,"say, go, people, get, make, year, think, time,...","['cary', 'picture', 'house', 'mind', 'go', 'si..."
3,"['cary', 'picture', 'house', 'mind', 'go', 'si...",<28589107.1075855725265.JavaMail.evans@thyme>,,"Cary,\n\nHere is the picture of the house I ha...",frozenset({'phillip.allen@enron.com'}),frozenset({'scsatkfa@caprock.net'}),,,"[0, 0, 0, 0, 0, 0, 0]","[0.892799973487854, 0.5792999863624573, 0.5759...",0.5081,"say, go, people, get, make, year, think, time,...","['cary', 'picture', 'house', 'mind', 'go', 'si..."
4,"['thank', 'quick', 'response', 'bid', 'residen...",<24048786.1075855725309.JavaMail.evans@thyme>,,"Reagan,\n\nThank you for the quick response on...",frozenset({'phillip.allen@enron.com'}),frozenset({'rlehmann@yahoo.com'}),,,"[0, 0, 0, 0, 0, 0, 1]","[0.7971000075340271, 0.6256999969482422, 0.550...",0.9988,"heraldnet, man, say, get, new, story, year, go...","['thank', 'quick', 'response', 'bid', 'residen..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,"['head', 'title', 'mail', 'type', 'content', '...",,,,,,,,"[1, 1, 1, 1, 1, 1, 1]","[0.9721999764442444, 0.6927000284194946, 1.0, ...",1.0000,"font, size, br, tr, img_src, http, image, gif,...","['head', 'title', 'mail', 'type', 'content', '..."
29996,"['sorry', 'look', 'problem', 'view', 'email', ...",,,,,,,,"[1, 1, 1, 1, 1, 1, 1]","[0.9538000226020813, 0.7264999747276306, 0.994...",1.0000,"font, size, br, tr, img_src, http, image, gif,...","['sorry', 'look', 'problem', 'view', 'email', ..."
29997,"['funjet_vacation', 'special', 'send', 'email'...",,,,,,,,"[1, 1, 1, 1, 1, 1, 1]","[0.9591000080108643, 0.703000009059906, 0.9968...",0.9834,"font, size, br, tr, img_src, http, image, gif,...","['funjet_vacation', 'special', 'send', 'email'..."
29998,"['hot', 'pick', 'come', 'visit', 'body', 'tabl...",,,,,,,,"[1, 1, 1, 1, 1, 1, 1]","[0.9179999828338623, 0.765500009059906, 1.0, 0...",1.0000,"font, size, br, tr, img_src, http, image, gif,...","['hot', 'pick', 'come', 'visit', 'body', 'tabl..."
