In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# reading the pickle file which contains cleaned data
email_data = pd.read_pickle('corpus.pkl')
email_data.head()

Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,nice dinner probably knowanyone else anytime w...
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point peter start draft overri...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,apology schedule melted talked monday swhere f...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached limit last week uk trade...
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",problem comment dale_rasmussen ectmann corp en...


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
stop_words.extend(['aa','ab','ac','yo','yr','yf','wr','u'])

cv=CountVectorizer(stop_words=stop_words,max_df=0.95,min_df=25)
email_cv=cv.fit_transform(email_data.email)
email_dtm = pd.DataFrame(email_cv.toarray(), columns=cv.get_feature_names())


In [4]:
email_dtm

Unnamed: 0,aaron,ability,able,absence,absolutely,academic,accept,acceptable,accepted,accepting,...,yesterday,yet,yield,york,youare,youcan,youhave,young,youwill,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10343,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from gensim import matutils,models
import scipy.sparse
import gensim.corpora as corpora

data_cleaned = email_data.email.apply(lambda x: x.split(' '))

In [6]:
data_cleaned[1]

['absolutely',
 'good',
 'point',
 'peter',
 'start',
 'draft',
 'override_letter',
 'kay_mann',
 'enron',
 'development',
 'enron',
 'development',
 'john',
 'grigby',
 'enron',
 'development',
 'enron',
 'developmentas',
 'reminder',
 'u',
 'need',
 'form',
 'override_letter',
 'go',
 'withthe',
 'form',
 'turbine',
 'contract',
 'kay']

In [7]:
# Create the vocabulary dictionary
id2word = corpora.Dictionary(data_cleaned)

# Create the gensim corpus
corpus = [id2word.doc2bow(text) for text in data_cleaned]

In [8]:
# LDA model
lda = models.LdaModel(corpus=corpus, num_topics=15, random_state=100, id2word=id2word, passes=10)
lda.print_topics()

[(0,
  '0.023*"updated" + 0.018*"yard" + 0.018*"game" + 0.018*"error" + 0.016*"borland_databaseengine" + 0.016*"initialize" + 0.016*"error_occurred" + 0.016*"while_attempting" + 0.014*"wr" + 0.013*"fantasy"'),
 (1,
  '0.020*"said" + 0.017*"state" + 0.013*"enron" + 0.013*"power" + 0.012*"california" + 0.011*"energy" + 0.008*"year" + 0.007*"utility" + 0.007*"company" + 0.006*"price"'),
 (2,
  '0.015*"market" + 0.014*"company" + 0.009*"power" + 0.007*"energy" + 0.007*"service" + 0.005*"new" + 0.005*"said" + 0.005*"area" + 0.005*"trading" + 0.005*"customer"'),
 (3,
  '0.024*"price" + 0.017*"mw" + 0.010*"gas" + 0.010*"na" + 0.009*"unit" + 0.009*"feb" + 0.008*"peak" + 0.008*"index" + 0.008*"mwh" + 0.007*"jan"'),
 (4,
  '0.013*"please" + 0.013*"know" + 0.012*"thanks" + 0.011*"original_message" + 0.010*"get" + 0.008*"need" + 0.007*"call" + 0.007*"time" + 0.007*"enron" + 0.006*"meeting"'),
 (5,
  '0.046*"each_way" + 0.022*"tx" + 0.007*"call" + 0.006*"enw" + 0.006*"network" + 0.005*"tax" + 0.005

In [9]:
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=data_cleaned, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.126947024549315

Coherence Score:  0.5960825197042026


In [10]:
import pyLDAvis
from pyLDAvis import gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Visualize the topics
plt.figure(figsize=(10,3))
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, id2word)
vis

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


<Figure size 720x216 with 0 Axes>

In [11]:
# LDA mallet model
import gensim
import os

  and should_run_async(code)


In [12]:
os.environ.update({'MALLET_HOME': r'C:/Users/nivey/new_mallet/mallet-2.0.8/'})
mallet_path = 'C:\\Users\\nivey\\new_mallet\\mallet-2.0.8\\bin\\mallet.bat'  # update this path

lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=14, id2word=id2word,random_seed=100)

  and should_run_async(code)


In [13]:
result = lda_mallet.show_topics(num_topics=17,formatted=True)
for each in result:
    print (each)

(0, '0.054*"mail" + 0.011*"enron" + 0.010*"john" + 0.010*"energy" + 0.008*"net" + 0.007*"org" + 0.007*"bill" + 0.007*"williams" + 0.007*"gov" + 0.006*"mike"')
(1, '0.024*"agreement" + 0.011*"attached" + 0.010*"comment" + 0.010*"draft" + 0.010*"credit" + 0.010*"doc" + 0.010*"error" + 0.010*"letter" + 0.009*"change" + 0.008*"legal"')
(2, '0.029*"power" + 0.020*"energy" + 0.016*"state" + 0.014*"california" + 0.011*"utility" + 0.009*"price" + 0.008*"market" + 0.008*"electricity" + 0.007*"plant" + 0.007*"cost"')
(3, '0.203*"ect" + 0.107*"hou" + 0.082*"enron" + 0.049*"corp" + 0.023*"development" + 0.018*"forwarded" + 0.011*"ees_ees" + 0.011*"enronxgate" + 0.010*"lon" + 0.008*"communication"')
(4, '0.020*"company" + 0.011*"market" + 0.009*"stock" + 0.009*"year" + 0.009*"trading" + 0.007*"share" + 0.007*"business" + 0.006*"financial" + 0.006*"week" + 0.005*"dynegy"')
(5, '0.013*"week" + 0.009*"game" + 0.008*"team" + 0.007*"updated" + 0.007*"travel" + 0.006*"each_way" + 0.006*"texas" + 0.006*"c

  and should_run_async(code)


In [14]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_mallet, texts=data_cleaned, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

  and should_run_async(code)



Coherence Score:  0.5808557164222214


In [15]:
email_df=pd.read_csv('email_df.csv')
email_df.head()

  and should_run_async(code)


Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,nice dinner probably knowanyone else anytime w...
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point peter start draft overri...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,apology schedule melted talked monday swhere f...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached limit last week uk trade...
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",problem comment dale_rasmussen ectmann corp en...


In [21]:
def match_dominant_topic(ldamodel=lda, corpus=corpus, texts=data_cleaned):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

  and should_run_async(code)


In [22]:
df_topic_sents_keywords = match_dominant_topic(ldamodel=lda, corpus=corpus, texts=data_cleaned)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

  and should_run_async(code)


In [23]:
email_df=pd.concat([email_df,df_dominant_topic],axis=1)

  and should_run_async(code)


In [24]:
email_df.head()

  and should_run_async(code)


Unnamed: 0,from,to,email,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text,Document_No.1,Dominant_Topic.1,Topic_Perc_Contrib.1,Keywords.1,Text.1
0,tana.jones@enron.com,alicia.goodrow@enron.com,nice dinner probably knowanyone else anytime w...,0,8.0,0.1321,"good, time, back, day, people, thing, make, gr...","[nice, dinner, probably, knowanyone, else, any...",0,4.0,0.5469,"please, know, thanks, original_message, get, n...","[nice, dinner, probably, knowanyone, else, any..."
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point peter start draft overri...,1,1.0,0.1596,"agreement, attached, comment, draft, credit, d...","[absolutely, good, point, peter, start, draft,...",1,4.0,0.5762,"please, know, thanks, original_message, get, n...","[absolutely, good, point, peter, start, draft,..."
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,apology schedule melted talked monday swhere f...,2,8.0,0.1985,"good, time, back, day, people, thing, make, gr...","[apology, schedule, melted, talked, monday, sw...",2,4.0,0.8319,"please, know, thanks, original_message, get, n...","[apology, schedule, melted, talked, monday, sw..."
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached limit last week uk trade...,3,6.0,0.4106,"deal, gas, day, price, contract, month, capaci...","[vince, uk, var, breached, limit, last, week, ...",3,2.0,0.4547,"market, company, power, energy, service, new, ...","[vince, uk, var, breached, limit, last, week, ..."
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",problem comment dale_rasmussen ectmann corp en...,4,1.0,0.3459,"agreement, attached, comment, draft, credit, d...","[problem, comment, dale_rasmussen, ectmann, co...",4,4.0,0.4055,"please, know, thanks, original_message, get, n...","[problem, comment, dale_rasmussen, ectmann, co..."


In [25]:
email_df.to_csv('email_df_final.csv',index=False)

  and should_run_async(code)
