In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [2]:
# reading the pickle file which contains cleaned data
email_data = pd.read_pickle('corpus.pkl')
email_data.head()

Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,nice dinner probably knowanyone else anytime w...
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point peter start draft overri...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,apology schedule melted talked monday swhere f...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached limit last week uk trade...
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",problem comment dale_rasmussen ectmann corp en...


In [3]:
# Count vectorizer to convert words into frequency matrix
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(max_df=0.95,min_df=25)
email_cv=cv.fit_transform(email_data.email)
email_dtm = pd.DataFrame(email_cv.toarray(), columns=cv.get_feature_names())

In [4]:
email_dtm

Unnamed: 0,aaron,ability,able,absence,absolutely,academic,accept,acceptable,accepted,accepting,...,yesterday,yet,yield,york,youare,youcan,youhave,young,youwill,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10343,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10344,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10345,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10346,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
from gensim import matutils,models
import scipy.sparse
import gensim.corpora as corpora

data_cleaned = email_data.email.apply(lambda x: x.split(' '))

In [6]:
data_cleaned[1]

['absolutely',
 'good',
 'point',
 'peter',
 'start',
 'draft',
 'override_letter',
 'kay_mann',
 'enron',
 'development',
 'enron',
 'development',
 'john',
 'grigby',
 'enron',
 'development',
 'enron',
 'developmentas',
 'reminder',
 'u',
 'need',
 'form',
 'override_letter',
 'go',
 'withthe',
 'form',
 'turbine',
 'contract',
 'kay']

In [7]:
# Create the vocabulary dictionary
id2word = corpora.Dictionary(data_cleaned)

# Create the gensim corpus
corpus = [id2word.doc2bow(text) for text in data_cleaned]

In [8]:
# LDA model
lda = models.LdaModel(corpus=corpus, num_topics=15, random_state=100, id2word=id2word, passes=10)
lda.print_topics()

[(0,
  '0.009*"faxed" + 0.005*"enovate" + 0.004*"ecn" + 0.004*"seabron_adamson" + 0.004*"attachments_thereto" + 0.004*"hp" + 0.003*"benjamin" + 0.003*"frontier_economics" + 0.003*"shuttle" + 0.003*"greg_brazaitis"'),
 (1,
  '0.013*"power" + 0.010*"market" + 0.010*"energy" + 0.009*"state" + 0.008*"price" + 0.007*"california" + 0.005*"gas" + 0.005*"cost" + 0.005*"year" + 0.005*"rate"'),
 (2,
  '0.021*"original_message" + 0.015*"know" + 0.012*"thanks" + 0.012*"get" + 0.011*"please" + 0.010*"message" + 0.008*"email" + 0.008*"time" + 0.008*"mail" + 0.008*"call"'),
 (3,
  '0.022*"please" + 0.014*"thanks" + 0.011*"enron" + 0.010*"need" + 0.010*"know" + 0.010*"deal" + 0.008*"attached" + 0.008*"question" + 0.007*"agreement" + 0.007*"review"'),
 (4,
  '0.040*"font" + 0.032*"font_face" + 0.030*"br" + 0.022*"td_tr" + 0.018*"table" + 0.016*"tr_td" + 0.015*"serif_size" + 0.013*"table_width" + 0.013*"size" + 0.012*"td_td"'),
 (5,
  '0.010*"information" + 0.008*"credit" + 0.006*"mail" + 0.006*"transac

In [9]:
from gensim.models import CoherenceModel
# Compute Perplexity
print('\nPerplexity: ', lda.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda, texts=data_cleaned, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -10.143202995101415

Coherence Score:  0.5857120421330922


In [10]:
# LDA visualaization
import pyLDAvis
from pyLDAvis import gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

# Visualize the topics
plt.figure(figsize=(10,3))
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda, corpus, id2word)
vis

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  EPS = np.finfo(np.float).eps


<Figure size 720x216 with 0 Axes>

In [11]:
email_df=pd.read_csv('email_df.csv')
email_df.head()

  and should_run_async(code)


Unnamed: 0,from,to,email
0,tana.jones@enron.com,alicia.goodrow@enron.com,nice dinner probably knowanyone else anytime w...
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point peter start draft overri...
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,apology schedule melted talked monday swhere f...
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached limit last week uk trade...
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",problem comment dale_rasmussen ectmann corp en...


In [12]:
def match_dominant_topic(ldamodel=lda, corpus=corpus, texts=data_cleaned):
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

  and should_run_async(code)


In [13]:
df_topic_sents_keywords = match_dominant_topic(ldamodel=lda, corpus=corpus, texts=data_cleaned)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

  and should_run_async(code)


In [14]:
email_df=pd.concat([email_df,df_dominant_topic],axis=1)

  and should_run_async(code)


In [15]:
email_df.head()

  and should_run_async(code)


Unnamed: 0,from,to,email,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,tana.jones@enron.com,alicia.goodrow@enron.com,nice dinner probably knowanyone else anytime w...,0,2.0,0.9333,"original_message, know, thanks, get, please, m...","[nice, dinner, probably, knowanyone, else, any..."
1,Sheila Tweed@ECT on 05/15/2001 06,Kay Mann/Corp/Enron@ENRON,absolutely good point peter start draft overri...,1,3.0,0.7429,"please, thanks, enron, need, know, deal, attac...","[absolutely, good, point, peter, start, draft,..."
2,jeff.dasovich@enron.com,christine.piesco@oracle.com,apology schedule melted talked monday swhere f...,2,2.0,0.6207,"original_message, know, thanks, get, please, m...","[apology, schedule, melted, talked, monday, sw..."
3,tanya.tamarchenko@enron.com,"Richard Lewis/LON/ECT@ECT, James New/LON/ECT@E...",vince uk var breached limit last week uk trade...,3,1.0,0.6694,"power, market, energy, state, price, californi...","[vince, uk, var, breached, limit, last, week, ..."
4,kay.mann@enron.com,"Don Hammond/PDX/ECT@ECT, Jody Blackburn/PDX/EC...",problem comment dale_rasmussen ectmann corp en...,4,3.0,0.6876,"please, thanks, enron, need, know, deal, attac...","[problem, comment, dale_rasmussen, ectmann, co..."


In [16]:
email_df.to_csv('email_df_final.csv',index=False)

  and should_run_async(code)
