In [27]:
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme to all plots drawn in this session.
sns.set() 

import logging
import warnings

# Gensim
import gensim
import gensim.corpora as corpora

# Let pandas render up to 500 columns of a wide DataFrame before truncating
# (this option limits the number of columns shown, not the cell width).
pd.options.display.max_columns = 500

%matplotlib inline
# Silence DeprecationWarning noise from dependencies.
warnings.filterwarnings(action="ignore", category=DeprecationWarning)
# Only surface ERROR-level (and above) log records, with a timestamped format.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

from nltk.corpus import stopwords

# NLTK's English stop words followed by extra corpus-specific filler terms.
# NOTE: 'even' is listed twice; the duplicate is kept so the list is unchanged.
stop_words = [
    *stopwords.words('english'),
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_',
    'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some',
    'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack',
    'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even',
    'also', 'may', 'take', 'come', 'com', 'http', 'mail', 'pm',
]

import pickle
import pyLDAvis
import pyLDAvis.gensim_models

### Load data

In [28]:
# Load the pre-cleaned, tokenised documents. The context manager closes the
# file handle as soon as unpickling finishes.
# NOTE: pickle can execute arbitrary code; only load files this project wrote.
with open("data/clean_words_sample.pickle", "rb") as clean_words_file:
    data_ready = pickle.load(clean_words_file)

### LDA

In [29]:
# Vocabulary built from the cleaned documents (token <-> integer id).
id2word = corpora.Dictionary(data_ready)

# Bag-of-words corpus: one doc2bow vector per document.
corpus = list(map(id2word.doc2bow, data_ready))

# Fit the LDA topic model.
lda_model = gensim.models.ldamodel.LdaModel(
    # inputs
    corpus=corpus,
    id2word=id2word,
    # model shape
    num_topics=5,
    alpha="auto",
    # training schedule
    passes=60,
    iterations=100,
    chunksize=100,
    update_every=1,
    # fixed seed so repeated runs use the same random state
    random_state=100,
    # format_topics_sentences checks this flag when indexing the model
    per_word_topics=True,
)

# Show the keywords of every topic.
print(lda_model.print_topics())


[(0, '0.010*"amazon" + 0.009*"schedule" + 0.006*"message" + 0.006*"feedback" + 0.005*"order" + 0.005*"list" + 0.005*"offer" + 0.005*"final" + 0.005*"click" + 0.005*"training"'), (1, '0.006*"rule" + 0.005*"standard" + 0.005*"gisb" + 0.005*"transaction" + 0.005*"member" + 0.005*"service" + 0.004*"change" + 0.004*"rto" + 0.004*"reliability" + 0.004*"program"'), (2, '0.015*"customer" + 0.011*"market" + 0.011*"margin" + 0.011*"credit" + 0.009*"letter" + 0.009*"day" + 0.008*"value" + 0.008*"issue" + 0.007*"price" + 0.006*"security"'), (3, '0.014*"stock" + 0.012*"company" + 0.009*"market" + 0.009*"year" + 0.008*"price" + 0.006*"energy" + 0.006*"percent" + 0.005*"share" + 0.005*"investor" + 0.005*"business"'), (4, '0.127*"font" + 0.074*"size" + 0.039*"align" + 0.026*"br" + 0.023*"nbsp" + 0.021*"tr" + 0.010*"name" + 0.009*"venturewire" + 0.007*"gif_border" + 0.007*"color"')]


In [30]:
# Persist the fitted model. The context manager flushes and closes the file,
# so the pickle is complete on disk before any later cell reads it back.
with open("data/lda_model_sample.pickle", "wb") as model_file:
    pickle.dump(lda_model, model_file)

### Dominant topic per document


In [60]:
# Reload the sample data and the persisted LDA model, closing each file
# handle as soon as it has been read.
# NOTE: pickle can execute arbitrary code; only load files this project wrote.
with open("data/data_sample.pickle", "rb") as data_file:
    data = pickle.load(data_file)
with open("data/lda_model_sample.pickle", "rb") as model_file:
    lda_model = pickle.load(model_file)


In [62]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Tabulate the dominant topic of every document next to its text.

    Parameters
    ----------
    ldamodel
        Fitted topic model. It must support ``ldamodel[corpus]``, expose a
        ``per_word_topics`` attribute and provide ``show_topic(topic_id)``.
    corpus
        Bag-of-words corpus to score. Defaults to the module-level ``corpus``
        as it was when this function was defined.
    texts
        Per-document content placed beside the topic columns. Defaults to the
        module-level ``data`` as it was when this function was defined.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution``, ``Topic_Keywords``
        plus one unnamed column (label ``0``) holding ``texts``.

    Raises
    ------
    TypeError
        If ``ldamodel`` is None.

    Notes
    -----
    A document for which the model returns no topics contributes no row, so
    the rows after it shift up relative to ``texts``.
    """
    if ldamodel is None:
        raise TypeError("format_topics_sentences() requires a fitted 'ldamodel'")

    column_names = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
    records = []
    keywords_by_topic = {}  # topic id -> keyword string, computed once per topic

    for row_list in ldamodel[corpus]:
        # With per_word_topics enabled each item is a tuple whose first element
        # is the (topic, probability) list; otherwise the item is that list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            continue

        # Dominant topic = highest probability; ties keep the first occurrence.
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])

        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )

        records.append(
            (int(topic_num), round(prop_topic, 4), keywords_by_topic[topic_num])
        )

    # Build the frame once instead of appending row by row: DataFrame.append
    # was removed in pandas 2.0, and this also yields a correctly labelled
    # empty frame when the corpus has no documents.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    contents = pd.Series(texts)
    return pd.concat([sent_topics_df, contents], axis=1)


In [63]:
# Dominant topic, its contribution and keywords for every cleaned document.
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, data_ready)

# Turn the positional index into a document-number column and give all five
# columns readable names.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = (
    ["Document_No", "Dominant_Topic", "Topic_Perc_Contrib"] + ["Keywords", "Text"]
)
df_dominant_topic.head(2)


  sent_topics_df = sent_topics_df.append(
  sent_topics_df = sent_topics_df.append(


Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,4,0.9999,"font, size, align, br, nbsp, tr, name, venture...","[title, gas, index, title, body, align, center..."
1,1,0,0.9973,"amazon, schedule, message, feedback, order, li...","[start, date, hourahead, hour, ancillary, sche..."


In [64]:
# Load the sampled emails that get joined with the per-document topic table.
email_df = pd.read_csv("data/sample.csv")
# Preview the first two records.
email_df.head(2)

Unnamed: 0,id_mail,date,from,to,subject,body,x_origin
0,<11784386.1075862626536.JavaMail.evans@thyme>,2001-11-19 10:00:05-08:00,feedback@intcx.com,gasindex@list.intcx.com,Gas Indices,\n ...,Bass-E
1,<23529076.1075841026944.JavaMail.evans@thyme>,2001-04-07 02:41:00-07:00,pete.davis@enron.com,pete.davis@enron.com,Start Date: 4/7/01; HourAhead hour: 10; <CODE...,Start Date: 4/7/01; HourAhead hour: 10; No an...,LINDER-E


In [65]:
# Show only the first email record.
email_df.head(1)

Unnamed: 0,id_mail,date,from,to,subject,body,x_origin
0,<11784386.1075862626536.JavaMail.evans@thyme>,2001-11-19 10:00:05-08:00,feedback@intcx.com,gasindex@list.intcx.com,Gas Indices,\n ...,Bass-E


In [66]:
# (rows, columns) of the email table; compare the row count with df_dominant_topic.
email_df.shape

(100, 7)

In [67]:
# (rows, columns) of the topic table; compare the row count with email_df.
df_dominant_topic.shape

(100, 5)

In [68]:
# A column-wise concat aligns on the index and pads with NaN when the two
# frames differ in length, so fail loudly instead of writing a skewed table.
assert len(email_df) == len(df_dominant_topic), (
    "email_df and df_dominant_topic must have the same number of rows"
)
# Place the topic columns next to the email columns, row for row.
test = pd.concat([email_df, df_dominant_topic], axis=1)

In [70]:
# Export the combined email + topic table to CSV without the index column.
test.to_csv("data/streamlit_df.csv", index=False)

In [18]:
# For every dominant topic keep the single document with the highest
# contribution from that topic, i.e. its most representative document.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby("Dominant_Topic")

# Collect the top row of each group and concatenate once, rather than growing
# a DataFrame inside the loop; then renumber the rows from zero.
sent_topics_sorteddf_mallet = pd.concat(
    [
        grp.sort_values(["Perc_Contribution"], ascending=False).head(1)
        for _, grp in sent_topics_outdf_grpd
    ],
    axis=0,
).reset_index(drop=True)

# Readable column names for display.
sent_topics_sorteddf_mallet.columns = [
    "Topic_Num",
    "Topic_Perc_Contrib",
    "Keywords",
    "Representative Text",
]

# Show
sent_topics_sorteddf_mallet.head()


Unnamed: 0,Topic_Num,Topic_Perc_Contrib,Keywords,Representative Text
0,0,0.9998,"way, request, meeting, schedule, fare, call, c...","[click_save, update, freedom, travel, week, in..."
1,1,0.9992,"deal, sell, send, information, find, time, fol...","[shawn, donlin, fw, performance, confirm, emai..."
2,2,0.9994,"xms, time, price, send, conference, report, va...","[freak, gas, price, kill, way, high, price, pr..."
3,3,0.9999,"session, week, end, order, contract, trading, ...","[member, member, firm, executive, session, rul..."
4,4,0.9999,"account, time, financial, access, check, inves...","[edition, banking, essential, investment, acco..."


### Combine DataFrames

In [19]:
# Re-read the sampled emails from the same CSV path as the earlier load cell.
# NOTE(review): this cell has a lower execution count than the cells above it
# and its saved output shows different rows, so the output looks stale - confirm
# by re-running the notebook top to bottom.
email_df = pd.read_csv("data/sample.csv")
email_df.head(1)

Unnamed: 0.1,Unnamed: 0,id_mail,date,from,to,subject,body,x_origin
0,387118,<20453266.1075858558875.JavaMail.evans@thyme>,2001-09-04 07:36:53-07:00,ryan.ruppert@exxonmobil.com,gerald.nemec@enron.com,Sup?,"G,\n\nHow was the trip to Longhorn Country? H...",Nemec-G


### Visualization (pyLDAvis)

In [None]:

# Render pyLDAvis output inline in the notebook.
pyLDAvis.enable_notebook()

# Cache location for the prepared visualisation data. A plain literal is
# enough here: there is only one path component, so nothing needs joining.
LDAvis_data_filepath = './results/ldavis_prepared_test'

# Set to False to skip the preparation step and reuse the cached file.
RECOMPUTE_LDAVIS = True

if RECOMPUTE_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)

# Always read back from the cache so both code paths yield the same object.
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

# Also export a standalone HTML copy next to the cache file.
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared