In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

sns.set() 

import logging
import re
import warnings

# Gensim
import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import spacy
from dateutil import parser
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess

# Display setting to show more characters in column
pd.set_option('display.max_columns', 500)

%matplotlib inline
warnings.filterwarnings("ignore",category=DeprecationWarning)
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

from nltk.corpus import stopwords

# NLTK's English stop words plus extra terms that should never end up in a topic.
# The extra list is kept free of repeated entries ('even' used to appear twice).
stop_words = stopwords.words('english')
stop_words.extend([
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_',
    'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some',
    'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack',
    'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'also',
    'may', 'take', 'come', 'com', 'http', 'mail', 'pm',
])

import pickle
import pyLDAvis
import pyLDAvis.gensim_models

import pickle

### Load data

In [None]:
# Work on a random subset of the raw e-mails.
SAMPLE_SIZE = 5000  # upper bound on the number of e-mails analysed
SAMPLE_SEED = 100  # fixed seed so every run analyses the same e-mails

df = (
    pd.read_csv(r"data/raw_mail_all.csv")
    .pipe(
        lambda mails: mails.sample(
            # DataFrame.sample raises when n exceeds the number of rows
            n=min(SAMPLE_SIZE, len(mails)),
            random_state=SAMPLE_SEED,
        )
    )
    # sampling keeps the original row labels; renumber them 0..n-1
    .reset_index(drop=True)
)


### Cleaning dataset


date format


In [None]:
# Parse the date strings into datetimes. `infer_datetime_format` is deprecated
# since pandas 2.0 (format inference is the default there), so it is not passed.
df["date"] = pd.to_datetime(df["date"])


bag of words

In [None]:
def sent_to_words(sentences):
    """Yield one token list per input document.

    Each document is stripped of e-mail addresses, has every whitespace run
    (including newlines) collapsed to a single space and single quotes removed,
    and is then tokenised with ``gensim.utils.simple_preprocess(deacc=True)``.

    Parameters
    ----------
    sentences : iterable
        Raw documents. Missing values (``None`` / NaN) are treated as empty
        text; any other non-string value is converted with ``str``.

    Yields
    ------
    list of str
        The tokens of one document, in input order.
    """
    # Raw-string patterns, compiled once rather than on every document.
    email_re = re.compile(r"\S*@\S*\s?")
    whitespace_re = re.compile(r"\s+")

    for sent in sentences:
        if not isinstance(sent, str):
            # pandas stores a missing body as NaN, the only value that is not
            # equal to itself; re.sub would raise TypeError on it.
            sent = "" if sent is None or sent != sent else str(sent)
        sent = email_re.sub("", sent)  # remove emails
        sent = whitespace_re.sub(" ", sent)  # remove newline chars
        sent = sent.replace("'", "")  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)


# Raw e-mail bodies as a plain list, then one token list per body.
data = df["body"].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])


In [None]:
# Two-word phrases: train the detector, then wrap it in the Phraser object that
# is applied to documents.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Three-word phrases: a second detector trained on the bigram-merged documents.
trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# One-time setup (run in a terminal): python -m spacy download en_core_web_sm
def process_words(
    texts, stop_words=stop_words, allowed_postags=["NOUN", "ADJ", "VERB"]
):
    """Remove stop words, merge bigrams/trigrams and lemmatize.

    Parameters
    ----------
    texts : iterable
        Documents to clean. Each one is re-tokenised with
        ``simple_preprocess(str(doc))``, so token lists and strings both work.
    stop_words : iterable of str
        Words to drop, both before phrase detection and after lemmatization.
        The default is the module-level list as it was when this cell ran.
    allowed_postags : collection of str
        spaCy coarse part-of-speech tags to keep; all other tokens are dropped.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    # A set gives constant-time membership tests; scanning the stop-word list
    # for every token of every document is needlessly slow.
    stop_set = set(stop_words)

    def _drop_stop_words(docs):
        # simple_preprocess also lower-cases and discards tokens outside its
        # default length limits.
        return [
            [word for word in simple_preprocess(str(doc)) if word not in stop_set]
            for doc in docs
        ]

    texts = _drop_stop_words(texts)
    # NOTE(review): bigram_mod is applied here and again inside the trigram
    # step below; kept as in the original to preserve the existing output.
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
    # nlp.pipe processes the documents in batches and yields them in order,
    # which is much faster than calling nlp() once per document.
    parsed_docs = nlp.pipe(" ".join(sent) for sent in texts)
    texts_out = [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in parsed_docs
    ]

    # remove stopwords once more after lemmatization
    return _drop_stop_words(texts_out)


data_ready = process_words(data_words)


In [None]:
# Spot-check one cleaned document (needs at least 101 documents in data_ready).
print(data_ready[100])


In [None]:
# Cache the cleaned tokens on disk; the context manager makes sure the file is
# flushed and closed.
with open("clean_words.pickle", "wb") as f:
    pickle.dump(data_ready, f)

In [None]:
# Reload the cached tokens. Only unpickle files written by this notebook:
# pickle.load can execute arbitrary code from a tampered file.
with open("clean_words.pickle", "rb") as f:
    data_ready = pickle.load(f)

### LDA

In [None]:
# Vocabulary: maps every distinct token to an integer id.
id2word = corpora.Dictionary(data_ready)

# Bag-of-words corpus: one doc2bow result per cleaned document.
corpus = list(map(id2word.doc2bow, data_ready))

# Fit the LDA topic model (5 topics, fixed seed).
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=5,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=60,
    alpha="auto",
    iterations=100,
    per_word_topics=True,
)

print(lda_model.print_topics())


# Topic


In [None]:
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Build a table with the dominant topic of every document.

    Parameters
    ----------
    ldamodel :
        Fitted topic model. ``ldamodel[corpus]`` must yield one result per
        document, and the model must provide ``per_word_topics`` and
        ``show_topic(topic_id)``.
    corpus :
        Bag-of-words corpus. The default is the module-level ``corpus`` as it
        was when this cell ran.
    texts :
        One entry per document, attached as the last column. The default is
        the module-level ``data`` as it was when this cell ran.

    Returns
    -------
    pandas.DataFrame
        Columns ``Dominant_Topic``, ``Perc_Contribution``, ``Topic_Keywords``
        followed by the unnamed ``texts`` column (label ``0``). A document
        without any topic assignment gets NaN / NaN / "" so the rows stay
        aligned with ``texts``.
    """
    column_names = ["Dominant_Topic", "Perc_Contribution", "Topic_Keywords"]
    keywords_by_topic = {}  # the keyword string of a topic is built only once
    records = []

    for row_list in ldamodel[corpus]:
        # With per_word_topics the model returns a tuple whose first item is
        # the document's list of (topic_id, probability) pairs.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            records.append((float("nan"), float("nan"), ""))
            continue

        # Dominant topic = highest probability (first one wins on ties).
        topic_num, prop_topic = max(row, key=lambda pair: pair[1])
        topic_num = int(topic_num)
        if topic_num not in keywords_by_topic:
            keywords_by_topic[topic_num] = ", ".join(
                word for word, _ in ldamodel.show_topic(topic_num)
            )
        records.append(
            (topic_num, round(float(prop_topic), 4), keywords_by_topic[topic_num])
        )

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0 and copied the whole frame on every call.
    sent_topics_df = pd.DataFrame(records, columns=column_names)

    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df


In [None]:
# Dominant topic of every document, paired with its cleaned tokens.
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, data_ready)

# Turn the row index into a document-number column and rename all columns.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ["Document_No", "Dominant_Topic", "Topic_Perc_Contrib", "Keywords", "Text"]
df_dominant_topic.head(10)


In [None]:
# For every dominant topic keep the single document with the highest
# contribution of that topic.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby("Dominant_Topic")

# Collect the per-topic winners first and concatenate once; growing a frame by
# concatenating onto an empty DataFrame inside the loop copies it every time
# and can change column dtypes.
top_docs = [
    grp.sort_values(["Perc_Contribution"], ascending=False).head(1)
    for _, grp in sent_topics_outdf_grpd
]
sent_topics_sorteddf_mallet = (
    # pd.concat refuses an empty list; fall back to an empty frame that still
    # has the four expected columns.
    pd.concat(top_docs, axis=0) if top_docs else df_topic_sents_keywords.iloc[:0]
).reset_index(drop=True)

# Format
sent_topics_sorteddf_mallet.columns = [
    "Topic_Num",
    "Topic_Perc_Contrib",
    "Keywords",
    "Representative Text",
]

# Show
sent_topics_sorteddf_mallet.head()


In [None]:

pyLDAvis.enable_notebook()

# The prepared visualisation is cached as a pickle next to its HTML export.
LDAvis_data_filepath = "./results/ldavis_prepared_test"
LDAvis_html_filepath = LDAvis_data_filepath + ".html"
# Neither open() nor save_html creates a missing output directory.
os.makedirs(os.path.dirname(LDAvis_data_filepath), exist_ok=True)

# Set to False to reuse an existing cache instead of recomputing the layout.
RECOMPUTE_LDAVIS = True

if RECOMPUTE_LDAVIS or not os.path.exists(LDAvis_data_filepath):
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
else:
    # Only load caches written by this notebook: unpickling a tampered file
    # can execute arbitrary code.
    with open(LDAvis_data_filepath, 'rb') as f:
        LDAvis_prepared = pickle.load(f)

pyLDAvis.save_html(LDAvis_prepared, LDAvis_html_filepath)
LDAvis_prepared

# Cluster


### Topic modeling visualization


In [None]:
# Number of tokens in every document.
doc_lens = [len(d) for d in df_dominant_topic.Text]

# Plot
fig, ax = plt.subplots(figsize=(16, 7), dpi=160)
ax.hist(doc_lens, bins=1000, color="navy")

if doc_lens:  # the summary statistics are undefined for an empty corpus
    stats_lines = [
        "Mean   : " + str(round(np.mean(doc_lens))),
        "Median : " + str(round(np.median(doc_lens))),
        "Stdev   : " + str(round(np.std(doc_lens))),
        "1%ile    : " + str(round(np.quantile(doc_lens, q=0.01))),
        "99%ile  : " + str(round(np.quantile(doc_lens, q=0.99))),
    ]
    for line_no, stats_line in enumerate(stats_lines):
        # Axes-fraction coordinates keep the text inside the plot whatever the
        # bar heights are; fixed data coordinates can fall outside the y range.
        ax.text(0.75, 0.90 - 0.06 * line_no, stats_line, transform=ax.transAxes)

ax.set(
    xlim=(0, 1000), ylabel="Number of Documents", xlabel="Document Word Count"
)
ax.tick_params(size=16)
ax.set_xticks(np.linspace(0, 1000, 9))
ax.set_title("Distribution of Document Word Counts", fontdict=dict(size=22))
plt.show()


In [None]:
import seaborn as sns
import matplotlib.colors as mcolors

cols = [
    color for name, color in mcolors.TABLEAU_COLORS.items()
]  # more colors: 'mcolors.XKCD_COLORS'

# One panel per topic, two panels per row. A fixed 2x2 grid would silently
# leave out every topic after the fourth.
n_topics = lda_model.num_topics
n_rows = (n_topics + 1) // 2
fig, axes = plt.subplots(
    n_rows,
    2,
    figsize=(16, 7 * n_rows),
    dpi=160,
    sharex=True,
    sharey=True,
    squeeze=False,  # always a 2-D array, even for a single row
)
panels = axes.flatten()

for i, ax in enumerate(panels):
    if i >= n_topics:
        # Unused panel of an odd topic count: hide it and give the panel above
        # it the x tick labels that sharex reserves for the bottom row.
        ax.axis("off")
        if i >= 2:
            panels[i - 2].tick_params(labelbottom=True)
        continue

    color = cols[i % len(cols)]  # cycle if there are more topics than colours
    df_dominant_topic_sub = df_dominant_topic.loc[
        df_dominant_topic.Dominant_Topic == i, :
    ]
    doc_lens = [len(d) for d in df_dominant_topic_sub.Text]
    ax.hist(doc_lens, bins=1000, color=color)
    ax.tick_params(axis="y", labelcolor=color, color=color)
    if len(set(doc_lens)) > 1:
        # A density estimate needs at least two distinct values. The deprecated
        # `shade` argument is omitted; an unfilled curve is the default.
        sns.kdeplot(doc_lens, color="black", ax=ax.twinx())
    ax.set(xlim=(0, 1000), xlabel="Document Word Count")
    ax.set_ylabel("Number of Documents", color=color)
    ax.set_title("Topic: " + str(i), fontdict=dict(size=16, color=color))

fig.tight_layout()
fig.subplots_adjust(top=0.90)
# The x axis is shared, so setting the ticks on one panel sets them on all.
panels[0].set_xticks(np.linspace(0, 1000, 9))
fig.suptitle("Distribution of Document Word Counts by Dominant Topic", fontsize=22)
plt.show()


In [None]:
# 1. Wordcloud of Top N words in each topic
from matplotlib import pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import matplotlib.colors as mcolors

cols = [
    color for name, color in mcolors.TABLEAU_COLORS.items()
]  # more colors: 'mcolors.XKCD_COLORS'

cloud = WordCloud(
    stopwords=stop_words,
    background_color="white",
    width=2500,
    height=1800,
    max_words=10,
    colormap="tab10",
    # `i` is read when a cloud is generated, i.e. it is the index of the
    # plotting loop below; the modulo cycles the palette for many topics.
    color_func=lambda *args, **kwargs: cols[i % len(cols)],
    prefer_horizontal=1.0,
)

# num_topics=-1 returns every topic, ordered by topic id.
topics = lda_model.show_topics(num_topics=-1, formatted=False)

# One panel per topic, two per row; a fixed 2x2 grid would drop every topic
# after the fourth.
n_rows = (len(topics) + 1) // 2
fig, axes = plt.subplots(
    n_rows, 2, figsize=(10, 5 * n_rows), sharex=True, sharey=True, squeeze=False
)

for i, ax in enumerate(axes.flatten()):
    ax.axis("off")
    if i >= len(topics):
        continue  # spare panel of an odd topic count stays empty
    topic_id, word_weights = topics[i]
    cloud.generate_from_frequencies(dict(word_weights), max_font_size=300)
    # Draw straight onto the panel instead of re-registering it with the figure.
    ax.imshow(cloud)
    ax.set_title("Topic " + str(topic_id), fontdict=dict(size=16))


plt.subplots_adjust(wspace=0, hspace=0)
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()


In [None]:
from collections import Counter

# num_topics=-1 returns every topic, ordered by topic id.
topics = lda_model.show_topics(num_topics=-1, formatted=False)
data_flat = [w for w_list in data_ready for w in w_list]
counter = Counter(data_flat)

# One row per (topic, keyword): model weight and corpus-wide frequency.
out = [
    [word, topic_id, weight, counter[word]]
    for topic_id, topic in topics
    for word, weight in topic
]

# Stored under its own name: `df` holds the e-mails loaded at the top of the
# notebook and is read again further down, so it must not be overwritten here.
df_topic_words = pd.DataFrame(
    out, columns=["word", "topic_id", "importance", "word_count"]
)

# Use the original axis limits unless the data would be clipped by them.
count_ylim = max(3500, df_topic_words["word_count"].max() * 1.05)
weight_ylim = max(0.030, df_topic_words["importance"].max() * 1.05)

# Plot Word Count and Weights of Topic Keywords
# One panel per topic, two per row; a fixed 2x2 grid would drop every topic
# after the fourth.
n_topics = len(topics)
n_rows = (n_topics + 1) // 2
fig, axes = plt.subplots(
    n_rows, 2, figsize=(16, 5 * n_rows), sharey=True, dpi=160, squeeze=False
)
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
for i, ax in enumerate(axes.flatten()):
    if i >= n_topics:
        ax.axis("off")  # spare panel of an odd topic count
        continue

    topic_id = topics[i][0]
    color = cols[i % len(cols)]
    topic_rows = df_topic_words.loc[df_topic_words.topic_id == topic_id, :]

    ax.bar(
        x="word",
        height="word_count",
        data=topic_rows,
        color=color,
        width=0.5,
        alpha=0.3,
        label="Word Count",
    )
    ax_twin = ax.twinx()
    ax_twin.bar(
        x="word",
        height="importance",
        data=topic_rows,
        color=color,
        width=0.2,
        label="Weights",
    )
    ax.set_ylabel("Word Count", color=color)
    ax_twin.set_ylim(0, weight_ylim)
    ax.set_ylim(0, count_ylim)
    ax.set_title("Topic: " + str(topic_id), color=color, fontsize=16)
    ax.tick_params(axis="y", left=False)
    # Restyle the labels the bar plot already created; assigning label text
    # without fixing the tick positions triggers a Matplotlib warning.
    plt.setp(ax.get_xticklabels(), rotation=30, horizontalalignment="right")
    ax.legend(loc="upper left")
    ax_twin.legend(loc="upper right")

fig.tight_layout(w_pad=2)
fig.suptitle("Word Count and Importance of Topic Keywords", fontsize=22, y=1.05)
plt.show()


In [None]:
# Sentence Coloring of N Sentences
from matplotlib.patches import Rectangle


def sentences_chart(lda_model=lda_model, corpus=corpus, start=0, end=13):
    """Show the first words of several documents, coloured by topic.

    The figure has ``end - start`` rows. The first row is left blank and each
    remaining row shows one document, so documents ``start`` to ``end - 2``
    are drawn. Every row is framed in the colour of the document's most
    probable topic and lists up to 14 words, each in the colour of its own
    most relevant topic.

    Parameters
    ----------
    lda_model :
        Topic model. ``lda_model[bow]`` must return the triple
        ``(topic_percs, wordid_topics, wordid_phivalues)``; the model in this
        notebook is built with ``per_word_topics=True``. Also needs ``id2word``.
        The default is the module-level model as it was when this cell ran.
    corpus :
        Bag-of-words corpus, sliced as ``corpus[start:end]``. The default is
        the module-level corpus as it was when this cell ran.
    start, end : int
        Slice bounds into ``corpus``.
    """
    corp = corpus[start:end]
    mycolors = [color for name, color in mcolors.TABLEAU_COLORS.items()]
    max_words = 14  # words drawn per document before the trailing ". . ."

    n_rows = end - start
    fig, axes = plt.subplots(
        n_rows, 1, figsize=(20, n_rows * 0.95), dpi=160, squeeze=False
    )
    axes = axes.ravel()  # squeeze=False keeps this an array even for one row
    axes[0].axis("off")

    for offset, ax in enumerate(axes[1:]):
        ax.axis("off")
        if offset >= len(corp):
            continue  # the corpus ends before the requested range does

        topic_percs, wordid_topics, wordid_phivalues = lda_model[corp[offset]]
        word_dominanttopic = [
            (lda_model.id2word[wd], topic[0])
            for wd, topic in wordid_topics
            if topic  # skip words that were not assigned to any topic
        ]
        ax.text(
            0.01,
            0.5,
            # Label with the document's position in the corpus, not in the slice.
            "Doc " + str(start + offset) + ": ",
            verticalalignment="center",
            fontsize=16,
            color="black",
            transform=ax.transAxes,
            fontweight=700,
        )

        # Draw Rectangle in the colour of the document's dominant topic
        if topic_percs:
            dominant_topic = max(topic_percs, key=lambda pair: pair[1])[0]
            ax.add_patch(
                Rectangle(
                    (0.0, 0.05),
                    0.99,
                    0.90,
                    fill=None,
                    alpha=1,
                    color=mycolors[dominant_topic % len(mycolors)],
                    linewidth=2,
                )
            )

        word_pos = 0.06
        for word, topic_id in word_dominanttopic[:max_words]:
            ax.text(
                word_pos,
                0.5,
                word,
                horizontalalignment="left",
                verticalalignment="center",
                fontsize=16,
                color=mycolors[topic_id % len(mycolors)],
                transform=ax.transAxes,
                fontweight=700,
            )
            word_pos += 0.009 * len(word)  # to move the word for the next iter
        ax.text(
            word_pos,
            0.5,
            ". . .",
            horizontalalignment="left",
            verticalalignment="center",
            fontsize=16,
            color="black",
            transform=ax.transAxes,
        )

    plt.subplots_adjust(wspace=0, hspace=0)
    plt.suptitle(
        "Sentence Topic Coloring for Documents: " + str(start) + " to " + str(end - 2),
        fontsize=22,
        y=0.95,
        fontweight=700,
    )
    plt.tight_layout()
    plt.show()


sentences_chart()


### Implement K-Means Clustering Algorithm


In [None]:
# Number of clusters to fit.
true_k = 5
model = KMeans(
    n_clusters=true_k,
    init="k-means++",
    max_iter=300,
    n_init=1,
    verbose=1,
    # With a single initialisation the result depends entirely on the random
    # start; a fixed seed makes the clusters reproducible between runs.
    random_state=100,
)
# NOTE(review): X_df is not created anywhere in the visible part of this
# notebook — it has to be defined before this cell can run.
model.fit(X_df)


In [None]:
from joblib import dump, load

# Persist the fitted KMeans model as model.joblib in the working directory.
dump(model, "model.joblib")


In [None]:
# For every cluster: feature indices sorted from the largest to the smallest
# centroid value.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it.
# NOTE(review): `cv` is not defined in the visible part of this notebook —
# presumably the fitted vectorizer; confirm. If leading feature columns were
# dropped from X_df, `terms` must be sliced the same way or the indices in
# order_centroids point at the wrong words.
terms = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)


In [None]:
# Per-cluster feature indices, largest centroid value first.
order_centroids


In [None]:
# The same ranking in ascending order (smallest centroid value first).
model.cluster_centers_.argsort()


### Feature Words Per Cluster

https://github.com/adriancampos1/Enron_Email_Analysis/blob/master/Enron_Email_Analysis_K-means_clustering.ipynb


In [None]:
# Print the 20 highest-ranked terms of every cluster.
for i in range(true_k):
    print(f"Cluster {i}:")
    for ind in order_centroids[i, :20]:
        print(" ", terms[ind])


### Run Test Document & Find Its Cluster


In [None]:
# confirming we still have our 'body' document
test_document = df["body"][0]
test_document


In [None]:
# Vectorise the test document with the fitted vectorizer.
test_X = cv.transform([test_document])
# toarray() returns a regular ndarray; todense() returns the legacy np.matrix
# type, which NumPy no longer recommends.
test_X_dense = test_X.toarray()


In [None]:
# Wrap the dense test vector in a DataFrame labelled with the vocabulary terms.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# the installed version provides it.
text_X_df = pd.DataFrame(
    test_X_dense,
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    ),
)
# creating new dense to correct for chopped of features earlier


In [None]:
# Drop the first 106 feature columns of the test document.
# NOTE(review): 106 is a magic number — presumably the same leading columns
# were removed from the training features; confirm where X_df is built.
# NOTE(review): this cell overwrites its own input, so running it a second time
# drops another 106 columns.
text_X_df = text_X_df.iloc[:, 106:]


In [None]:
# Preview the feature frame of the test document.
text_X_df.head(5)


In [None]:
# Assign the test document to one of the fitted KMeans clusters.
print("Prediction")
predicted = model.predict(text_X_df)
print(predicted)


### Plot cluster


In [None]:
# Shape of the centroid matrix: (number of clusters, number of features).
model.cluster_centers_.shape


In [None]:
# Scatter the cluster centres using only their first two feature columns.
centers = model.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c="black", s=200, alpha=0.5)


In [None]:
# Project the documents onto their first two principal components. The seed
# matters because PCA may pick a randomized solver for large inputs.
pca = PCA(n_components=2, random_state=100)
coords = pca.fit_transform(X_df)

# Project the cluster centres with the same fitted transformation.
centers = model.cluster_centers_
pca_centers = pca.transform(centers)

fig, ax = plt.subplots()
ax.scatter(coords[:, 0], coords[:, 1], c="m", label="Documents")
ax.scatter(
    pca_centers[:, 0],
    pca_centers[:, 1],
    c="red",
    s=200,
    alpha=0.5,
    label="Cluster centres",
)
ax.set(
    xlabel="Principal component 1",
    ylabel="Principal component 2",
    title="Documents and KMeans cluster centres (PCA projection)",
)
ax.legend()
plt.show()


In [None]:
# 2-D PCA coordinates of the cluster centres.
pca.transform(model.cluster_centers_)
