In [1]:
import numpy as np
import pandas as pd
import string
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from nltk.corpus import stopwords
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv('email_input.csv')

In [3]:
data.head()

Unnamed: 0,body,from,message-id,subject
0,"['Anyone', 'know', 'about', 'the', 'Weitek', '...",cavalier@blkbox.COM (Bill Egan),<1993Apr18.031714.3642@nntpxfer.psi.com>,Weitek P9000 ?
1,"['In', 'article', '1993Mar31', '140529', '1084...",arc@cco.caltech.edu (Aaron Ray Clements),<1pifnjINNscb@gap.caltech.edu>,ACLU (was Re
2,"['and', 'I', 'm', 'sure', 'that', 'people', 'w...",hrubin@pop.stat.purdue.edu (Herman Rubin),<C5sFnz.Fo1@mentor.cc.purdue.edu>,Gritz/JBS/Liberty Lobby/LaRouche/Christic Insi...
3,"['In', 'article', '1r6p8oINN8hi', 'clem', 'han...",wwarf@silver.ucs.indiana.edu (Wayne J. Warf),<C5y2r9.4D7@usenet.ucs.indiana.edu>,BD's did themselves--you're all paranoid freaks
4,"['I', 'recently', 'have', 'become', 'aware', '...",mon@cray.com (Muriel Nelson),<1993Apr15.154053.3087@hemlock.cray.com>,ABORTION and private health coverage -- letter...


LDA requires extra text-preprocessing. Custom functions are used to remove additional punctuation and words.
These functions are then used to clean the body of the email messages to improve the accuracy in calculating the probability score assigned to each word.

In [4]:
def split_and_clean(x):
    x = x.strip('[')
    x = x.strip(']')
    x = x.replace("'","")
    x = x.split(sep=',')
    return x

In [5]:
extra_words = []
def extra_cleaning(x):
    for n in range(len(x)):
        if len(x[n]) < 2:
            extra_words.append(x[n])
        x[n] = x[n].strip()
        x[n] = x[n].lower()
    return x

In [6]:
tokenized_text = data['body'].to_list()

In [7]:
print(f'Before Cleaning')
print(tokenized_text[0])

Before Cleaning
['Anyone', 'know', 'about', 'the', 'Weitek', 'P9000', 'graphics', 'chip', 'Do', 'you', 'have', 'Weitek', 's', 'address', 'phone', 'number', 'I', 'd', 'like', 'to', 'get', 'some', 'information', 'about', 'this', 'chip', 'Yes', 'I', 'am', 'very', 'interested', 'in', 'this', 'chip', 'Please', 'follow', 'up', 'or', 'email', 'Bill', 'EganCavalier', 'GraphicsHouston', 'Texas']


In [8]:
#CLEANING THE TOKENIZED TEXT USING CUSTOM FUNCTIONS
for n in range(len(tokenized_text)):
    tokenized_text[n] = extra_cleaning(split_and_clean(tokenized_text[n]))

In [9]:
print(f'After Cleaning')
print(tokenized_text[0])

After Cleaning
['anyone', 'know', 'about', 'the', 'weitek', 'p9000', 'graphics', 'chip', 'do', 'you', 'have', 'weitek', 's', 'address', 'phone', 'number', 'i', 'd', 'like', 'to', 'get', 'some', 'information', 'about', 'this', 'chip', 'yes', 'i', 'am', 'very', 'interested', 'in', 'this', 'chip', 'please', 'follow', 'up', 'or', 'email', 'bill', 'egancavalier', 'graphicshouston', 'texas']


In [10]:
stopwords = ENGLISH_STOP_WORDS.union(['like', 'know', 'think', 'just','don','make','does','way','com','thanks'])
add = ['0','1','2','3','4','5','6','7','8','9','_','yes','no']
letters = list(string.ascii_lowercase)
extra_words = extra_words + add + letters
stopwords = stopwords.union(extra_words)
print(f'Total number of stopwords removed from the dataset : {len(stopwords)}')

Total number of stopwords removed from the dataset : 367


In [11]:
#Preparing the texts for LDA
texts = [[word for word in text if word not in stopwords] for text in tokenized_text]
print(f'Cleaned Text Sample')
print(texts[0])

Cleaned Text Sample
['weitek', 'p9000', 'graphics', 'chip', 'weitek', 'address', 'phone', 'number', 'information', 'chip', 'interested', 'chip', 'follow', 'email', 'egancavalier', 'graphicshouston', 'texas']


Therefore, only the keywords from each email are extracted and stored in texts for 300 emails.

In [12]:
#CREATING A DICTIONARY USING GENSIM
dictionary = corpora.Dictionary(text for text in texts)

#CREATING UPPER BOUND AND LOWER BOUND TO EXCLUDE UNNECESSARY WORDS
dictionary.filter_extremes(no_below=1, no_above=0.8)

#CREATING A BAG OF WORDS MODEL
corpus = [dictionary.doc2bow(text) for text in texts]

In [13]:
#BUILD A LDA MODEL FROM GENSIM
LDA = models.LdaModel(corpus, num_topics=5, id2word=dictionary, update_every=5,chunksize=10000, passes=100)
print('LDA MODEL READY')

LDA MODEL READY


In [14]:
LDA.show_topics()

[(0,
  '0.004*"people" + 0.004*"government" + 0.003*"said" + 0.003*"want" + 0.003*"believe" + 0.003*"power" + 0.003*"pay" + 0.002*"police" + 0.002*"state" + 0.002*"league"'),
 (1,
  '0.005*"data" + 0.005*"image" + 0.005*"edu" + 0.003*"available" + 0.003*"ftp" + 0.002*"use" + 0.002*"graphics" + 0.002*"software" + 0.002*"package" + 0.002*"images"'),
 (2,
  '0.007*"people" + 0.006*"gun" + 0.005*"government" + 0.004*"edu" + 0.003*"right" + 0.003*"fbi" + 0.003*"guns" + 0.003*"batf" + 0.003*"time" + 0.002*"children"'),
 (3,
  '0.011*"jpeg" + 0.011*"image" + 0.005*"file" + 0.005*"gif" + 0.004*"images" + 0.004*"color" + 0.004*"format" + 0.003*"edu" + 0.003*"president" + 0.003*"software"'),
 (4,
  '0.003*"problem" + 0.003*"image" + 0.003*"otis" + 0.003*"gun" + 0.003*"lines" + 0.002*"homosexual" + 0.002*"files" + 0.002*"use" + 0.002*"pex" + 0.002*"edu"')]

In [15]:
#CREATE A NUMPY ARRAY OF TOPICS
topics = LDA.show_topics(formatted=False, num_words=20)
topics = np.array(topics)

cluster_id = 1
for n in topics[:,:]:
    print(f'Top Words for Cluster {cluster_id} with Probability Score')
    print([str(word) for word in n[1]])
    print('\n')
    cluster_id += 1

Top Words for Cluster 1 with Probability Score
["('people', 0.0038745147)", "('government', 0.0035155138)", "('said', 0.0029258046)", "('want', 0.0027021258)", "('believe', 0.0026552991)", "('power', 0.002619269)", "('pay', 0.0025032319)", "('police', 0.0022943616)", "('state', 0.0022623087)", "('league', 0.0022029597)", "('right', 0.0020094109)", "('fbi', 0.0019954387)", "('defamation', 0.0018912028)", "('cement', 0.0018912028)", "('did', 0.0018464912)", "('really', 0.0018237222)", "('anti', 0.0017875676)", "('information', 0.0017655236)", "('rights', 0.001753033)", "('use', 0.0016902918)"]


Top Words for Cluster 2 with Probability Score
["('data', 0.005244993)", "('image', 0.0046453793)", "('edu', 0.0045123477)", "('available', 0.0029852265)", "('ftp', 0.002595399)", "('use', 0.0023694346)", "('graphics', 0.002285223)", "('software', 0.0021593934)", "('package', 0.0020160535)", "('images', 0.00195763)", "('line', 0.0017359726)", "('insurance', 0.0016314664)", "('program', 0.00160290