In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
from collections import defaultdict
import string

from gensim.models import LdaMulticore
from gensim import corpora, models

import nltk
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

import re

# Default figure size (width, height) in inches for every plot in the notebook.
rcParams['figure.figsize'] = (12, 18)



First we load our dataset

In [2]:
# Read the e-mail table from the CSV file (path is relative to the notebook).
emails = pd.read_csv(filepath_or_buffer="hillary-clinton-emails/Emails.csv")

In [3]:
# Show the available column names as a NumPy array.
emails.columns.to_numpy()

array(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'], dtype=object)

In this part we aggregate over ExtractedSubject, which represents the topics of the conversations that took place.

In [7]:
# Keep only the subject and body columns; missing values become the literal
# string 'nan' so that every cell holds a string.
emailsDocuments = (
    emails
    .loc[:, ['ExtractedSubject', 'ExtractedBodyText']]
    .fillna(value='nan')
    .copy()
)

We replace all non-alphabetical characters (except hyphens) with whitespace, then group the conversations by subject. Afterwards we join all texts within a group into a single text.

In [14]:
# Replace every run of characters other than ASCII letters and hyphens with a
# single space, in both the subject and the body column.
# The pattern is written as a raw string: in a plain string literal "\-" is an
# invalid escape sequence, which Python reports with a warning.
non_letter_run = re.compile(r'[^a-zA-Z\-]+')
for text_column in ('ExtractedSubject', 'ExtractedBodyText'):
    # Bracket assignment always targets the column, unlike attribute assignment.
    emailsDocuments[text_column] = emailsDocuments[text_column].apply(
        lambda text: non_letter_run.sub(' ', text)
    )

In [15]:
# One row per distinct subject: all bodies sharing a subject are concatenated
# into a single space-separated string.
groupedemails = (
    emailsDocuments
    .groupby('ExtractedSubject')['ExtractedBodyText']
    .apply(' '.join)
    .reset_index()
)

In [16]:
# The document for each group is the subject followed by the joined bodies,
# separated by a single space.
subject_text = groupedemails['ExtractedSubject'].astype(str)
body_text = groupedemails['ExtractedBodyText'].astype(str)
groupedemails['Documents'] = subject_text + ' ' + body_text

In [17]:
# groupedemails['Documents'] = groupedemails.RawText  ExtractedBodyText

The next step is to implement the pipeline that cleans our texts and removes e-mail-oriented words.

In [18]:
# Stop-word list used by format() when tokenizing documents.
# Start from NLTK's English stop words ...
customstopwords = stopwords.words('english')
# ... then append 'nan' (the fill value used for missing cells), e-mail
# boilerplate tokens such as 'fw', 'fyi', 're', 'pm' and 'subject', and a long
# list of common English words.
# NOTE: the literal contains repeated entries (e.g. 'down', 'high', 'new');
# they are harmless for membership tests. Entries of three characters or fewer
# can never match, because format() discards such tokens before the
# stop-word check.
customstopwords = customstopwords + ['nan','call', 'know', 'would', 'get', 'time', 'work', 'like', 'today', 
                                     'see', 'morning', 'also', 'back', 'tomorrow', 'meeting', 'think', 
                                     'good', 'want', 'could', 'working', 'well', 'fw', 'fyi', 're', 'pm', 'h',
                                     'subject',
                                     'a','about','above','across','after','again','against','all','almost',
                                     'alone','along','already','also','although','always','among','an','and',
                                     'another','any','anybody','anyone','anything','anywhere','are','area',
                                     'areas','around','as','ask','asked','asking','asks','at','away','b','back',
                                     'backed','backing','backs','be','became','because','become','becomes','been',
                                     'before','began','behind','being','beings','best','better','between','big',
                                     'both','but','by','c','came','can','cannot','case','cases','certain',
                                     'certainly','clear','clearly','come','could','d','did','differ','different',
                                     'differently','do','does','done','down','down','downed','downing','downs',
                                     'during','e','each','early','either','end','ended','ending','ends','enough',
                                     'even','evenly','ever','every','everybody','everyone','everything',
                                     'everywhere','f','face','faces','fact','facts','far','felt','few','find',
                                     'finds','first','for','four','from','full','fully','further','furthered',
                                     'furthering','furthers','g','gave','general','generally','get','gets','give',
                                     'given','gives','go','going','good','goods','got','great','greater','greatest',
                                     'group','grouped','grouping','groups','h','had','has','have','having','he','her',
                                     'here','herself','high','high','high','higher','highest','him','himself','his',
                                     'how','however','i','if','important','in','interest','interested','interesting',
                                     'interests','into','is','it','its','itself','j','just','k','keep','keeps','kind',
                                     'knew','know','known','knows','l','large','largely','last','later','latest',
                                     'least','less','let','lets','like','likely','long','longer','longest','m',
                                     'made','make','making','man','many','may','me','member','members','men',
                                     'might','more','most','mostly','mr','mrs','much','must','my','myself',
                                     'n','necessary','need','needed','needing','needs','never','new','new',
                                     'newer','newest','next','no','nobody','non','noone','not','nothing',
                                     'now','nowhere','number','numbers','o','of','off','often','old','older',
                                     'oldest','on','once','one','only','open','opened','opening','opens','or',
                                     'order','ordered','ordering','orders','other','others','our','out','over',
                                     'p','part','parted','parting','parts','per','perhaps','place','places',
                                     'point','pointed','pointing','points','possible','present','presented',
                                     'presenting','presents','problem','problems','put','puts','q','quite','r',
                                     'rather','really','right','right','room','rooms','s','said','same','saw',
                                     'say','says','second','seconds','see','seem','seemed','seeming','seems',
                                     'sees','several','shall','she','should','show','showed','showing','shows',
                                     'side','sides','since','small','smaller','smallest','so','some','somebody',
                                     'someone','something','somewhere','state','states','still','still','such',
                                     'sure','t','take','taken','than','that','the','their','them','then','there',
                                     'therefore','these','they','thing','things','think','thinks','this','those',
                                     'though','thought','thoughts','three','through','thus','to','today','together',
                                     'too','took','toward','turn','turned','turning','turns','two','u','under',
                                     'until','up','upon','us','use','used','uses','v','very','w','want',
                                     'wanted','wanting','wants','was','way','ways','we','well','wells','went','were',
                                     'what','when','where','whether','which','while','who','whole','whose','why',
                                     'will','with','within','without','work','worked','working','works','would','x',
                                     'y','year','years','yet','you','young','younger','youngest','your','yours','z']
def format(text, stop_words=None):
    """Tokenize a document, dropping short tokens and stop words.

    The text is split on whitespace, tokens of three characters or fewer are
    discarded, the remaining tokens are lower-cased, and every token found in
    the stop-word collection is removed.

    Note: this function shadows the builtin ``format`` within the notebook.

    Parameters
    ----------
    text : str
        Document to tokenize.
    stop_words : iterable of str, optional
        Words to exclude (compared against the lower-cased tokens). When
        omitted, the notebook-level ``customstopwords`` is used, looked up at
        call time.

    Returns
    -------
    list of str
        Lower-cased tokens that survived both filters, in original order.
    """
    if stop_words is None:
        stop_words = customstopwords
    # A set gives constant-time membership tests; checking every token against
    # a list scans the whole list each time.
    excluded = frozenset(stop_words)
    lowered = (word.lower() for word in text.split() if len(word) > 3)
    return [token for token in lowered if token not in excluded]

Prepare the corpus for gensim LDA.

In [19]:
# Tokenize every grouped document, build the token <-> id dictionary, and
# convert each token list into its bag-of-words representation.
texts = list(map(format, groupedemails.Documents))
dictionary = corpora.Dictionary(texts)
corpus = list(map(dictionary.doc2bow, texts))

In a series of experiments, the best results were obtained with around 20 topics.

In [20]:
# Fit an LDA model with 20 topics and list all of them.
N = 20
lda = LdaMulticore(
    corpus=corpus,
    num_topics=N,
    id2word=dictionary,
    workers=3,
    alpha=0.02,
    eta=0.02,
    iterations=1000,
    random_state=42,
)
lda.print_topics(num_topics=N)

[(0,
  '0.006*"obama" + 0.004*"president" + 0.004*"party" + 0.003*"political" + 0.003*"house" + 0.003*"government" + 0.002*"clinton" + 0.002*"people" + 0.002*"support" + 0.002*"world"'),
 (1,
  '0.005*"monday" + 0.004*"pakistan" + 0.004*"speech" + 0.003*"secretary" + 0.003*"send" + 0.003*"reid" + 0.003*"president" + 0.003*"message" + 0.003*"dinner" + 0.003*"spoke"'),
 (2,
  '0.006*"secretary" + 0.005*"talk" + 0.004*"clinton" + 0.004*"speech" + 0.004*"haiti" + 0.003*"release" + 0.003*"people" + 0.003*"home" + 0.003*"send" + 0.003*"ashton"'),
 (3,
  '0.008*"israel" + 0.004*"iran" + 0.004*"obama" + 0.004*"american" + 0.004*"foreign" + 0.004*"political" + 0.004*"peace" + 0.004*"print" + 0.003*"security" + 0.003*"president"'),
 (4,
  '0.007*"secretary" + 0.005*"clinton" + 0.005*"press" + 0.004*"office" + 0.003*"speech" + 0.003*"huma" + 0.003*"president" + 0.003*"strategic" + 0.003*"people" + 0.003*"lona"'),
 (5,
  '0.005*"libya" + 0.004*"united" + 0.004*"president" + 0.004*"department" + 0.

Looks fine, but there are some duplicates: topics 0, 6, 18, 19 and 4 represent the same topic related to elections.
Topics 5, 7 and 10 revolve around Libya, and topics 3, 9, 14 and 17 cover conversations related to Israel.

In [21]:
# Refit with 11 topics, keeping every other setting unchanged, and list them.
N = 11
lda = LdaMulticore(
    corpus=corpus,
    num_topics=N,
    id2word=dictionary,
    workers=3,
    alpha=0.02,
    eta=0.02,
    iterations=1000,
    random_state=42,
)
lda.print_topics(num_topics=N)

[(0,
  '0.005*"obama" + 0.004*"president" + 0.003*"party" + 0.003*"political" + 0.003*"house" + 0.003*"government" + 0.003*"secretary" + 0.002*"people" + 0.002*"called" + 0.002*"office"'),
 (1,
  '0.004*"monday" + 0.004*"secretary" + 0.003*"speech" + 0.003*"nuclear" + 0.003*"clinton" + 0.003*"issues" + 0.003*"washington" + 0.003*"message" + 0.003*"print" + 0.003*"reid"'),
 (2,
  '0.005*"send" + 0.004*"secretary" + 0.004*"haiti" + 0.003*"release" + 0.003*"clinton" + 0.003*"schedule" + 0.003*"statement" + 0.003*"message" + 0.003*"cheryl" + 0.003*"talk"'),
 (3,
  '0.007*"israel" + 0.004*"print" + 0.004*"american" + 0.004*"political" + 0.004*"obama" + 0.003*"foreign" + 0.003*"president" + 0.003*"peace" + 0.003*"speech" + 0.003*"iran"'),
 (4,
  '0.007*"secretary" + 0.004*"clinton" + 0.004*"office" + 0.004*"press" + 0.004*"speech" + 0.004*"president" + 0.003*"lona" + 0.003*"people" + 0.003*"assistant" + 0.003*"cheryl"'),
 (5,
  '0.004*"libya" + 0.004*"united" + 0.003*"office" + 0.003*"depart

It does not look better after the reduction, as the topic about China was lost...

In [29]:
# Sweep the number of topics from 2 to 50, refitting the model each time and
# printing its score on the same corpus it was trained on.
# NOTE(review): log_perplexity appears to return a per-word likelihood bound
# (the printed values are negative) rather than a perplexity -- confirm against
# the gensim documentation before interpreting the numbers.
for N in range(2, 51):
    lda = LdaMulticore(
        corpus, num_topics=N, workers=3, id2word=dictionary,
        alpha=0.02, eta=0.02, iterations=1000, random_state=42,
    )
    corpus_score = lda.log_perplexity(corpus)
    print('Number of modeled topics {} and perplexity for the corpora {}'.format(N, corpus_score))

Number of modeled topics 2 and perplexity for the corpora -9.271516150431834
Number of modeled topics 3 and perplexity for the corpora -9.404023712125204
Number of modeled topics 4 and perplexity for the corpora -9.536958166659696
Number of modeled topics 5 and perplexity for the corpora -9.646190592954412
Number of modeled topics 6 and perplexity for the corpora -9.738715685847344
Number of modeled topics 7 and perplexity for the corpora -9.694275673091616
Number of modeled topics 8 and perplexity for the corpora -9.76478951162569
Number of modeled topics 9 and perplexity for the corpora -9.803833055632605
Number of modeled topics 10 and perplexity for the corpora -9.871231882430763
Number of modeled topics 11 and perplexity for the corpora -9.924778216326255
Number of modeled topics 12 and perplexity for the corpora -9.946776065371559
Number of modeled topics 13 and perplexity for the corpora -10.002614743762406
Number of modeled topics 14 and perplexity for the corpora -10.032282207

Unfortunately, we don't have a dedicated held-out corpus to evaluate our results properly. From the observed results, 6-7 topics could potentially be a good fit, since the score for 7 topics deviates slightly from the decreasing trend between the scores for 6 and 8 topics.