In [219]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sns
from collections import defaultdict
import string

from gensim.models import LdaMulticore
from gensim import corpora, models

import nltk
from nltk import sent_tokenize, word_tokenize, PorterStemmer
from nltk.corpus import stopwords

import re

# Default size (width, height in inches) for every figure drawn in this notebook
rcParams['figure.figsize'] = (12, 18)

First we load our dataset

In [2]:
# Read the raw email dump; the path is relative to the notebook's working directory
emails_csv_path = "hillary-clinton-emails/Emails.csv"
emails = pd.read_csv(emails_csv_path)

In [8]:
# Show the column names of the loaded frame so we can pick the text fields to analyse
emails.columns.values

array(['Id', 'DocNumber', 'MetadataSubject', 'MetadataTo', 'MetadataFrom',
       'SenderPersonId', 'MetadataDateSent', 'MetadataDateReleased',
       'MetadataPdfLink', 'MetadataCaseNumber', 'MetadataDocumentClass',
       'ExtractedSubject', 'ExtractedTo', 'ExtractedFrom', 'ExtractedCc',
       'ExtractedDateSent', 'ExtractedCaseNumber', 'ExtractedDocNumber',
       'ExtractedDateReleased', 'ExtractedReleaseInPartOrFull',
       'ExtractedBodyText', 'RawText'], dtype=object)

In this part we aggregate the emails by `ExtractedSubject`, which represents the topic of each conversation.

In [216]:
# Keep only the subject and body columns, and only the rows where both are present.
# The trailing copy() detaches the result from `emails` so later column
# assignments do not touch the raw frame.
text_columns = ['ExtractedSubject', 'ExtractedBodyText']
emailsDocuments = emails[text_columns].dropna().copy()

We replace every run of non-alphabetical characters (hyphens are kept) with a single space.

In [250]:
# Collapse every run of characters that are neither ASCII letters nor hyphens
# into a single space. The pattern is a raw string: in a normal string literal
# `\-` is an invalid escape sequence, which newer Python versions warn about.
non_letter_run = re.compile(r'[^a-zA-Z\-]+')

# Same cleaning for both text columns; bracket assignment avoids relying on
# attribute-style column setting.
for text_column in ('ExtractedSubject', 'ExtractedBodyText'):
    emailsDocuments[text_column] = emailsDocuments[text_column].apply(
        lambda raw_text: non_letter_run.sub(' ', raw_text)
    )

In [251]:
# One row per distinct (cleaned) subject: all bodies sharing that subject are
# concatenated, separated by single spaces, into one string.
groupedemails = (
    emailsDocuments
    .groupby('ExtractedSubject')['ExtractedBodyText']
    .apply(' '.join)
    .reset_index()
)

In [252]:
# Build the final document text as "<subject> <merged bodies>"
subject_text = groupedemails['ExtractedSubject'].astype(str)
body_text = groupedemails['ExtractedBodyText'].astype(str)
groupedemails['Documents'] = subject_text.str.cat(body_text, sep=' ')

In [253]:
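# NOTE: unused alternative kept for reference: take Documents from a single column
# (RawText or ExtractedBodyText) instead of the subject + body concatenation built above.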

Next, we implement the pipeline that cleans our texts and removes email-oriented words.

In [261]:
# Words that are frequent in email chatter but carry no topical meaning
email_chatter_words = [
    'call', 'know', 'would', 'get', 'time', 'work', 'like', 'today',
    'see', 'morning', 'also', 'back', 'tomorrow', 'meeting', 'think',
    'good', 'want', 'could', 'working', 'well', 'fw', 'fyi', 're', 'pm', 'h',
    'subject',
]

# Additional generic English words (the list contains a few repeated entries;
# they are harmless for membership tests and are kept as-is)
generic_english_words = [
    'a','about','above','across','after','again','against','all','almost',
    'alone','along','already','also','although','always','among','an','and',
    'another','any','anybody','anyone','anything','anywhere','are','area',
    'areas','around','as','ask','asked','asking','asks','at','away','b','back',
    'backed','backing','backs','be','became','because','become','becomes','been',
    'before','began','behind','being','beings','best','better','between','big',
    'both','but','by','c','came','can','cannot','case','cases','certain',
    'certainly','clear','clearly','come','could','d','did','differ','different',
    'differently','do','does','done','down','down','downed','downing','downs',
    'during','e','each','early','either','end','ended','ending','ends','enough',
    'even','evenly','ever','every','everybody','everyone','everything',
    'everywhere','f','face','faces','fact','facts','far','felt','few','find',
    'finds','first','for','four','from','full','fully','further','furthered',
    'furthering','furthers','g','gave','general','generally','get','gets','give',
    'given','gives','go','going','good','goods','got','great','greater','greatest',
    'group','grouped','grouping','groups','h','had','has','have','having','he','her',
    'here','herself','high','high','high','higher','highest','him','himself','his',
    'how','however','i','if','important','in','interest','interested','interesting',
    'interests','into','is','it','its','itself','j','just','k','keep','keeps','kind',
    'knew','know','known','knows','l','large','largely','last','later','latest',
    'least','less','let','lets','like','likely','long','longer','longest','m',
    'made','make','making','man','many','may','me','member','members','men',
    'might','more','most','mostly','mr','mrs','much','must','my','myself',
    'n','necessary','need','needed','needing','needs','never','new','new',
    'newer','newest','next','no','nobody','non','noone','not','nothing',
    'now','nowhere','number','numbers','o','of','off','often','old','older',
    'oldest','on','once','one','only','open','opened','opening','opens','or',
    'order','ordered','ordering','orders','other','others','our','out','over',
    'p','part','parted','parting','parts','per','perhaps','place','places',
    'point','pointed','pointing','points','possible','present','presented',
    'presenting','presents','problem','problems','put','puts','q','quite','r',
    'rather','really','right','right','room','rooms','s','said','same','saw',
    'say','says','second','seconds','see','seem','seemed','seeming','seems',
    'sees','several','shall','she','should','show','showed','showing','shows',
    'side','sides','since','small','smaller','smallest','so','some','somebody',
    'someone','something','somewhere','state','states','still','still','such',
    'sure','t','take','taken','than','that','the','their','them','then','there',
    'therefore','these','they','thing','things','think','thinks','this','those',
    'though','thought','thoughts','three','through','thus','to','today','together',
    'too','took','toward','turn','turned','turning','turns','two','u','under',
    'until','up','upon','us','use','used','uses','v','very','w','want',
    'wanted','wanting','wants','was','way','ways','we','well','wells','went','were',
    'what','when','where','whether','which','while','who','whole','whose','why',
    'will','with','within','without','work','worked','working','works','would','x',
    'y','year','years','yet','you','young','younger','youngest','your','yours','z',
]

# Final stop-word list: NLTK's English list first, then the two custom lists, in that order
customstopwords = stopwords.words('english') + email_chatter_words + generic_english_words
def format(text, stop_words=None, min_length=4):
    """Tokenize a document into lower-cased words with stop words removed.

    NOTE: this name shadows the built-in ``format``; it is kept because the
    corpus-building cell calls the function by this name.

    Parameters
    ----------
    text : str
        Document to split on whitespace.
    stop_words : iterable of str, optional
        Words to discard, compared against the lower-cased token. When
        omitted, the module-level ``customstopwords`` is used.
    min_length : int, optional
        Minimum number of characters a token needs in order to be kept.
        The default of 4 reproduces the original ``len(w) > 3`` rule.

    Returns
    -------
    list of str
        The surviving tokens, lower-cased, in their original order.
    """
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    excluded = frozenset(customstopwords if stop_words is None else stop_words)
    # Length is checked before lower-casing, exactly as in the original filter.
    lowered = (token.lower() for token in text.split() if len(token) >= min_length)
    return [word for word in lowered if word not in excluded]

In [262]:
# Tokenize every aggregated document into its list of kept words
texts = list(map(format, groupedemails.Documents))
# Build the gensim dictionary from the tokenized documents
dictionary = corpora.Dictionary(texts)
# Convert each token list to its bag-of-words representation via the dictionary
corpus = list(map(dictionary.doc2bow, texts))

In [264]:
# Number of topics to fit; also the number of topics printed below
N = 20

# Train the LDA model on the bag-of-words corpus. random_state is fixed so the
# run uses the same seed each time.
lda = LdaMulticore(
    corpus=corpus,
    num_topics=N,
    id2word=dictionary,
    workers=3,
    alpha=0.02,
    eta=0.02,
    iterations=1000,
    random_state=42,
)

# Last expression of the cell: displays the top words of all N topics
lda.print_topics(num_topics=N)

[(0,
  '0.010*"benghazi" + 0.009*"information" + 0.009*"house" + 0.009*"department" + 0.008*"date" + 0.008*"agreement" + 0.008*"comm" + 0.008*"dept" + 0.008*"waiver" + 0.007*"produced"'),
 (1,
  '0.009*"china" + 0.007*"district" + 0.005*"iran" + 0.004*"secretary" + 0.004*"holbrooke" + 0.004*"women" + 0.003*"house" + 0.003*"ambassador" + 0.003*"world" + 0.003*"united"'),
 (2,
  '0.007*"house" + 0.004*"heyman" + 0.004*"people" + 0.003*"policy" + 0.003*"book" + 0.003*"called" + 0.003*"richards" + 0.003*"bill" + 0.003*"support" + 0.003*"vote"'),
 (3,
  '0.007*"president" + 0.005*"office" + 0.004*"obama" + 0.004*"speech" + 0.004*"discuss" + 0.003*"washington" + 0.003*"percent" + 0.003*"please" + 0.003*"public" + 0.003*"print"'),
 (4,
  '0.008*"mail" + 0.006*"message" + 0.006*"received" + 0.005*"read" + 0.005*"clinton" + 0.005*"secretary" + 0.004*"start" + 0.004*"address" + 0.004*"local" + 0.003*"system"'),
 (5,
  '0.008*"secretary" + 0.005*"people" + 0.005*"house" + 0.005*"president" + 0.00