In [1]:
import sys, os, re, operator
import nltk
from Textrank import textrank
from Textrank.Units import EmailUnit, SentenceUnit
from util.Timer import Timer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.cluster import spectral_clustering, AffinityPropagation, MeanShift
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [2]:
# Loading Sentence Detector and Stop Words List
sentenceDetector = nltk.data.load('tokenizers/punkt/english.pickle')
stopwords = stopwords.words('english')

In [3]:
# Yields all non-hidden files/subdirectories in a directory.
def listdir_nohidden(path):
    for f in os.listdir(path):
        if not f.startswith('.'):
            yield '{0}/{1}'.format(path, f)

In [7]:
# Takes in an emails file path and the account owner and returns the EmailUnit.
def parseEmail(path, owner):
    
    with open(path, 'r') as f:
        contents = [l for l in f]
    
    # Lines that match these regexes should be removed:
    removeRegex = "({0})".format("|".join(['----------------------\sForwarded\sby',
                                            '-----Original\sMessage-----',
                                            'From:\s',
                                            'Sent:\s',
                                            'To:\s',
                                            'cc:\s',
                                            'bcc:\s',
                                            '[IMAGE]',
                                            'Subject:\s',
                                           ]))
        
    # Find the line numbers of the corresponding fields:
    senderIndex = [i for i, l in enumerate(contents) if re.search('(From:)', l)][0]
    recipientIndex = [i for i, l in enumerate(contents) if re.search('(X-To:)', l)][0]
    subjectIndex = [i for i, l in enumerate(contents) if re.search('(Subject:)', l)][0]
    textIndex = [i for i, l in enumerate(contents) if re.search('(X-FileName:)', l)][0] + 2
    
    # Extract Email metadata:
    sender = re.search('(From:\s)([^\n]+)', contents[senderIndex]).group(2).strip()
    recipient = re.search('(To:\s)([^\n]+)', contents[recipientIndex]).group(2).strip()
    subject = re.search('(Subject:\s)([^\n]+)', contents[subjectIndex]).group(2).strip()
    body = contents[textIndex:]
    
    cleaned = filter(lambda l: re.search(removeRegex, l), body)
    text = " ".join([subject] + contents[textIndex:]).replace("\n", "").strip()
    
    words = word_tokenize(text)
    processed = " ".join([ w for w in words if w not in stopwords ])
    
    return EmailUnit(owner, sender, recipient, subject, text, processed)

In [8]:
# Takes in the root path to an email account and returns all EmailUnits in that account.
def parseEmailAccount(root):
    emailPaths = [path for direc in listdir_nohidden(root) for path in listdir_nohidden(direc)]
    emails = filter(None, [parseEmail(path, root[root.rfind('/') + 1:]) for path in emailPaths])
    return emails

In [9]:
emails = parseEmailAccount("../Data/allen-p")
vectorizer = CountVectorizer(min_df=1)
corpus = [email.text for email in emails]
processedCorpus = [email.processed for email in emails]

X = vectorizer.fit_transform(processedCorpus)
feature_names = vectorizer.get_feature_names()
transformer = TfidfTransformer()
X_tfidf = transformer.fit_transform(X)

In [10]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [13]:
nmf = NMF(n_components=100, init='random', random_state=0)
nmf.fit(X_tfidf)
print_top_words(nmf, feature_names, 12)

Topic #0:
03 2001 allen 01 pm phillip cc forwarded subject to hou am
Topic #1:
program trading chad track rotation bland resume ted risk interview train desks
Topic #2:
doc mime file system using attachment message ms pegasus compliant mailer davis
Topic #3:
com www http chnlownr hearme vc2 nytimes greenbuilder item pallen autoweb hemp
Topic #4:
maps 2000 07 6756 brodeur stephane 16 403 974 03 allen map
Topic #5:
rentroll lucy questions here going week 32 12 write 20a deposit apartment
Topic #6:
message sent original from 2001 phillip subject allen to monday fw mailto
Topic #7:
partnership would let like jacques engineer work know architect need agreements engineering
Topic #8:
ect hou 2000 allen pm 07 forwarded 11 cc subject 04 from
Topic #9:
distinguish detailing mark sells buys vs trades september spreadsheet socal 29 08
Topic #10:
attorney wholesale communication conf western activities 05 call ferc power asserted steffes
Topic #11:
06 gas webcl1 ecthou hoskins nt brian password re

In [14]:
lda = LatentDirichletAllocation(n_topics=100, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)

lda.fit(X)
print_top_words(lda, feature_names, 12)

Topic #0:
ect hou 2000 00 scott 09 pm meeting phillip allen physical gas
Topic #1:
kind paine exclusive weber sdli rising eog jdsu exercising starting oil 3d
Topic #2:
quar psa major mega frequency accommodating technically marshall deffner gov rt governmental
Topic #3:
chance magazine resp homework professionals ease opportuni eb1672 sip recomm 5552 begun
Topic #4:
image change story full oil close nov 000 216 nmail 237 insight
Topic #5:
curves curve saved gd here drop night look improvements socal pulls menu
Topic #6:
save click oz garden saws icon reg folder phillip messenger mignons roehm
Topic #7:
00 zdnet com http cgi slink 8593142 thru sat ct adeskb center
Topic #8:
project allen proforma com loans spy calculated estimat lowry opposite overdyke hou
Topic #9:
inter tuning 3175 comply gain 2101 advisor 981 prebid hdx options fronts
Topic #10:
curb motion weatherford density patterson compromise landowners seniors mcbride hughson craddock mihalkanin
Topic #11:
nwpl chicago katy per

In [16]:
lsa = TruncatedSVD(n_components=100)
lsa.fit(X) 
print_top_words(lsa, feature_names, 12)
print lsa.components_

Topic #0:
ect enron hou enronxgate corp ees na com enron_development 09 20 allen
Topic #1:
09 image 20 090 00 full story 10 09go com the 2001
Topic #2:
00 ees com enron 2001 20 10 3d mail power pm na
Topic #3:
enron na corp hr outlook mgusa owa migration ees 09 access nyc
Topic #4:
ees com hou mail 20 sce http 01 cash cpuc richard the
Topic #5:
3d 2001 networks tax us nortel net 30 2000 results loss operations
Topic #6:
com image http 00pm 12 the showtimes 30pm mail zdnet www phillip
Topic #7:
10 2001 00pm showtimes 30pm 12 11 15pm image 45pm pm thru
Topic #8:
image ees 50 power 3d enronxgate prices change said story 00pm showtimes
Topic #9:
enronxgate image com 2001 mail 00 http thru sat zdnet ct london
Topic #10:
enronxgate 00pm showtimes 30pm 15pm 45pm mail 3d 12 com ees 00
Topic #11:
enronxgate 20 would the 000 loan buy loans costs cost 2001 land
Topic #12:
buy 20 downgraded 2000 coverage 2001 initiated strong http power upgraded 50
Topic #13:
20 http zdnet www ect cgi com slink sc

In [23]:
print lsa.components_.shape
print lda.components_.shape
print nmf.components_.shape

print X.toarray().shape
print X_tfidf.toarray().shape
print len(processedCorpus)

(100, 17600)
(100, 17600)
(100, 17600)
(3034, 17600)
(3034, 17600)
3034


In [None]:
for t in corpus[:30]:
    sentences = sentenceDetector.tokenize(t)
    sentenceUnits = [ SentenceUnit(s) for s in sentences ]
    keyWords = textrank.textrank_keyword(sentenceUnits)
    print keyWords

In [None]:
sorted_x = sorted(keyWords.items(), key=operator.itemgetter(1), reverse=True)
print sorted_x