# Topic Modeling

In [19]:
import pandas as pd
import numpy as np
import gensim

In [20]:
### ONLY RUN ONCE ####
# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')

In [21]:
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
# from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Load only the sender id, subject and body columns of the email dump.
path_emails = 'hillary-clinton-emails/Emails.csv'
emails = pd.read_csv(
    path_emails,
    usecols=['SenderPersonId', 'ExtractedSubject', 'ExtractedBodyText'],
)

# Replace every missing cell with a single space so that the string
# concatenation below never meets a NaN.
emails = emails.fillna(' ')

# One text per email: the subject, a space, then the body.
subject_and_body = emails['ExtractedSubject'] + ' ' + emails['ExtractedBodyText']
emails['subject_body'] = subject_and_body
emails_raw = np.asarray(emails['subject_body'])

# Tokenization: split every email into runs of regex word characters
# (`\w+`); everything between those runs is discarded.
# The pattern is a raw string so the backslash reaches the regex engine
# untouched instead of being parsed as an (invalid) string escape.
email_tokens = [regexp_tokenize(email, pattern=r'\w+') for email in emails_raw]

# Stop-word list: NLTK's English stop words plus email header/boilerplate
# terms (forwarding markers, header field names, file extensions, ...).
stop_words = set(stopwords.words('english'))
stopwords_emails = ['fyi', 'fm', 'am', 'pm', 'n\'t', 'sent', 'from', 'to', 'subject', 'fw', 'fwd', 'fvv',
                    'cc', 'bcc', 'attachments', 're', 'date', 'html', 'php']
stop_words.update(stopwords_emails)

# Keep a token unless it is purely digits or its lower-cased form is a stop
# word. Tokens keep their original casing at this stage.
email_clean_tokens = [
    [token for token in email if not (token.isdigit() or token.lower() in stop_words)]
    for email in email_tokens
]

# Lemmatization: lower-case each remaining token and pass it through the
# WordNet lemmatizer (called without an explicit part of speech).
# A PorterStemmer alternative is left disabled in this notebook.
wnl = WordNetLemmatizer()

email_lemma = [
    [wnl.lemmatize(token.lower()) for token in email]
    for email in email_clean_tokens
]
    
# Discard one-character tokens; only lemmas of length two or more are kept.
email_clean = [
    [token for token in email if len(token) >= 2]
    for email in email_lemma
]

In [22]:
# Build the gensim vocabulary from the cleaned emails, then convert every
# email to its bag-of-words form with that vocabulary.
dictionary = gensim.corpora.Dictionary(email_clean)
corpus = list(map(dictionary.doc2bow, email_clean))

In [23]:
# Topic counts to try. The wider sweep on the next line is left disabled;
# only two sizes are trained here.
# number_topics = [5, 10, 15, 20, 25, 30, 35, 40, 45, 50]
number_topics = [5, 10]

for n in number_topics:
    # Train one LDA model per topic count; `lda_model` is rebound on every
    # iteration, so only the last model remains after the loop.
    lda_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=n, id2word=dictionary, passes=10)

    print('#### Model with', n, 'topics ####')

    topics = lda_model.show_topics(num_topics=n, num_words=10, log=False, formatted=False)

    # Topics are numbered from 1 in the order show_topics returns them.
    for topic_id, topic in enumerate(topics, start=1):
        # topic[1] holds the topic's term entries and entry[0] is the word.
        # Each word is followed by a space, so the printed line ends in one.
        string_words = ''.join(entry[0] + ' ' for entry in topic[1])
        print('topic #', topic_id, ':', string_words)

#### Model with 5 topics ####
topic # 1 : see good work get think know haiti speech also would 
topic # 2 : obama state said would american one president government new year 
topic # 3 : call state gov tomorrow talk b6 cheryl ok huma today 
topic # 4 : secretary office state department meeting room arrive route en depart 
topic # 5 : party election vote sid percent republican conservative reuters poll voter 
#### Model with 10 topics ####
topic # 1 : state woman people government united work security country effort world 
topic # 2 : call tomorrow talk get see know want today back would 
topic # 3 : state b6 doc benghazi information case department agreement part dept 
topic # 4 : china chinese india eu europe year branch blair wjc would 
topic # 5 : office secretary meeting room state department arrive route en depart 
topic # 6 : ireland northern uup robinson book dup sbwhoeop unionist website kurt 
topic # 7 : right ap tea american beck reuters iran party israeli skousen 
topic # 8 