## Topic Modelling

We have downloaded some speeches by B. Obama and M. Romney made during US Presidential election 2012. The speeches are uploaded on GitHUB and we would try to find some imortant topics each of their speeches consisted of. We will do Topic modelling using Gensim and visualize these topics using a womderful package called pyLDAvis.

**NOTE:** If pyLDAvis visualization diagrams are NOT visible in this notebook, refer to this jupyter nbviewer version 
https://nbviewer.jupyter.org/github/Rajesh-Nair/Text-modelling-on-speeches/blob/master/Topic%20Modelling%20using%20gensim%20%28LSI%20and%20LDA%29.ipynb


In [30]:
# Find folders and text under a given path
import os

# For dataframe/tables
import pandas as pd

# NLTK package
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Gensim
from gensim import corpora
from gensim import models

# Visualization
import pyLDAvis
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [31]:
# Text document folder
folder = 'speeches'

# Stop words set
ignorewords = set(stopwords.words("english"))

# Lemmatizer
lemmatizer = WordNetLemmatizer().lemmatize

#define dictionary  and corpus
dictionary = corpora.Dictionary()
corpus = list()

# Clean text
# 1. Word Tokenize text
# 2. Ignore stopwords and non alpha characters, words less than 4 in length
# 3. conver to lower case
# 4. Lemmatize the words
# 5. Update/Add to Dictionary
# 6. Create Bag of Words
def clean_corpus(text) :
    textlist = list()
    textlist.append([lemmatizer(word.lower()) for word in word_tokenize(text) \
                       if word not in ignorewords and word.isalpha() and len(word) > 4 ])
    dictionary.add_documents(textlist)
    for text in textlist :
        corpus.append(dictionary.doc2bow(text, allow_update=True))    
    return(None)



In [32]:
# Read, clean and convert the speeches in to Tf-idf
corpus_tfidf = dict()
for fld in  os.listdir(folder) :
    corpus = [] # Re-Initialize    
    for file in os.listdir(folder + '\\' + fld) :
        txtfile = folder + '\\' + fld + '\\' + file
        with open(txtfile, 'r',errors='ignore') as fobj :        
            clean_corpus(fobj.read())       
    # Tf-idf model
    Tfidf = models.TfidfModel(corpus, smartirs='ntc')
    corpus_tfidf[fld] = Tfidf[corpus]    

# Store the dictionary
corpus_dict = dictionary

In [33]:
# LSI model
lsi = dict()
num_topics = 5 
num_words = 15

for fld in corpus_tfidf.keys() :
    lsi[fld] = models.LsiModel(corpus_tfidf[fld], id2word=corpus_dict, num_topics=num_topics)


In [34]:
# Display the topics and words associated
for fld in corpus_tfidf.keys() :    
    print('*'*80)
    print(fld.upper())
    print(pd.DataFrame({"Topic "+ str(i+1):[topic[0] for topic in lsi[fld].show_topic(i,num_words)] \
                                    for i in range(num_topics)}, index = [i+1 for i in range(num_words)] ))

********************************************************************************
ROMNEY
    Topic 1     Topic 2       Topic 3     Topic 4       Topic 5
1   liberal       found         reply       peace      applause
2     found   searching        cancel      regime       dreamer
3    crisis   requested         leave        ally         plant
4   program     perhaps        falcon     dreamer        saving
5    school     nothing         berta      saving      remember
6    budget       could       library      weapon        little
7     going     liberal        valley    applause  unidentified
8    little       going      provided     veteran      audience
9    dollar      crisis         tweet       plant         going
10   nation      budget          view     iranian     attacking
11  percent       reply  presidential       build    imaginable
12    build    strategy      applause        wage         meant
13  company      little        ronald  capability    unfairness
14   always  str

In [35]:
# LDA model
lda = dict()
num_topics = 5 
num_words = 15
num_of_passes = 20

for fld in corpus_tfidf.keys() :
    lda[fld] = models.LdaModel(corpus_tfidf[fld], id2word=corpus_dict, num_topics=num_topics, passes = num_of_passes)


In [36]:
# Display the topics and words associated
for fld in corpus_tfidf.keys() :  
    print('*'*80)
    print(fld.upper())
    print(pd.DataFrame({"Topic "+ str(i+1):[topic[0] for topic in lda[fld].show_topic(i,num_words)] \
                                    for i in range(num_topics)}, index = [i+1 for i in range(num_words)] ))

********************************************************************************
ROMNEY
         Topic 1    Topic 2       Topic 3       Topic 4   Topic 5
1       applause      found         reply     christian   comment
2        liberal  searching        cancel         jerry  remember
3          going  requested         leave  unidentified    closed
4        teacher    perhaps         tweet        christ    crisis
5         school      berta          view         chuck     peace
6   unidentified     falcon       prevail       logical   liberal
7        student    nothing      indicate        truett    weapon
8         choice      could       ethical       worldly  strategy
9          build      kerry        regime  christianity   mission
10       percent      derek       citadel      cheerful    regime
11        little    vietnam      granholm     spiritual     three
12       library     desert     criticism       trivial      ally
13        always    winning       israeli       falwel

In [37]:
# Visualize - LDA using pyLDAvis
vis = dict()
for fld in corpus_tfidf.keys() :
    vis[fld] = pyLDAvis.gensim.prepare(lda[fld],corpus=corpus_tfidf[fld],dictionary=corpus_dict)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [38]:
# Print topics in Romney's speeches
print(list(corpus_tfidf.keys())[0])
vis[list(corpus_tfidf.keys())[0]]

romney


In [39]:
# Print topics in Obama's speeches
print(list(corpus_tfidf.keys())[1])
vis[list(corpus_tfidf.keys())[1]]

obama
