# I. Import Necessary Dependencies and Settings

In [1]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import re
import nltk
import spacy
import gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
pd.options.display.max_colwidth = 200
%matplotlib inline

# II.Dataset Import

In [2]:
# Load the dataset. `on_bad_lines='warn'` is the supported replacement for
# `error_bad_lines=False` (deprecated in pandas 1.3, removed in 2.0): rows the
# parser cannot read are skipped with a warning instead of aborting the load.
df = pd.read_csv('UnlistedEnglish1.csv', on_bad_lines='warn')
df.head()

Unnamed: 0,score,text
0,1,rip pang lakad damit lang pang bahay oustduterte
1,2,ever since rise fall dictator ferdinand marcos student youth forefront tyranny fascism corruption john robert de castro amp natalie nicole julian aster lopez read neverfor...
2,2,ika apatnaput siyam taon nang ipinataw bata militar nakikiisa panday sining bacoor handog tula alalahanin nangahas biktima malagim unos diktadurang marcos neverforget marcos...
3,1,neverforget neveragain oustduterte impeachduterte
4,2,uy gagi wag mong palampasin araw upang aralin rise fall marcos dictatorship dalo kayo ed neveragain neverforget oustduterte


# III.Data Cleaning

In [3]:
# Clean every document of the `text` column with three regex passes, applied
# in a fixed order to each document in turn.
data = []
for sent in df.text.values.tolist():
    # 1. Drop e-mail-like tokens: any whitespace-delimited run containing "@"
    #    (plus one trailing whitespace character, if present).
    sent = re.sub(r'\S*@\S*\s?', '', sent)
    # 2. Collapse every whitespace run (newlines included) to a single space.
    sent = re.sub(r'\s+', ' ', sent)
    # 3. Strip single quotes so they do not split or pollute tokens.
    sent = re.sub(r"\'", "", sent)
    data.append(sent)
print(data[:1])

[' rip pang lakad damit lang pang bahay oustduterte ']


### Tokenize

In [4]:
def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one token list per document.

    Each item of ``sentences`` is coerced with ``str()`` and handed to
    ``gensim.utils.simple_preprocess``.  ``deacc=True`` asks gensim to strip
    accent marks from the tokens; punctuation is discarded by the tokenizer
    itself, independently of that flag.
    """
    for raw_doc in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_doc), deacc=True)
        yield tokens
# Materialize the lazy tokenizer: one list of tokens per cleaned document
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])

[['rip', 'pang', 'lakad', 'damit', 'lang', 'pang', 'bahay', 'oustduterte']]


### Lemmatization

In [5]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), pipeline=None):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Parameters
    ----------
    texts : iterable of list of str
        Tokenized documents; each token list is joined with single spaces
        before being passed to the pipeline.
    allowed_postags : collection of str
        Coarse POS tags (``token.pos_``) to keep. The default is an immutable
        tuple so it cannot be altered between calls.
    pipeline : callable, optional
        Callable that maps a string to an iterable of tokens exposing
        ``lemma_`` and ``pos_``. When omitted, the module-level ``nlp``
        object is looked up at call time, so it must have been loaded
        before this function is called.

    Returns
    -------
    list of str
        One space-joined string of lemmas per input document, in input order.
    """
    if pipeline is None:
        pipeline = nlp  # resolved lazily: `nlp` is created in a later cell

    texts_out = []
    for sent in texts:
        doc = pipeline(" ".join(sent))
        # Filter the '-PRON-' placeholder out instead of replacing it with an
        # empty string, which used to leave stray double spaces in the output.
        lemmas = [
            token.lemma_
            for token in doc
            if token.pos_ in allowed_postags and token.lemma_ != '-PRON-'
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out

In [6]:
# Load the small English spaCy pipeline. The dependency parser and the named
# entity recognizer are disabled: only POS tags and lemmas are read
# downstream, so running them would only slow the pass over the corpus.
# NOTE(review): the sample rows shown above mix Tagalog and English; an
# English-only model will tag and lemmatize the Tagalog tokens unreliably.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
# Lemmatize, keeping only nouns and verbs
data_lemmatized = lemmatization(data_words, allowed_postags=['NOUN', 'VERB'])
print(data_lemmatized[:2])

['oustduterte', 'rise fall dictator student youth forefront corruption read neverforget oustduterte']


# IV.Create the Document-Word matrix

In [7]:
# Bag-of-words model over the lemmatized documents
vectorizer = CountVectorizer(
    analyzer='word',                  # features are word tokens
    token_pattern='[a-zA-Z0-9]{3,}',  # a token is a run of 3 or more alphanumerics
    lowercase=True,                   # fold case before tokenizing
    stop_words='english',             # drop scikit-learn's built-in English stop words
    min_df=10,                        # ignore terms found in fewer than 10 documents
    max_features=50000,               # cap the vocabulary at the 50,000 most frequent terms
)
# Document-term count matrix (rows: documents, columns: vocabulary terms)
data_vectorized = vectorizer.fit_transform(data_lemmatized)

# V.Build LDA model with sklearn

In [8]:
# Baseline LDA model with a fixed number of topics
lda_model = LatentDirichletAllocation(
    n_components=5,            # number of topics
    learning_method='online',  # mini-batch updates instead of full-batch
    batch_size=128,            # documents per mini-batch
    max_iter=10,               # maximum passes over the corpus
    evaluate_every=-1,         # never evaluate perplexity while fitting
    random_state=100,          # fixed seed for reproducible topics
    n_jobs=-1,                 # use all available CPUs
)
# Fit the model and keep the document-topic weights for the training corpus
lda_output = lda_model.fit_transform(data_vectorized)
print(lda_model)  # model attributes

LatentDirichletAllocation(learning_method='online', n_components=5, n_jobs=-1,
                          random_state=100)


In [9]:
# Full parameter listing of the baseline configuration. This expression builds
# a separate, unfitted estimator that is only displayed as the cell output and
# then discarded; `lda_model` is not affected by it.
LatentDirichletAllocation(
    batch_size=128,
    doc_topic_prior=None,
    evaluate_every=-1,
    learning_decay=0.7,
    learning_method='online',
    learning_offset=10.0,
    max_doc_update_iter=100,
    max_iter=10,
    mean_change_tol=0.001,
    n_components=5,
    n_jobs=-1,
    perp_tol=0.1,
    random_state=100,
    topic_word_prior=None,
    total_samples=1000000.0,
    verbose=0,
)

LatentDirichletAllocation(learning_method='online', n_components=5, n_jobs=-1,
                          random_state=100)

# VI.Diagnose model performance with perplexity and log-likelihood

In [10]:
# Log-likelihood of the training corpus under the baseline model: higher is better
log_likelihood = lda_model.score(data_vectorized)
print("Log Likelihood: ", log_likelihood)
# Perplexity = exp(-1. * log-likelihood per word): lower is better
perplexity = lda_model.perplexity(data_vectorized)
print("Perplexity: ", perplexity)
# Full parameter dictionary of the fitted estimator
print(lda_model.get_params())

Log Likelihood:  -38029.58362581908
Perplexity:  196.30837118342106
{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 10, 'mean_change_tol': 0.001, 'n_components': 5, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 100, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [11]:
# Hyper-parameter grid: every combination of topic count and learning decay
search_params = {
    'n_components': [5, 6, 7, 8, 9],
    'learning_decay': [.5, .7, .9],
}
# Base estimator whose remaining settings are shared by all candidates
lda = LatentDirichletAllocation(
    learning_method='online',
    learning_offset=50.,
    max_iter=5,
    random_state=0,
)
# No `scoring` is given, so candidates are ranked with the estimator's own
# `score` method, evaluated through GridSearchCV's default cross-validation.
model = GridSearchCV(estimator=lda, param_grid=search_params)
# Run the search
model.fit(data_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 learning_offset=50.0,
                                                 max_iter=5, random_state=0),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [5, 6, 7, 8, 9]})

In [12]:
# Winning estimator of the grid search
best_lda_model = model.best_estimator_
# Hyper-parameters that produced it
best_params = model.best_params_
print("Best Model's Params: ", best_params)
# `best_score_` is the mean cross-validated score of the winning candidate,
# so it is not on the same scale as a score computed on the whole corpus.
best_cv_score = model.best_score_
print("Best Log Likelihood Score: ", best_cv_score)
# Perplexity of the winning estimator on the full corpus
best_perplexity = best_lda_model.perplexity(data_vectorized)
print("Model Perplexity: ", best_perplexity)

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -9695.10994052303
Model Perplexity:  209.5061597009758


# VII.Dominant topic

In [13]:
# Document-topic matrix: one row per document, one column per topic
lda_output = best_lda_model.transform(data_vectorized)
# Column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]
# Index names, sized from the matrix itself so the labels always match its rows
docnames = ["Doc" + str(i) for i in range(lda_output.shape[0])]
# Weights are rounded to 2 decimals for display only
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)
# Pick the dominant topic from the unrounded weights: after rounding, two
# close weights can tie and argmax would then return the lower topic index
# rather than the genuinely larger one.
# NOTE(review): rows whose weights are all equal (see the 0.2/0.2/0.2/0.2/0.2
# rows in the output) still fall back to topic 0 - confirm whether such
# documents should be labelled at all.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic
# Styling
def color_green(val):
    """Return a CSS colour rule: green when ``val`` exceeds 0.1, black otherwise."""
    if val > .1:
        return 'color: {col}'.format(col='green')
    return 'color: {col}'.format(col='black')
def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    if val > .1:
        weight = 700
    else:
        weight = 400
    return 'font-weight: {weight}'.format(weight=weight)
# Apply the highlighting to the topic-weight columns only. The 0.1 threshold
# is meaningful for weights, not for the integer `dominant_topic` index, which
# would otherwise be coloured by whether the index happens to be 0 or not.
df_document_topics = (
    df_document_topic.head(15)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.1,0.1,0.6,0.1,0.1,2
Doc1,0.03,0.03,0.43,0.03,0.48,4
Doc2,0.1,0.1,0.6,0.1,0.1,2
Doc3,0.07,0.07,0.73,0.07,0.07,2
Doc4,0.04,0.04,0.84,0.04,0.04,2
Doc5,0.2,0.2,0.2,0.2,0.2,0
Doc6,0.07,0.07,0.07,0.07,0.73,4
Doc7,0.03,0.03,0.15,0.77,0.03,3
Doc8,0.2,0.2,0.2,0.2,0.2,0
Doc9,0.07,0.07,0.07,0.07,0.73,4


In [14]:
# Topic-keyword matrix: one row per topic, one column per vocabulary term
df_topic_keywords = pd.DataFrame(best_lda_model.components_)
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
# Assign column and index labels
df_topic_keywords.columns = feature_names
df_topic_keywords.index = topicnames
# View
df_topic_keywords.head()

Unnamed: 0,action,add,address,admin,administration,agree,allege,allow,amp,anibersaryo,...,wake,want,war,watch,way,win,work,worker,world,year
Topic0,0.202724,8.429892,0.202612,0.200575,0.20228,11.437084,12.736883,0.205194,0.203374,0.200283,...,0.200769,31.792587,0.203731,0.203978,0.204528,0.202887,0.202218,0.203243,0.203283,0.20274
Topic1,0.204075,0.204099,9.896207,0.200922,23.195049,0.202278,0.205569,11.998768,0.201364,0.200474,...,0.20242,0.203185,28.223478,0.20298,0.204837,22.508604,0.201835,0.201251,0.204475,40.147667
Topic2,0.201967,0.200448,0.200019,0.201499,0.201001,0.200326,0.202138,0.201986,0.659488,16.503093,...,0.200257,0.202738,0.200835,0.201285,0.200702,0.201058,0.201679,11.422106,0.201326,0.2026
Topic3,8.665648,0.200877,0.201132,0.202665,0.203976,0.201159,0.201747,0.200286,83.077549,0.201837,...,8.808743,0.204539,0.20372,18.072496,24.961097,0.202665,0.203218,0.202525,0.20177,12.175404
Topic4,0.201495,0.200655,0.202016,13.342689,0.204564,0.203306,0.203828,0.204975,0.205942,0.200724,...,0.201529,15.497833,0.202983,0.202288,0.203604,0.202892,28.74175,0.201344,10.4505,0.202735


### Get the top 100 keywords for each topic:


In [15]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=100):
    """Return the highest-weighted vocabulary terms of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer
        Supplies the vocabulary, in the column order of ``components_``.
    lda_model : fitted topic model
        Its ``components_`` rows hold the per-topic term weights.
    n_words : int
        Number of top terms to return per topic.

    Returns
    -------
    list of numpy.ndarray
        One array of terms per topic, sorted by decreasing weight.

    Notes
    -----
    The default ``vectorizer`` and ``lda_model`` are captured when this
    function is defined, so they refer to the objects that existed at that
    moment; pass both explicitly to inspect any other model.
    """
    # `get_feature_names()` was removed in scikit-learn 1.2; use the
    # replacement when it exists and fall back on older releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        keywords = np.asarray(vectorizer.get_feature_names_out())
    else:
        keywords = np.asarray(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate the weights so the ascending argsort yields the largest first
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords
# Top 100 keywords of every topic of the tuned model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=100)
# Topic-keywords dataframe: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_keywords = df_topic_keywords.shape
df_topic_keywords.columns = ['Word ' + str(rank) for rank in range(n_keywords)]
df_topic_keywords.index = ['Topic ' + str(topic) for topic in range(n_topics)]
# Show every column instead of eliding the middle of the wide frame
pd.set_option('display.max_columns', None)
df_topic_keywords.head()


Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19,Word 20,Word 21,Word 22,Word 23,Word 24,Word 25,Word 26,Word 27,Word 28,Word 29,Word 30,Word 31,Word 32,Word 33,Word 34,Word 35,Word 36,Word 37,Word 38,Word 39,Word 40,Word 41,Word 42,Word 43,Word 44,Word 45,Word 46,Word 47,Word 48,Word 49,Word 50,Word 51,Word 52,Word 53,Word 54,Word 55,Word 56,Word 57,Word 58,Word 59,Word 60,Word 61,Word 62,Word 63,Word 64,Word 65,Word 66,Word 67,Word 68,Word 69,Word 70,Word 71,Word 72,Word 73,Word 74,Word 75,Word 76,Word 77,Word 78,Word 79,Word 80,Word 81,Word 82,Word 83,Word 84,Word 85,Word 86,Word 87,Word 88,Word 89,Word 90,Word 91,Word 92,Word 93,Word 94,Word 95,Word 96,Word 97,Word 98,Word 99
Topic 0,respect,vote,country,dilawan,use,test,run,health,help,feel,covid,follow,leni,ask,know,leader,say,court,want,let,read,make,bbmforpresident,people,lead,care,register,presidente,continue,probe,vice,mother,news,love,eleksyon,friend,voter,question,opinion,allege,protect,agree,filipino,hate,cross,kit,magige,anniversary,proof,member,victim,place,add,supply,duqueresign,decide,duterterapist,dutertemanyak,philippine,problem,issue,include,oustduterte,report,support,case,choice,law,duterte,month,presidency,candidate,crime,reconsider,testing,halalan,bbm,order,need,nation,malake,choose,plan,look,president,freedom,record,fund,allow,result,family,history,start,drug,way,think,opposition,hour,position,office
Topic 1,presidency,kayo,election,face,hope,case,file,problem,make,drug,year,come,war,shield,doh,class,administration,pre,win,policy,halalan,sya,kill,position,wag,build,result,bring,reconsider,change,apologist,leave,date,investigation,nation,defend,pwede,plan,choose,tandem,man,month,return,kaso,point,pilipino,allow,address,nating,icc,learn,pag,crime,narrative,hour,campaign,filipino,duterte,office,decide,report,victim,include,run,decision,philippine,choice,ask,vice,candidate,mayor,covid,testing,court,mind,oustduterte,allege,read,mother,state,respect,continue,say,freedom,record,probe,test,proof,way,follow,world,time,help,order,protect,think,add,action,dictatorship,happen
Topic 2,neveragain,oustduterte,president,neverforget,gale,story,government,daw,oustdutertenow,journalist,history,mayor,regime,dutertewakasan,budget,anibersaryo,bayan,pala,peace,magnanakaw,hear,lie,vice,bata,worker,order,diktadurya,sign,number,let,remember,meet,patuloy,student,statement,dictatorship,level,supply,amp,fight,marcose,muna,truth,debate,support,slap,link,official,try,life,mind,law,run,freedom,anniversary,campaign,duqueresign,talaga,fund,bansa,niyo,protect,change,pag,join,decide,dictator,love,record,elcac,read,proof,nating,pilipino,say,view,malake,victim,want,office,member,year,day,hope,response,today,deserve,come,time,look,choose,mean,corruption,health,kill,allege,state,pina,interview,allow
Topic 3,law,time,choice,people,amp,bbm,talaga,life,report,include,day,happen,pilipina,aquino,marcose,tell,look,way,power,start,talk,wait,truth,dami,family,pre,view,watch,mind,believe,issue,speak,pina,niyo,interview,deserve,candidacy,year,response,decide,stand,fight,list,person,development,protest,police,lose,hirap,link,thing,kame,wake,know,action,state,guy,join,today,toni,freedom,number,muna,dilawan,story,presidency,gale,remember,try,respect,leave,problem,candidate,bbmforpresident,filipino,victim,return,case,think,lol,choose,hope,support,record,anniversary,dictator,campaign,say,hear,opinion,malake,peace,hahaha,court,member,come,feel,want,budget,philippine
Topic 4,duterte,say,candidate,support,need,try,think,muna,oust,opposition,philippine,supporter,thank,corruption,work,run,official,slap,end,bansa,naman,lol,stop,fact,group,puro,read,enabler,post,campaign,hahaha,testing,want,mean,fund,forget,dictator,debate,record,admin,imagine,davao,hahahaha,decision,office,thing,attack,malake,dictatorship,elcac,world,statement,politic,twitter,level,remember,lng,today,state,patuloy,crime,month,freedom,student,fight,vote,journalist,opinion,let,choice,story,presidency,respect,speak,family,return,mind,life,oustduterte,court,gale,probe,amp,mayor,budget,president,continue,number,allow,pre,list,know,candidacy,administration,vice,drug,case,change,allege,people


### Next, put the 5 topics we inferred into the dataframe.


In [16]:
# Hand-assigned labels, one per topic row and in row order. The list has
# exactly five entries, so it only fits a five-topic model.
Topics = [
    "Election",
    "Government Policies",
    "Recollection",
    "Filipino Political News",
    "Politican Stance",
]
df_topic_keywords["Topics"] = Topics
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19,Word 20,Word 21,Word 22,Word 23,Word 24,Word 25,Word 26,Word 27,Word 28,Word 29,Word 30,Word 31,Word 32,Word 33,Word 34,Word 35,Word 36,Word 37,Word 38,Word 39,Word 40,Word 41,Word 42,Word 43,Word 44,Word 45,Word 46,Word 47,Word 48,Word 49,Word 50,Word 51,Word 52,Word 53,Word 54,Word 55,Word 56,Word 57,Word 58,Word 59,Word 60,Word 61,Word 62,Word 63,Word 64,Word 65,Word 66,Word 67,Word 68,Word 69,Word 70,Word 71,Word 72,Word 73,Word 74,Word 75,Word 76,Word 77,Word 78,Word 79,Word 80,Word 81,Word 82,Word 83,Word 84,Word 85,Word 86,Word 87,Word 88,Word 89,Word 90,Word 91,Word 92,Word 93,Word 94,Word 95,Word 96,Word 97,Word 98,Word 99,Topics
Topic 0,respect,vote,country,dilawan,use,test,run,health,help,feel,covid,follow,leni,ask,know,leader,say,court,want,let,read,make,bbmforpresident,people,lead,care,register,presidente,continue,probe,vice,mother,news,love,eleksyon,friend,voter,question,opinion,allege,protect,agree,filipino,hate,cross,kit,magige,anniversary,proof,member,victim,place,add,supply,duqueresign,decide,duterterapist,dutertemanyak,philippine,problem,issue,include,oustduterte,report,support,case,choice,law,duterte,month,presidency,candidate,crime,reconsider,testing,halalan,bbm,order,need,nation,malake,choose,plan,look,president,freedom,record,fund,allow,result,family,history,start,drug,way,think,opposition,hour,position,office,Election
Topic 1,presidency,kayo,election,face,hope,case,file,problem,make,drug,year,come,war,shield,doh,class,administration,pre,win,policy,halalan,sya,kill,position,wag,build,result,bring,reconsider,change,apologist,leave,date,investigation,nation,defend,pwede,plan,choose,tandem,man,month,return,kaso,point,pilipino,allow,address,nating,icc,learn,pag,crime,narrative,hour,campaign,filipino,duterte,office,decide,report,victim,include,run,decision,philippine,choice,ask,vice,candidate,mayor,covid,testing,court,mind,oustduterte,allege,read,mother,state,respect,continue,say,freedom,record,probe,test,proof,way,follow,world,time,help,order,protect,think,add,action,dictatorship,happen,Government Policies
Topic 2,neveragain,oustduterte,president,neverforget,gale,story,government,daw,oustdutertenow,journalist,history,mayor,regime,dutertewakasan,budget,anibersaryo,bayan,pala,peace,magnanakaw,hear,lie,vice,bata,worker,order,diktadurya,sign,number,let,remember,meet,patuloy,student,statement,dictatorship,level,supply,amp,fight,marcose,muna,truth,debate,support,slap,link,official,try,life,mind,law,run,freedom,anniversary,campaign,duqueresign,talaga,fund,bansa,niyo,protect,change,pag,join,decide,dictator,love,record,elcac,read,proof,nating,pilipino,say,view,malake,victim,want,office,member,year,day,hope,response,today,deserve,come,time,look,choose,mean,corruption,health,kill,allege,state,pina,interview,allow,Recollection
Topic 3,law,time,choice,people,amp,bbm,talaga,life,report,include,day,happen,pilipina,aquino,marcose,tell,look,way,power,start,talk,wait,truth,dami,family,pre,view,watch,mind,believe,issue,speak,pina,niyo,interview,deserve,candidacy,year,response,decide,stand,fight,list,person,development,protest,police,lose,hirap,link,thing,kame,wake,know,action,state,guy,join,today,toni,freedom,number,muna,dilawan,story,presidency,gale,remember,try,respect,leave,problem,candidate,bbmforpresident,filipino,victim,return,case,think,lol,choose,hope,support,record,anniversary,dictator,campaign,say,hear,opinion,malake,peace,hahaha,court,member,come,feel,want,budget,philippine,Filipino Political News
Topic 4,duterte,say,candidate,support,need,try,think,muna,oust,opposition,philippine,supporter,thank,corruption,work,run,official,slap,end,bansa,naman,lol,stop,fact,group,puro,read,enabler,post,campaign,hahaha,testing,want,mean,fund,forget,dictator,debate,record,admin,imagine,davao,hahahaha,decision,office,thing,attack,malake,dictatorship,elcac,world,statement,politic,twitter,level,remember,lng,today,state,patuloy,crime,month,freedom,student,fight,vote,journalist,opinion,let,choice,story,presidency,respect,speak,family,return,mind,life,oustduterte,court,gale,probe,amp,mayor,budget,president,continue,number,allow,pre,list,know,candidacy,administration,vice,drug,case,change,allege,people,Politican Stance
