## About
Topic modelling using LDA. It is unsupervised learning statistical method for topic modelling.


Dataset - https://www.kaggle.com/datasets/gauravduttakiit/npr-data



In [1]:
import pandas as pd
import re
from sklearn.feature_extraction import text
from sklearn.decomposition import LatentDirichletAllocation
import nltk
import gensim
import spacy
from sklearn.model_selection import GridSearchCV
import numpy as np



In [3]:
data = pd.read_csv('/content/drive/MyDrive/Datasets/npr.csv')


In [4]:
data.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [5]:
# cleaning the data
list_data = data.Article.values.tolist()
# Remove new line characters
list_data = [re.sub(r'\s+', ' ', sent) for sent in list_data]
print(data[:1])

                                             Article
0  In the Washington of 2016, even when the polic...


In [6]:
# tokenising each sentence into words after ignoring puncts
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True flag is used to remove punctuations.

words_list = list(sent_to_words(list_data))

In [7]:
print(words_list[0][:])

['in', 'the', 'washington', 'of', 'even', 'when', 'the', 'policy', 'can', 'be', 'bipartisan', 'the', 'politics', 'cannot', 'and', 'in', 'that', 'sense', 'this', 'year', 'shows', 'little', 'sign', 'of', 'ending', 'on', 'dec', 'when', 'president', 'obama', 'moved', 'to', 'sanction', 'russia', 'over', 'its', 'alleged', 'interference', 'in', 'the', 'election', 'just', 'concluded', 'some', 'republicans', 'who', 'had', 'long', 'called', 'for', 'similar', 'or', 'more', 'severe', 'measures', 'could', 'scarcely', 'bring', 'themselves', 'to', 'approve', 'house', 'speaker', 'paul', 'ryan', 'called', 'the', 'obama', 'measures', 'appropriate', 'but', 'also', 'overdue', 'and', 'prime', 'example', 'of', 'this', 'administration', 'ineffective', 'foreign', 'policy', 'that', 'has', 'left', 'america', 'weaker', 'in', 'the', 'eyes', 'of', 'the', 'world', 'other', 'gop', 'leaders', 'sounded', 'much', 'the', 'same', 'theme', 'we', 'have', 'been', 'urging', 'president', 'obama', 'for', 'years', 'to', 'take',

In [8]:
#lemmatisation -> We'll be able to reduce number of unique words which will result in dense document-word matrix.

def lemmatise(texts,allowed_tags = ['NOUN','ADJ','VERB','ADV']):
    out = []
    for tokens in texts:
        doc = nlp(" ".join(tokens))
        out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_tags]))
    return out

In [9]:
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


In [10]:
lemma_words = lemmatise(words_list,allowed_tags=['NOUN','VERB'])


In [11]:
print(lemma_words)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
count_vectorizer = text.CountVectorizer(max_df=0.9, min_df=0.1, stop_words='english')
count_vector_matrix = count_vectorizer.fit_transform(lemma_words)

In [13]:
count_vector_matrix

<11992x297 sparse matrix of type '<class 'numpy.int64'>'
	with 690812 stored elements in Compressed Sparse Row format>

In [14]:
LDA = LatentDirichletAllocation(n_components=20, random_state=54,max_iter=20, learning_method='online',batch_size=128, evaluate_every=-1,n_jobs=-1)

lda_output= LDA.fit(count_vector_matrix)

In [15]:
# evaluating model's performance
print("Log Likelihood score :: {}".format(LDA.score(count_vector_matrix)))

#evaluate model perplexity
print("Perplexity :: {}".format(LDA.perplexity(count_vector_matrix)))

Log Likelihood score :: -6654919.38444852
Perplexity :: 195.90863838269587


For a better model, Log likelohood should be high and perplexity should be low.

#### Use GridSearchCV to filter out the best params for the model.

In [16]:
print(LDA.get_params())

{'batch_size': 128, 'doc_topic_prior': None, 'evaluate_every': -1, 'learning_decay': 0.7, 'learning_method': 'online', 'learning_offset': 10.0, 'max_doc_update_iter': 100, 'max_iter': 20, 'mean_change_tol': 0.001, 'n_components': 20, 'n_jobs': -1, 'perp_tol': 0.1, 'random_state': 54, 'topic_word_prior': None, 'total_samples': 1000000.0, 'verbose': 0}


In [18]:
#using grid search_CV

params_dict = {
    'n_components':[10,20],
    'learning_decay':[0.1,0.4,0.9]
}
LDA = LatentDirichletAllocation(max_iter=10, learning_method='online',random_state=42)

#grid search
model = GridSearchCV(LDA, param_grid=params_dict)

model.fit(count_vector_matrix)


GridSearchCV(estimator=LatentDirichletAllocation(learning_method='online',
                                                 random_state=42),
             param_grid={'learning_decay': [0.1, 0.4, 0.9],
                         'n_components': [10, 20]})

In [19]:
#printing out the best parameter
best_model = model.best_estimator_

# best params
print("Model's Params:", model.best_params_)

#best log likelihood score
print("Best Log Likelihood score :: {}".format(model.best_score_))

# perplexity
print("Perplexity {}".format(best_model.perplexity(count_vector_matrix)))

Model's Params: {'learning_decay': 0.4, 'n_components': 10}
Best Log Likelihood score :: -1342259.2498857845
Perplexity 197.54357244114308


In [21]:
# method to predict top n keywords for each topic

def show_topics(vectorizer=count_vectorizer, lda_model=best_model, n_words=20):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [22]:
topic_keywords = show_topics(vectorizer=count_vectorizer, lda_model=best_model, n_words=20)




In [23]:
#creating a dataframe to visualise
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14,Word 15,Word 16,Word 17,Word 18,Word 19
Topic 0,woman,study,use,people,research,test,percent,risk,case,health,year,result,cause,group,publish,report,suggest,include,problem,lead
Topic 1,school,student,kid,program,learn,year,parent,child,state,help,need,work,spend,lot,study,read,use,test,start,group
Topic 2,child,family,parent,mother,woman,home,live,kid,life,day,leave,community,house,year,bear,help,age,tell,school,country
Topic 3,company,food,make,year,water,use,world,country,people,work,business,city,come,time,sell,grow,day,way,build,need
Topic 4,law,state,government,case,court,right,rule,report,issue,official,administration,decision,use,information,agency,include,country,make,year,statement
Topic 5,health,people,care,percent,pay,year,program,plan,job,work,cost,state,money,service,need,help,make,benefit,policy,increase
Topic 6,people,think,know,make,time,way,thing,year,come,want,work,feel,life,story,ve,tell,write,look,lot,book
Topic 7,trump,campaign,president,news,tell,election,medium,make,candidate,policy,question,time,week,country,statement,comment,ask,speak,think,interview
Topic 8,vote,state,election,win,party,candidate,year,percent,campaign,support,people,point,make,lose,country,day,need,president,lead,run
Topic 9,report,police,people,city,attack,tell,kill,year,accord,force,death,man,war,day,video,group,time,country,news,die


Reference - https://yanlinc.medium.com/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6