# Topic Modelling with LDA

### Imports

In [1]:
import pandas as pd
import os
import collections
import csv
import logging
import numpy as np
import datetime as datetime
import re
import pickle
from pprint import pprint
from tqdm import tqdm
import os

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [2]:
df = pd.read_pickle('./data/bigrams/df_processed_bigrams_top2vec_trg.pickle')
#df = pd.read_pickle('./data/bigrams/df_processed_bigrams.pickle')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365200 entries, 0 to 369046
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   author             181507 non-null  object        
 1   date               365200 non-null  datetime64[ns]
 2   domain             365200 non-null  object        
 3   title              365115 non-null  object        
 4   url                365200 non-null  object        
 5   content            365200 non-null  object        
 6   topic_area         365200 non-null  object        
 7   content_processed  365200 non-null  object        
dtypes: datetime64[ns](1), object(7)
memory usage: 25.1+ MB


In [4]:
df.head(1)

Unnamed: 0,author,date,domain,title,url,content,topic_area,content_processed
0,Thomas Hughes,2020-01-02,marketbeat,Three Industrial Giants You Should Own In 2020,https://www.marketbeat.com/originals/three-ind...,With the end of the year just around the corne...,business,end year corner past time think positioning fo...


In [5]:
def log_newline(self, how_many_lines=1):
    file_handler = None
    if self.handlers:
        file_handler = self.handlers[0]

    # Switch formatter, output a blank line
    file_handler.setFormatter(self.blank_formatter)
    for i in range(how_many_lines):
        self.info('')

    # Switch back
    file_handler.setFormatter(self.default_formatter)

def logger_w2v():
    
    log_file = os.path.join('./data', 'word2vec.log')
    print('log file location: ', log_file)
    
    log_format= '%(asctime)s - %(levelname)s - [%(module)s]\t%(message)s'
    formatter = logging.Formatter(fmt=(log_format))
    
    fhandler = logging.FileHandler(log_file)
    fhandler.setFormatter(formatter)
    
    logger = logging.getLogger('word2vec')
    logger.setLevel(logging.DEBUG)
    logger.addHandler(fhandler)
    logger.default_formatter = formatter
    logger.blank_formatter = logging.Formatter(fmt="")
    logger.newline = types.MethodType(log_newline, logger)
    
    return logger
    

# LDA

https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/  
https://towardsdatascience.com/topic-modeling-and-latent-dirichlet-allocation-in-python-9bf156893c24

LDA considers each document as a collection of topics in a certain proportion. And each topic as a collection of keywords, again, in a certain proportion.

Once you provide the algorithm with the number of topics, it rearranges the topics distribution within the documents and keywords distribution within the topics to obtain a good composition of topic-keywords distribution.

In [9]:
def tokenise_dataset(df):
    tokens = df['content_processed'].str.split(" ")
    return tokens

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
words = tokenise_dataset(df)

lemm = False
# Do lemmatization keeping only noun, adj, vb, adv
# ~ 2 hours to run - 1:45pm
if lemm:
    words = tokenise_dataset(df)
    data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    with open('data/topics/data_lemmatized_211126.pickle', 'wb') as f:
        pickle.dump(data_lemmatized, f, pickle.HIGHEST_PROTOCOL)
else:
    with open('data/topics/data_lemmatized.pickle', 'rb') as f:
    #with open('data/topics/data_lemmatized_211126.pickle', 'rb') as f:    
        data_lemmatized = pickle.load(f)

print(len(data_lemmatized))
print(len(data_lemmatized[:1][0]))
print(len(words))
print(len(words[0]))

365200
507
365200
528


In [11]:
# Create dictionary
dict_top_mod = corpora.Dictionary(data_lemmatized)

print(dict_top_mod)
print(len(dict_top_mod))

# check given words by index
print(dict_top_mod[0], dict_top_mod[10], dict_top_mod[20], dict_top_mod[100], dict_top_mod[101], dict_top_mod[102])

Dictionary(876229 unique tokens: ['2nd', 'addition', 'alltime_high', 'america', 'analyst']...)
876229
2nd ba boeing feed financial find


In [12]:
# Filter out terms
no_below = 25 # Infrequent words: minimum number of documents term must appear in
no_above = 0.3 # Frequent words: remove tokens that appear in this % of documents
keep_n = 100000 # Keep only this many words in dictionary
dict_top_mod.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)

print(len(dict_top_mod))

# check given words by index
print(dict_top_mod[0], dict_top_mod[10], dict_top_mod[20], dict_top_mod[100], dict_top_mod[101], dict_top_mod[102])
print(dict_top_mod)

64310
2nd ba boeing generally get globally
Dictionary(64310 unique tokens: ['2nd', 'addition', 'alltime_high', 'america', 'analyst']...)


In [13]:
# Term document frequency - creates tuples of word frequences (word idx, word_freq), e.g. [[(0,1), (1,4)]]: word at index 0 occurs once in doc, word at index 1 occurs 4 times in doc
corpus_bow = [dict_top_mod.doc2bow(word) for word in data_lemmatized]

print(len(corpus_bow))
print(len(corpus_bow[0]))

# Human readable format of corpus (term-frequency)
[[(dict_top_mod[idx], freq) for idx, freq in cp] for cp in corpus_bow[:1]][0][:10]

365200
266


[('2nd', 1),
 ('addition', 1),
 ('alltime_high', 1),
 ('america', 1),
 ('analyst', 2),
 ('announce', 1),
 ('anti', 1),
 ('aristocrat', 2),
 ('attractive', 1),
 ('august', 1)]

In [14]:
# Build LDA model - BOW - takes ~30 hours

process_lda = False

if process_lda:
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_bow,
                                               id2word=dict_top_mod,
                                               num_topics=250, 
                                               random_state=100,
                                               update_every=1, # Number of documents to be intereated through for each update, 0 for batch learning, 1 for online iterative learning
                                               chunksize=100, # number of docs in training chunk
                                               passes=10, # total number of passes through the corpus during training
                                               alpha='auto', #hyperparamter that affects sparsity of topics, default = 1.0/num_topics
                                               per_word_topics=True)
    lda_model.save('./data/topics/lda_model')
else:
    lda_model = gensim.models.ldamodel.LdaModel.load('./data/topics/lda_model')

In [15]:
# Most signigicant topics per gensim function
lda_model.print_topics(num_topics=10)

[(127,
  '0.000*"bearded" + 0.000*"redlener" + 0.000*"emigre" + 0.000*"disenfranchise" + 0.000*"cubanamerican" + 0.000*"centerright" + 0.000*"henson" + 0.000*"ucsf" + 0.000*"unbelievably" + 0.000*"dobb"'),
 (142,
  '0.000*"bearded" + 0.000*"redlener" + 0.000*"emigre" + 0.000*"disenfranchise" + 0.000*"cubanamerican" + 0.000*"centerright" + 0.000*"henson" + 0.000*"ucsf" + 0.000*"unbelievably" + 0.000*"dobb"'),
 (222,
  '0.000*"bearded" + 0.000*"redlener" + 0.000*"emigre" + 0.000*"disenfranchise" + 0.000*"cubanamerican" + 0.000*"centerright" + 0.000*"henson" + 0.000*"ucsf" + 0.000*"unbelievably" + 0.000*"dobb"'),
 (47,
  '0.000*"bearded" + 0.000*"redlener" + 0.000*"emigre" + 0.000*"disenfranchise" + 0.000*"cubanamerican" + 0.000*"centerright" + 0.000*"henson" + 0.000*"ucsf" + 0.000*"unbelievably" + 0.000*"dobb"'),
 (27,
  '0.000*"bearded" + 0.000*"redlener" + 0.000*"emigre" + 0.000*"disenfranchise" + 0.000*"cubanamerican" + 0.000*"centerright" + 0.000*"henson" + 0.000*"ucsf" + 0.000*"unbe

In [16]:
# Example Topic
lda_model.show_topic(1)

[('advice', 0.3714554),
 ('cdc', 0.36339292),
 ('flu', 0.13451187),
 ('headache', 0.06609882),
 ('claire', 0.03478762),
 ('fever', 0.012384132),
 ('fu', 0.007857959),
 ('testing_kit', 0.0036947185),
 ('cdc_recommend', 0.0024043757),
 ('wear_cloth', 0.00034669874)]

In [17]:
lda_model.show_topic(2)

[('trump', 0.35409158),
 ('president', 0.13913615),
 ('republican', 0.11806645),
 ('congress', 0.117492065),
 ('democrat', 0.0429803),
 ('push', 0.01622854),
 ('aide', 0.016062742),
 ('donald_trump', 0.015214954),
 ('ally', 0.014712363),
 ('political', 0.012757667)]

In [18]:
doc_lda = lda_model[corpus_bow]
doc_lda

<gensim.interfaces.TransformedCorpus at 0x7f0fc15f8340>

### Compute Model Perplexity and Coherence Score

The model with the lowest perplexity is generally considered the 'best'

Topic Coherence scores topics by measuring the degree of semantic similarity between high scoring words in the topic. These measurements help distinguish between topics that are semantically interpretable topics and topics that are artifacts of statistical inference
* C_v measure is based on a sliding window, one-set segmentation of the top words and an indirect confirmation measure that uses normalized pointwise mutual information (NPMI) and the cosine similarity

In [18]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus_bow))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dict_top_mod, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -70.97732312984519

Coherence Score:  0.4013738358204918


### Visualize the topics-keywords

Examine the produced topics and the associated keywords. 

Each bubble on the left-hand side plot represents a topic. The larger the bubble, the more prevalent is that topic.
* A good topic model will have fairly big, non-overlapping bubbles scattered throughout the chart instead of being clustered in one quadrant.
* A model with too many topics, will typically have many overlaps, small sized bubbles clustered in one region of the chart

In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
#vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_bow, dict_top_mod)
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_bow, dict_top_mod, mds='mmds')
vis

### Finding dominant topic per document

In [20]:
print(len(lda_model[corpus_bow]))

365200


In [43]:
def format_topics_sentences(ldamodel, corpus, texts):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in tqdm(enumerate(ldamodel[corpus])):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)

process_topics = False

# Takes 17 hours
if process_topics:
    df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus_bow, texts=df['content_processed'])

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
    df_dominant_topic.to_pickle('./data/df_lda_dominant_topic_3.pickle')
    
else:
    df_dominant_topic = pd.read_pickle('./data/df_lda_dominant_topic_original.pickle')
    
df_dominant_topic

Unnamed: 0,content,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords
0,With the end of the year just around the corne...,0,39.0,0.1010,"start, give, way, long, look, mean, find, set,..."
1,The labor markets were one of the most closely...,1,39.0,0.0973,"start, give, way, long, look, mean, find, set,..."
2,"It could be forgiven, that some might think th...",2,39.0,0.1443,"start, give, way, long, look, mean, find, set,..."
3,CIOs kicked off 2019 with AI as an item to wat...,3,120.0,0.1765,"technology, application, press_release, ai, zo..."
4,When the coronavirus pandemic is over and life...,4,245.0,0.1583,"fashion, brand, collection, luxury, instagram,..."
...,...,...,...,...,...
369042,Niels Pedersen is a Chartered Accountant and S...,365195,39.0,0.1115,"start, give, way, long, look, mean, find, set,..."
369043,Datametrex AI Limited (TSXV: DM) (FSE: D4G) (O...,365196,98.0,0.1064,"information, statement, risk, condition, abili..."
369044,"This December, The Fintech Times is asking ind...",365197,39.0,0.1097,"start, give, way, long, look, mean, find, set,..."
369045,Item 1.01Entry into a Material Definitive Agre...,365198,98.0,0.0799,"information, statement, risk, condition, abili..."


### Topic distribution across documents
Understand the volume and distribution of topics in order to judge how widely it was discussed.

In [77]:
# Number of Documents for Each Topic
topic_counts = pd.DataFrame(df_dominant_topic['Dominant_Topic'].value_counts())

# Percentage of Documents for Each Topic
topic_contribution = round(topic_counts/topic_counts.sum(), 3)

# Topic words
df_topic_keywords = df_dominant_topic[['Dominant_Topic', 'Keywords']].drop_duplicates()

# Combine
topic_counts = topic_counts.merge(topic_contribution, left_index=True, right_index=True).reset_index()
topic_counts.columns = ['topic_nb', 'documents_per_topic', 'percent_documents']
topic_counts = topic_counts.merge(df_topic_keywords, left_on='topic_nb', right_on='Dominant_Topic')
topic_counts['topic_nb'] = topic_counts['topic_nb'].astype(int)
topic_counts = topic_counts.drop('Dominant_Topic', axis=1)
with pd.option_context('display.max_colwidth', 400):
    display(topic_counts[:10])

Unnamed: 0,topic_nb,documents_per_topic,percent_documents,Keywords
0,39,224645,0.615,"start, give, way, long, look, mean, find, set, make, life"
1,233,28947,0.079,"impact, future, current, base, operation, factor, significant, potential, term, activity"
2,40,13432,0.037,"process, access, challenge, issue, require, effort, action, control, meet, deliver"
3,157,12371,0.034,"public, official, accord, national, nation, concern, department, call, agency, expert"
4,81,9632,0.026,"virus, strain, disease, spread, population, study, human, university, infection, scientist"
5,189,8640,0.024,"fall, close, record, big, remain, low, see, hit, late, level"
6,98,7079,0.019,"information, statement, risk, condition, ability, relate, contain, future, additional, event"
7,15,5943,0.016,"rise, index, gain, yous, sp, point, dow_jone, session, benchmark, strategist"
8,207,4867,0.013,"stock, investor, analyst, buy, rally, trading, earning, bet, raise, value"
9,82,4006,0.011,"team, game, win, player, sport, boris_johnson, play, match, victory, league"


In [78]:
print(topic_counts[topic_counts['documents_per_topic'] <= 25]['documents_per_topic'].sum())
topic_counts[topic_counts['documents_per_topic'] <= 25]

1084


Unnamed: 0,topic_nb,documents_per_topic,percent_documents,Keywords
118,221,25,0.0,"minister, finance, budget, export, pledge, loa..."
119,13,25,0.0,"space, communication, principal, stability, co..."
120,132,25,0.0,"device, battle, demonstrate, fintech, clearly,..."
121,1,25,0.0,"advice, cdc, flu, headache, claire, fever, fu,..."
122,235,25,0.0,"production, produce, processing, pipeline, pil..."
...,...,...,...,...
229,59,1,0.0,"vaccination, committee, recommendation, guidel..."
230,230,1,0.0,"guarantee, closing, status, calculate, equival..."
231,197,1,0.0,"article, select, refer, conversation, century,..."
232,44,1,0.0,"infrastructure, venue, normally, employ, engin..."
