# Topic Modelling

In [1]:
import os
import time
import math
import re
from pprint import pprint
from textblob import TextBlob
import pandas as pd
import numpy as np


import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import spacy
import multiprocessing
import string

import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# import pyLDAvis.gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [2]:
from pandarallel import pandarallel
import multiprocessing

# Leave one core free for the notebook / OS, but never go below one worker:
# on a single-CPU host `cpu_count() - 1` is 0, which would ask pandarallel and
# the LdaMulticore calls that read `workers` for zero worker processes.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)
print(f'Available CPUs: {num_processors}')

# Reuse the same worker count for pandarallel so both pools stay in sync.
# use_memory_fs=False: data is exchanged over standard multiprocessing pipes.
pandarallel.initialize(nb_workers=workers, use_memory_fs=False)

Available CPUs: 64
INFO: Pandarallel will run on 63 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
import pandas as pd

# Notebook display limits: up to 100 rows, every column, and cell text
# truncated only after 500 characters (the article text is long).
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', None),
    ('display.max_colwidth', 500),
):
    pd.set_option(option_name, option_value)

In [4]:
import warnings

# Warning policy for this notebook: suppress every warning so cell output stays readable.
# NOTE(review): a blanket 'ignore' also hides warnings that may point at real problems;
# consider the commented-out, narrower filters below when debugging.
# warnings.simplefilter('once')
warnings.simplefilter('ignore')
# warnings.filterwarnings("ignore", category=FutureWarning)
# Redundant while the blanket 'ignore' above is active; kept so DeprecationWarning
# stays silenced if the blanket filter is ever relaxed.
warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

#### Read data

In [5]:
# Load the news corpus from a CSV in the working directory; later cells read its
# `token_text` column. NOTE(review): the path is relative — confirm the notebook's cwd.
df = pd.read_csv('df_news_topic.csv')

In [6]:
# Preview the first two rows to sanity-check the loaded text.
df.head(2)

Unnamed: 0,token_text
0,artificial improves parking efficiency chinese cities people daily march home artificial improves parking efficiency chinese citiesby liu shiyao people daily march photo taken july shows sign electronic toll collection etc newly set roadside parking space yangzhuang road shijingshan district beijing urban areas city started use etc system roadside parking spaces july people daily online li application artificial ai empowered roadside electronic toll collection etc system china capital city b...
1,children autism saw learning social skills boosted playing ai robot news parliament skip february latest conferences forestall drax power station cease burning coal march coronavirus explode overnight like dettol sales surge markets fall againlevi strauss marks phase corporate paid leave real story world children autism saw learning social skills boosted playing ai robot february february admin comments author recent posts admin latest posts admin conferences forestall february coronavirus e...


#### Apply LDA

In [8]:
%%time
# Tokenize text into words and remove punctuation

def sent_to_words(sentences):
    """Lazily tokenise documents into lists of word tokens.

    Each item is converted to text and passed through
    ``gensim.utils.simple_preprocess``. ``deacc=True`` strips accent marks;
    punctuation is discarded by the tokenizer itself, independent of ``deacc``.

    Missing values (``None`` or a float NaN, which is how pandas represents an
    empty CSV cell) yield an empty token list. Stringifying them would instead
    inject the bogus tokens ``'none'`` / ``'nan'`` into the corpus.

    Parameters
    ----------
    sentences : iterable
        Documents to tokenise, normally strings.

    Yields
    ------
    list of str
        The tokens of one document, in input order (one list per input item).
    """
    for sentence in sentences:
        is_missing = sentence is None or (
            isinstance(sentence, float) and math.isnan(sentence)
        )
        text = '' if is_missing else str(sentence)
        yield gensim.utils.simple_preprocess(text, deacc=True)
        
        
# Pull the article text out of the frame, then tokenise every document
# (sent_to_words is a generator, so it is materialised into a list here).
data_list = df.loc[:, 'token_text'].tolist()
data_tokens = [tokens for tokens in sent_to_words(data_list)]

CPU times: user 1min 51s, sys: 1.93 s, total: 1min 53s
Wall time: 1min 53s


In [9]:
%%time
# Train the phrase (collocation) models that later merge co-occurring tokens
# into single n-gram tokens. No stop-word removal or lemmatisation happens here.
# The bigram model is learned from the raw token lists.
bigram = gensim.models.Phrases(sentences=data_tokens, min_count=1, threshold=1)

# The trigram model is learned from the bigram-transformed stream.
# Note it passes no min_count, unlike the bigram model.
bigram_stream = bigram[data_tokens]
trigram = gensim.models.Phrases(sentences=bigram_stream, threshold=1)

# Wrap both trained models in Phraser objects, which is what the
# make_bigrams / make_trigrams helpers apply to documents.
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(model) for model in (bigram, trigram)
)

CPU times: user 3min 17s, sys: 1.44 s, total: 3min 18s
Wall time: 3min 18s


In [12]:
%%time

# English stop-word list from the NLTK stopwords corpus; read by remove_stopwords().
# NOTE(review): presumably requires the NLTK 'stopwords' data to be downloaded already — confirm.
stop_words = stopwords.words('english')

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Tokenise each document and drop tokens found in ``stop_words``.

    Every document is stringified and run through ``simple_preprocess`` before
    filtering, so the result is always a list of token lists.

    NOTE(review): when ``texts`` already holds token lists, ``str(doc)``
    re-tokenises the list's repr; with gensim's default length limits this may
    drop very short or very long tokens (e.g. merged n-grams) — confirm before
    calling this after n-gram merging.

    Parameters
    ----------
    texts : iterable
        Documents (strings or token lists).

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    # Build the lookup set once per call: membership checks become O(1)
    # instead of scanning the stop-word list for every token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in input order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each document,
    in input order. Both phrasers are read from module scope.
    """
    def _merge(tokens):
        with_bigrams = bigram_mod[tokens]
        return trigram_mod[with_bigrams]

    return list(map(_merge, texts))

# def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
#     """https://spacy.io/api/annotation"""
#     texts_out = []
#     for sent in texts:
#         doc = nlp(" ".join(sent)) 
#         texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
#     return texts_out

CPU times: user 3.6 ms, sys: 127 µs, total: 3.73 ms
Wall time: 2.07 ms


In [15]:
%%time

# Remove Stop Words (disabled)
#data_tokens_nostops = remove_stopwords(data_tokens)

# Create n-grams
data_words_bigrams = make_bigrams(data_tokens)
# Build the trigram documents from the bigram output computed just above.
# make_trigrams(data_tokens) would give the same result but re-applies
# bigram_mod to the whole corpus a second time.
data_words_trigrams = [trigram_mod[doc] for doc in data_words_bigrams]

# Combine tokens and n-grams
# data_tokens_cobnined = data_tokens_nostops + data_words_bigrams + data_words_trigrams
# Only the trigram-merged documents are used as the modelling corpus.
data_tokens_cobnined = data_words_trigrams

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# NOTE(review): `nlp` is only referenced by the commented-out lemmatization helper.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Lemmatize text keeping only noun, adj, vb, adv (disabled)
#data_lemmatized = lemmatization(data_tokens_cobnined, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

#print(*data_lemmatized[:1])

CPU times: user 1min 28s, sys: 1.56 s, total: 1min 29s
Wall time: 1min 29s


In [16]:
%%time

# Build the term dictionary of the corpus: every unique token is assigned an integer id.
dictionary = corpora.Dictionary(documents=data_tokens_cobnined)

# Convert each document to its bag-of-words form via the dictionary;
# the resulting list is the corpus the LDA models are trained on.
doc_term_matrix = list(map(dictionary.doc2bow, data_tokens_cobnined))

CPU times: user 51.1 s, sys: 636 ms, total: 51.8 s
Wall time: 51.8 s


In [20]:
# Tune LDA model
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train one LDA model and return its ``c_v`` coherence score.

    Parameters
    ----------
    corpus : iterable
        Bag-of-words corpus to train on. This is the corpus actually passed to
        ``LdaMulticore``; the previous version ignored it and always used the
        global ``doc_term_matrix``.
    dictionary : gensim Dictionary
        Token-id mapping, passed as ``id2word`` and to the coherence model.
    k : int
        Number of topics.
    a : float or str
        ``alpha`` value forwarded to ``LdaMulticore``.
    b : float or str
        ``eta`` value forwarded to ``LdaMulticore``.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.

    Notes
    -----
    Reads the module-level ``workers`` (process count) and
    ``data_tokens_cobnined`` (tokenised texts for the coherence model).
    ``random_state`` and ``passes`` are fixed at 100 and 10.
    """
    lda_model = LdaMulticore(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        passes=10,
        alpha=a,
        eta=b,
        workers=workers,
    )

    # Coherence is scored against the full tokenised texts, whichever corpus
    # subset was used for training.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=data_tokens_cobnined,
        dictionary=dictionary,
        coherence='c_v',
    )

    return coherence_model_lda.get_coherence()

In [21]:
#%%time

grid = {}
grid['Validation_Set'] = {}

# Topics range (max_topics is inclusive)
min_topics = 5
max_topics = 9
step_size = 1
topics_range = range(min_topics, max_topics+1, step_size)

# Alpha parameter. Only 'symmetric' is searched, i.e. the grid varies the
# number of topics only. To widen the search:
#   alpha = list(np.arange(0.01, 1, 0.3)) + ['symmetric', 'asymmetric']
alpha = ['symmetric']

# Beta parameter. Same idea; to widen the search:
#   beta = list(np.arange(0.01, 1, 0.3)) + ['symmetric']
beta = ['symmetric']

# Validation sets. Only the full corpus is used. Clipped subsets can be added, e.g.
#   gensim.utils.ClippedCorpus(doc_term_matrix, int(num_of_docs * 0.75))
# together with a matching label in corpus_title ('75% Corpus').
num_of_docs = len(doc_term_matrix)
corpus_sets = [doc_term_matrix]
corpus_title = ['100% Corpus']
# corpus_title labels corpus_sets position by position; keep them the same length.
assert len(corpus_title) == len(corpus_sets), 'corpus_title must label every entry of corpus_sets'

model_results = {
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

itr = 0
# Count runs from the list the loop actually iterates (corpus_sets), so the
# progress percentage cannot drift if the title list is edited separately.
itr_total = len(corpus_sets)*len(topics_range)*len(alpha)*len(beta)
print(f'LDA will execute {itr_total} iterations')


# iterate through validation corpora
for i, corpus_subset in enumerate(corpus_sets):
    # iterate through number of topics
    for k in topics_range:
        # iterate through alpha values
        for a in alpha:
            # iterate through beta values
            for b in beta:
                # get the coherence score for the given parameters
                itr += 1
                cv = compute_coherence_values(corpus=corpus_subset, dictionary=dictionary,
                                              k=k, a=a, b=b)
                # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pct_completed = round((itr / itr_total * 100),1)
        # One progress line per topic count (not per alpha/beta combination).
        print(f'Completed model based on {k} LDA topics. Finished {pct_completed}% of LDA runs')

# One row per trained model: Topics, Alpha, Beta, Coherence.
lda_tuning = pd.DataFrame(model_results)
#lda_tuning.to_csv(os.path.join(path_lda, 'lda_tuning_results.csv'), index=False)

LDA will execute 5 iterations
Completed model based on 5 LDA topics. Finished 20.0% of LDA runs
Completed model based on 6 LDA topics. Finished 40.0% of LDA runs
Completed model based on 7 LDA topics. Finished 60.0% of LDA runs
Completed model based on 8 LDA topics. Finished 80.0% of LDA runs
Completed model based on 9 LDA topics. Finished 100.0% of LDA runs


In [22]:
# Rank the grid-search runs by coherence, highest first, and show the top ten.
lda_tuning.sort_values(by='Coherence', ascending=False).iloc[:10]

Unnamed: 0,Topics,Alpha,Beta,Coherence
4,9,symmetric,symmetric,0.305546
2,7,symmetric,symmetric,0.30075
0,5,symmetric,symmetric,0.285134
1,6,symmetric,symmetric,0.283505
3,8,symmetric,symmetric,0.273411


In [23]:
# Running best model
# Keep the single highest-coherence row of the tuning table.
lda_tuning_best = lda_tuning.sort_values(by=['Coherence'], ascending=False).head(1)
if lda_tuning_best.empty:
    raise ValueError('lda_tuning is empty: run the grid search before selecting the best model')

# Read the values straight from the row instead of round-tripping them through
# Series.to_string(), which formats floats for display and can lose precision.
best_row = lda_tuning_best.iloc[0]

tuned_topics = int(best_row['Topics'])


def _as_prior(value):
    """Return a prior as a float when numeric, else as a stripped string.

    Alpha / Beta entries may be floats or names such as 'symmetric' /
    'asymmetric'. Only a failed float conversion of a string falls back to the
    name; other errors are not swallowed.
    """
    if isinstance(value, str):
        text = value.strip()
        try:
            return float(text)
        except ValueError:
            return text
    return float(value)


tuned_alpha = _as_prior(best_row['Alpha'])
tuned_beta = _as_prior(best_row['Beta'])

print(f'Best Parameters: Topics: {tuned_topics}, Alpha: {tuned_alpha}, Beta: {tuned_beta}')

Best Parameters: Topics: 9, Alpha: symmetric, Beta: symmetric


In [None]:
%%time

# Fit the final LDA model with the best hyper-parameters from the grid search.
# The fixed settings (random_state=100, passes=10) match the tuning runs.
tuned_lda_kwargs = dict(
    corpus=doc_term_matrix,
    id2word=dictionary,
    num_topics=tuned_topics,
    random_state=100,
    passes=10,
    alpha=tuned_alpha,
    eta=tuned_beta,
    workers=workers,
)
tuned_lda_model = LdaMulticore(**tuned_lda_kwargs)

# Score the final model with c_v coherence against the tokenised texts.
coherence_model_lda = CoherenceModel(
    model=tuned_lda_model,
    texts=data_tokens_cobnined,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

In [26]:
%%time

# Prepare the pyLDAvis data for the tuned model and render it inline.
# sort_topics=False and mds='mmds' are passed through to pyLDAvis unchanged.
lda_display = gensimvis.prepare(
    tuned_lda_model,
    doc_term_matrix,
    dictionary,
    mds='mmds',
    sort_topics=False,
)
pyLDAvis.display(lda_display)

CPU times: user 1min 10s, sys: 30.7 s, total: 1min 40s
Wall time: 2min 38s
