Get the top keywords for each recommendation cluster using a Latent Dirichlet Allocation (LDA) model

In [0]:
# load data
# Run the query through Spark SQL and collect the result into a pandas
# DataFrame so the rest of the notebook can work on it in memory.
recommd_cluster = (
    spark
    .sql( ' select * from NAII.CMV_Reports_Article_Recommendations_df')
    .toPandas()
)
# dfCluster = spark.sql(' select * from NAII.CMV_Reports_Article_Recommendations_dfCluster').toPandas()

In [0]:
# load stop words
import spacy
import en_core_web_sm
# Load the `en_core_web_sm` spaCy pipeline once; `nlp` is shared by the
# stop-word setup and the lemmatization function further down.
nlp = spacy.load("en_core_web_sm")
# nlp.Defaults.stop_words |= {"year","month","week","number","veteran","va","committee","program","minority","report", 'dear'}

# Domain-specific words that appear throughout the reports and carry no
# topical signal. Every entry is followed by a comma: a missing comma between
# two adjacent string literals makes Python silently concatenate them
# (e.g. 'annual' 'include' -> 'annualinclude'), dropping both words.
local_stop_words = {
    "year", "month", "week", "number", "veteran", "va", "committee", "program", "minority", "report",
    'secretary', 'member', 'subcommittee', 'department', 'meeting', 'information', 'service', 'director',
    'provide', 'center', 'concern', 'affair', 'advisory', 'need', 'american', 'meet', 'establish',
    'organization', 'second', 'annual',
    'include', 'annex', 'work', 'plan', 'issue', 'serve', 'health', 'review', 'group', 'support',
    'specific', 'care', 'office', 'page', 'day', 'continue', 'factor', 'dear',
    'recommend', 'recommendation', 'process', 'develop', 'implement',
}
# import pickle
# with open('/dbfs/NAII/CMV Reports/stopwords_by_year', 'rb') as fp:
#     stopByYear = pickle.load(fp)
    
# Merge the project-specific words into spaCy's default stop-word set in place,
# so any later read of nlp.Defaults.stop_words sees both sets.
nlp.Defaults.stop_words |= local_stop_words

In [0]:
# lemmatization
def lemmatization(reports):
    """Turn one text into a list of lower-cased, filtered lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    contributes its lemma only when its part-of-speech tag is not in the
    excluded set, the token is purely alphabetic, and the lower-cased lemma
    is not in ``nlp.Defaults.stop_words``.

    Parameters
    ----------
    reports
        Text of a single document; passed directly to ``nlp``.

    Returns
    -------
    list
        The kept lemmas, in document order (duplicates preserved).
    """
    # Part-of-speech tags whose tokens are dropped from the output.
    excluded_pos = ('ADV', 'PRON', 'CCONJ', 'PUNCT', 'PART', 'DET', 'ADP', 'SPACE', 'NUM', 'SYM')
    stop_words = nlp.Defaults.stop_words

    kept = []
    for tok in nlp(reports):
        if tok.pos_ in excluded_pos or not tok.is_alpha:
            continue
        lemma = tok.lemma_.lower()
        # Stop words are matched against the lemma, not the surface form.
        if lemma not in stop_words:
            kept.append(lemma)
    return kept

In [0]:
# tokenize
def tokenized_corpus(df, cluster):
    """Build the lemmatized texts, bag-of-words corpus and dictionary for one cluster.

    Parameters
    ----------
    df
        Table with a ``'cluster'`` column and a ``'text'`` column of equal length.
    cluster
        Cluster label to select; rows are kept when ``df['cluster'] == cluster``.

    Returns
    -------
    tuple
        ``(reports_text, corpus, id2word)`` where ``reports_text`` is the list of
        lemma lists produced by ``lemmatization``, ``id2word`` is the
        ``corpora.Dictionary`` built from them, and ``corpus`` is the
        ``id2word.doc2bow`` encoding of each document.
    """
    # Walk both columns in lockstep instead of collecting positions and then
    # indexing df['text'][i]: that lookup is label-based on a pandas Series and
    # only matches the positions when the frame has a default 0..n-1 index.
    reports_text = [
        lemmatization(text)
        for label, text in zip(df['cluster'], df['text'])
        if label == cluster
    ]
    id2word = corpora.Dictionary(reports_text)
    # Term-document frequency for each selected document.
    corpus = [id2word.doc2bow(text) for text in reports_text]
    return reports_text, corpus, id2word

In [0]:
# fit lda model
def fit_lda(corpus, id2word, num_topics, alpha, eta, offset):
    """Fit a gensim ``LdaModel`` on ``corpus`` and return it.

    Parameters
    ----------
    corpus
        Bag-of-words documents, forwarded as ``corpus``.
    id2word
        Dictionary forwarded as ``id2word``.
    num_topics, alpha, eta, offset
        Forwarded unchanged to the ``LdaModel`` arguments of the same name.

    Returns
    -------
    The ``gensim.models.LdaModel`` instance.
    """
    # Arguments supplied by the caller.
    tuned_settings = {
        'num_topics': num_topics,
        'alpha': alpha,
        'eta': eta,
        'offset': offset,
    }
    # Arguments held constant for every fit, including a fixed random_state.
    fixed_settings = {
        'iterations': 300,
        'chunksize': 2000,
        'decay': 0.5,
        'eval_every': 10,
        'gamma_threshold': 0.001,
        'minimum_probability': 0.01,
        'random_state': 24,
        'minimum_phi_value': 0.01,
        'per_word_topics': False,
    }
    return gensim.models.LdaModel(
        corpus=corpus,
        id2word=id2word,
        **tuned_settings,
        **fixed_settings,
    )

In [0]:
# get coherence score
def get_coherence(lda_model, reports_text, id2word, corpus=None):
    """Return the ``u_mass`` coherence score of ``lda_model``.

    Parameters
    ----------
    lda_model
        Fitted topic model, forwarded as ``model``.
    reports_text
        Tokenized documents (one list of tokens per document). Used to build
        the bag-of-words corpus when ``corpus`` is not given.
    id2word
        Dictionary forwarded as ``dictionary``; must provide ``doc2bow``.
    corpus : optional
        Pre-built bag-of-words corpus. When omitted it is derived from
        ``reports_text`` with ``id2word.doc2bow``, so the function no longer
        depends on a global variable named ``corpus`` being defined.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.
    """
    if corpus is None:
        corpus = [id2word.doc2bow(text) for text in reports_text]
    coherence_model = CoherenceModel(model=lda_model, corpus=corpus, dictionary=id2word, coherence='u_mass')
    # Alternative measures tried previously (these take `texts` instead of `corpus`):
#     coherenceModelLda = CoherenceModel(model=lda_model, texts = reports_text, dictionary=id2word, coherence='c_uci')
#     coherenceModelLda = CoherenceModel(model=lda_model, texts = reports_text, dictionary=id2word, coherence='c_v')
    return coherence_model.get_coherence()

In [0]:
# parameters to be tuned:
def create_parameter_grid():
    """Return the candidate values for each tunable LDA setting.

    Returns
    -------
    tuple
        ``(topic_n, alpha, eta, offset)``:

        * ``topic_n`` - ``range(5, 15, 3)`` of topic counts.
        * ``alpha`` - values of ``np.arange(0.01, 1, 0.2)`` followed by the
          strings ``'symmetric'`` and ``'asymmetric'``.
        * ``eta`` - values of ``np.arange(0.01, 1, 0.2)`` followed by the
          string ``'symmetric'``.
        * ``offset`` - ``[1, 1.25, 1.5, 2]``.
    """
    # number of topics: start, exclusive stop, step
    topic_n = range(5, 15, 3)

    # alpha: numeric grid plus the named priors ('auto' is deliberately left out)
    alpha = [*np.arange(0.01, 1, 0.2), 'symmetric', 'asymmetric']

    # eta: same numeric grid plus 'symmetric' ('auto' is deliberately left out)
    eta = [*np.arange(0.01, 1, 0.2), 'symmetric']

    # offset
    offset = [1, 1.25, 1.5, 2]

    return topic_n, alpha, eta, offset

Parameter tuning for LDA <br>
Use the coherence score as the criterion for choosing the best parameter values

In [0]:
import tqdm
import numpy as np
import pandas as pd
from gensim.models.coherencemodel import CoherenceModel
import gensim
import gensim.corpora as corpora
# Column-oriented accumulator for the tuning results: the tuning loop appends
# one value to every list per cluster, so index k across all lists describes
# the best model found for the k-th cluster.
model_results = {
    column: []
    for column in (
        'alpha',
        'eta',
        'offset',
        'coherence',
        'cluster',
        'topic_terms',
        'topic_terms_word_prob',
    )
}

In [0]:
# start parameter tuning
# For every cluster, fit one LDA model per (alpha, eta, offset) combination and
# keep the combination with the highest coherence score, together with the top
# terms of that model and their probabilities.
topic_n, alpha, eta, offset = create_parameter_grid()
clusters = recommd_cluster['cluster'].unique()
iters = 0  # total number of models fitted across all clusters

for clu in clusters:
    reports_text, corpus, id2word = tokenized_corpus(recommd_cluster, clu)
    # A single topic is fitted per cluster, so topic 0 holds the cluster's
    # keywords; the topic_n grid returned above is not searched here.
    t_n = 1
    # Best-so-far state. It is reset for every cluster so that a cluster with
    # no improving model can never inherit terms from the previous cluster.
    alp_m = 0.0
    e_m = 0.0
    off_m = 0.0
    # Empty lists (not None) keep the later per-cluster flattening working
    # even when no model was selected.
    topic_terms_word = []
    topic_terms_word_prob = []
    coherence = float('-inf')

    for alp in alpha:
        for e in eta:
            for off in offset:
                lda_model = fit_lda(corpus, id2word, t_n, alp, e, off)
                coherence_score = get_coherence(lda_model, reports_text, id2word)
                iters += 1
                if coherence_score > coherence:
                    # New best model for this cluster: remember its settings.
                    coherence = coherence_score
                    alp_m = alp
                    e_m = e
                    off_m = off
                    # topic 0, first 20 terms, as (term id, probability) pairs
                    topic_terms = lda_model.get_topic_terms(0, 20)
                    topic_terms_word = [id2word[term_id] for term_id, _ in topic_terms]
                    topic_terms_word_prob = [prob for _, prob in topic_terms]

    model_results['alpha'].append(alp_m)
    model_results['eta'].append(e_m)
    model_results['offset'].append(off_m)
    model_results['coherence'].append(coherence)
    model_results['cluster'].append(clu)
    model_results['topic_terms'].append(topic_terms_word)
    model_results['topic_terms_word_prob'].append(topic_terms_word_prob)

In [0]:
# save model
# pickle is imported in this cell because the only other `import pickle` in the
# notebook is commented out; without it the dump below raises NameError on a
# fresh kernel.
import pickle

# Persist the per-cluster tuning results so they can be reloaded without
# re-running the grid search.
with open('/dbfs/NAII/CMV Reports/recommendation_clusters_LDA', 'wb') as fp:
    pickle.dump(model_results, fp)
# with open('/dbfs/NAII/CMV Reports/recommendation_clusters_LDA', 'rb') as fp:
#     model_results = pickle.load(fp)

In [0]:
# Flatten the per-cluster results into long format: one row per
# (cluster, term), with the cluster label repeated once for each of its terms.
topTermsDF = pd.DataFrame({
    'cluster': np.repeat(
        model_results['cluster'],
        list(map(len, model_results['topic_terms'])),
    ),
    'topTerms': [
        term
        for cluster_terms in model_results['topic_terms']
        for term in cluster_terms
    ],
    'termsProb': [
        prob
        for cluster_probs in model_results['topic_terms_word_prob']
        for prob in cluster_probs
    ],
})

In [0]:
# Render the flattened top-terms table in the notebook output.
display(topTermsDF)

cluster,topTerms,termsProb
26,island,0.04290442
26,virgin,0.038618177
26,juan,0.025759883
26,outreach,0.025759744
26,benefit,0.02575961
26,san,0.025759414
26,vamc,0.021473452
26,address,0.017187558
26,equipment,0.017187517
26,st,0.017187454


In [0]:
# write to delta
# write_to_delta(spark.createDataFrame(topTermsDF) , 'NAII.CMV_Reports_Article_Recommendations_clusterLDA')