In [1]:
import sys
import os
import glob
home_folder = os.path.dirname(os.getcwd())
sys.path.append(home_folder)
os.environ['MALLET_HOME'] = '/mnt/packages/Mallet/bin/mallet'

import json
from src.xml_server_connect import *
from src.utility import *
from src.extract_table import *
from src.pre_process import *
from src.lda_model import *
from collections import defaultdict
import pandas as pd
import numpy as np
import re
import ftfy
import string
import unicodedata
import calendar
from multiprocessing import Pool
import nltk
import gensim
from gensim import corpora, models
from gensim.corpora import Dictionary
from gensim import corpora
import datetime
import pyLDAvis
import pyLDAvis.gensim as gensim_vis
from gensim.models import CoherenceModel

DEBUG:$HOME=/home/ubuntu
DEBUG:matplotlib data path /usr/local/anaconda/lib/python3.6/site-packages/matplotlib/mpl-data
DEBUG:loaded rc file /home/ubuntu/.config/matplotlib/matplotlibrc
DEBUG:matplotlib version 2.2.2
DEBUG:interactive is False
DEBUG:platform is linux


INFO:'pattern' package not found; tag filters are not available for English


In [2]:
input_folder = '/mnt/inputs'
dataset_folder = '/domino/dataset/raw_report'
dataset_processed_folder = '/domino/dataset/processed'
if not os.path.exists(dataset_processed_folder):
    dataset_processed_folder = '/domino/dataset/output'
lemma_text_file = 'RAM_lemma.json'
lda_dictionary_file = 'word_dictionary.txt'
corpus_file = 'RAM_lda_corpus.mm'
mallet_path = '/mnt/packages/Mallet/bin/mallet'
model_folder = '/mnt/models'
result_folder = '/mnt/results'
tmp_folder = '/tmp'

In [3]:
def save_results_to_excel(date, model, model_dict, lemma_folder, raw_text_folder, n_topics, n_words, id_map, result_folder):
    df_topic_word = pd.DataFrame(data = np.zeros((n_topics, n_words)), columns= ['word'+ str(x) for x in range(n_words)])
    for i in range(n_topics):
        df_topic_word.iloc[i] = pd.DataFrame(model.show_topic(topicid= i, topn= n_words))[0].tolist()
    df_topic_word['viz_topic_id'] = [id_map[key] for key in df_topic_word.index]
    df_topic_word.set_index('viz_topic_id', inplace=True)
    
    doc_records = []
    report_files_list = glob.glob(lemma_folder+'/*.json')
    for report_file in report_files_list:
        with open(report_file, 'r', encoding='utf8') as f:
            data = json.load(f)
            lemma_content = data['lemma_content']
            country = data['country']
            year = data['year']
        with open(os.path.join(raw_text_folder, '{0}_{1}.json'.format(country.replace(' ','_').replace('-','_'),str(year))), 'r', encoding='utf8') as f:
            raw_xml = json.load(f)['xml']
        raw_text = extract_table_content(extract_risk_table_node_from_report(raw_xml))
        
        doc_record = {}
        doc_record['year'] = year
        doc_record['country'] = country
        doc_record['text'] = raw_text
        doc_topic = model.get_document_topics(model_dict.doc2bow(lemma_content))
        for i in range(n_topics):
            doc_record['topic_%d'%id_map[i]] = 0
        for i, v in doc_topic:
            doc_record['topic_%d'%id_map[i]] = v
        doc_records.append(doc_record)
    df_doc_records = pd.DataFrame.from_records(doc_records)
    writer = pd.ExcelWriter(path = os.path.join(result_folder,'Mallet_{}_topics_{}.xlsx'.format(n_topics, date)))
    df_topic_word.to_excel(writer, 'Toipc and Key Word')
    df_doc_records.to_excel(writer, 'Document and Topic')
    return df_doc_records, df_topic_word

<h4>Read and Process Reports to LDA Corpus [list of (int,int)] and Save to inputs folder <br> (re-run all the dictionary and processed reports generation) <h4>

In [17]:
xml_report_to_lemma_doc(skip_set=set(['CurrencyName']))

Have processed 262 number of reports (out of 1925 reports)
Have processed 484 number of reports (out of 1925 reports)
Have processed 729 number of reports (out of 1925 reports)
Have processed 921 number of reports (out of 1925 reports)
Have processed 1151 number of reports (out of 1925 reports)
Have processed 1415 number of reports (out of 1925 reports)
Have processed 1689 number of reports (out of 1925 reports)


In [4]:
#lda_dict = build_dictionary(xml_report_lemma_gen(dataset_folder))
lda_dict = build_dictionary_from_files(dataset_processed_folder)
save_dictionary(os.path.join(input_folder, lda_dictionary_file), lda_dict)

lemma_corpus = []
processed_files_list = glob.glob(dataset_processed_folder + '/*.json')
total_no = len(processed_files_list)
count = 0
start = time.time()
for processed_file in processed_files_list:
    count += 1
    with open(processed_file, 'r', encoding='utf8') as f:
        lemma_content = json.load(f)['lemma_content']
    
    lemma_corpus.append(lemma_content)
    if (time.time()-start)/60 > 5:
        print('Have processed %d number of reports (out of %d reports)'%(count, len(total_no)))
        start = time.time()
              
data = {'lemma_text':lemma_corpus}
with open(os.path.join(input_folder, lemma_text_file), 'w', encoding='utf8') as f:
    json.dump(data, f)
    
corpus_gen = lda_corpus_gen_from_files(dataset_processed_folder, lda_dict)
save_corpus(corpus_gen, os.path.join(input_folder, corpus_file))

INFO:adding document #0 to Dictionary(0 unique tokens: [])
INFO:built Dictionary(239 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 1 documents (total 357 corpus positions)
INFO:adding document #0 to Dictionary(239 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(366 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 2 documents (total 583 corpus positions)
INFO:adding document #0 to Dictionary(366 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(486 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 3 documents (total 829 corpus positions)
INFO:adding document #0 to Dictionary(486 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(591 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)

INFO:built Dictionary(1876 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 30 documents (total 8670 corpus positions)
INFO:adding document #0 to Dictionary(1876 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(1922 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 31 documents (total 9198 corpus positions)
INFO:adding document #0 to Dictionary(1922 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(1934 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 32 documents (total 9428 corpus positions)
INFO:adding document #0 to Dictionary(1934 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(1959 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 33 documents (total 9797 corpus position

INFO:built Dictionary(2424 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 59 documents (total 17646 corpus positions)
INFO:adding document #0 to Dictionary(2424 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(2433 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 60 documents (total 17871 corpus positions)
INFO:adding document #0 to Dictionary(2433 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(2437 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 61 documents (total 17993 corpus positions)
INFO:adding document #0 to Dictionary(2437 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(2458 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 62 documents (total 18291 corpus posi

INFO:built Dictionary(2843 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 88 documents (total 25650 corpus positions)
INFO:adding document #0 to Dictionary(2843 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(2848 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 89 documents (total 25846 corpus positions)
INFO:adding document #0 to Dictionary(2848 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(2856 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 90 documents (total 26172 corpus positions)
INFO:adding document #0 to Dictionary(2856 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(2863 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 91 documents (total 26393 corpus posi

INFO:built Dictionary(3145 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 117 documents (total 33079 corpus positions)
INFO:adding document #0 to Dictionary(3145 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3169 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 118 documents (total 33414 corpus positions)
INFO:adding document #0 to Dictionary(3169 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3178 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 119 documents (total 33656 corpus positions)
INFO:adding document #0 to Dictionary(3178 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3209 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 120 documents (total 34152 corpus 

INFO:built Dictionary(3458 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 146 documents (total 41861 corpus positions)
INFO:adding document #0 to Dictionary(3458 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3471 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 147 documents (total 42182 corpus positions)
INFO:adding document #0 to Dictionary(3471 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3479 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 148 documents (total 42407 corpus positions)
INFO:adding document #0 to Dictionary(3479 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3484 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 149 documents (total 42671 corpus 

INFO:built Dictionary(3629 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 175 documents (total 50345 corpus positions)
INFO:adding document #0 to Dictionary(3629 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3634 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 176 documents (total 50535 corpus positions)
INFO:adding document #0 to Dictionary(3634 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3635 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 177 documents (total 50748 corpus positions)
INFO:adding document #0 to Dictionary(3635 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3658 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 178 documents (total 51082 corpus 

INFO:built Dictionary(3794 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 204 documents (total 58463 corpus positions)
INFO:adding document #0 to Dictionary(3794 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3797 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 205 documents (total 58698 corpus positions)
INFO:adding document #0 to Dictionary(3797 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3811 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 206 documents (total 58980 corpus positions)
INFO:adding document #0 to Dictionary(3811 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3817 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 207 documents (total 59146 corpus 

INFO:built Dictionary(3975 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 233 documents (total 67007 corpus positions)
INFO:adding document #0 to Dictionary(3975 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3977 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 234 documents (total 67173 corpus positions)
INFO:adding document #0 to Dictionary(3977 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3981 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 235 documents (total 67603 corpus positions)
INFO:adding document #0 to Dictionary(3981 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(3983 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 236 documents (total 67829 corpus 

INFO:built Dictionary(4111 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 262 documents (total 75378 corpus positions)
INFO:adding document #0 to Dictionary(4111 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4114 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 263 documents (total 75700 corpus positions)
INFO:adding document #0 to Dictionary(4114 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4115 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 264 documents (total 75821 corpus positions)
INFO:adding document #0 to Dictionary(4115 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4115 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 265 documents (total 76004 corpus 

INFO:built Dictionary(4236 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 291 documents (total 84467 corpus positions)
INFO:adding document #0 to Dictionary(4236 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4249 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 292 documents (total 84791 corpus positions)
INFO:adding document #0 to Dictionary(4249 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4251 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 293 documents (total 85061 corpus positions)
INFO:adding document #0 to Dictionary(4251 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4254 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 294 documents (total 85316 corpus 

INFO:built Dictionary(4362 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 320 documents (total 92845 corpus positions)
INFO:adding document #0 to Dictionary(4362 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4363 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 321 documents (total 93204 corpus positions)
INFO:adding document #0 to Dictionary(4363 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4365 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 322 documents (total 93413 corpus positions)
INFO:adding document #0 to Dictionary(4365 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4367 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 323 documents (total 93610 corpus 

INFO:built Dictionary(4462 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 349 documents (total 101910 corpus positions)
INFO:adding document #0 to Dictionary(4462 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4467 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 350 documents (total 102174 corpus positions)
INFO:adding document #0 to Dictionary(4467 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4468 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 351 documents (total 102461 corpus positions)
INFO:adding document #0 to Dictionary(4468 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4474 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 352 documents (total 102719 cor

INFO:built Dictionary(4558 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 378 documents (total 110444 corpus positions)
INFO:adding document #0 to Dictionary(4558 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4558 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 379 documents (total 110677 corpus positions)
INFO:adding document #0 to Dictionary(4558 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4561 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 380 documents (total 111057 corpus positions)
INFO:adding document #0 to Dictionary(4561 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4564 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 381 documents (total 111252 cor

INFO:built Dictionary(4647 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 407 documents (total 118080 corpus positions)
INFO:adding document #0 to Dictionary(4647 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4653 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 408 documents (total 118236 corpus positions)
INFO:adding document #0 to Dictionary(4653 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4657 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 409 documents (total 118638 corpus positions)
INFO:adding document #0 to Dictionary(4657 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4658 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 410 documents (total 118886 cor

INFO:built Dictionary(4742 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 436 documents (total 126020 corpus positions)
INFO:adding document #0 to Dictionary(4742 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4745 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 437 documents (total 126225 corpus positions)
INFO:adding document #0 to Dictionary(4745 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4748 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 438 documents (total 126520 corpus positions)
INFO:adding document #0 to Dictionary(4748 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4754 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 439 documents (total 126725 cor

INFO:built Dictionary(4843 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 465 documents (total 134889 corpus positions)
INFO:adding document #0 to Dictionary(4843 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4851 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 466 documents (total 135202 corpus positions)
INFO:adding document #0 to Dictionary(4851 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4851 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 467 documents (total 135419 corpus positions)
INFO:adding document #0 to Dictionary(4851 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4854 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 468 documents (total 135672 cor

INFO:built Dictionary(4945 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 494 documents (total 144059 corpus positions)
INFO:adding document #0 to Dictionary(4945 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4947 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 495 documents (total 144214 corpus positions)
INFO:adding document #0 to Dictionary(4947 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4949 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 496 documents (total 144370 corpus positions)
INFO:adding document #0 to Dictionary(4949 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(4961 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 497 documents (total 144834 cor

INFO:built Dictionary(5039 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 523 documents (total 152501 corpus positions)
INFO:adding document #0 to Dictionary(5039 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5041 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 524 documents (total 152882 corpus positions)
INFO:adding document #0 to Dictionary(5041 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5043 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 525 documents (total 153133 corpus positions)
INFO:adding document #0 to Dictionary(5043 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5047 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 526 documents (total 153590 cor

INFO:built Dictionary(5135 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 552 documents (total 161658 corpus positions)
INFO:adding document #0 to Dictionary(5135 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5141 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 553 documents (total 162297 corpus positions)
INFO:adding document #0 to Dictionary(5141 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5148 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 554 documents (total 162607 corpus positions)
INFO:adding document #0 to Dictionary(5148 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5150 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 555 documents (total 162968 cor

INFO:built Dictionary(5210 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 581 documents (total 169862 corpus positions)
INFO:adding document #0 to Dictionary(5210 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5213 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 582 documents (total 170351 corpus positions)
INFO:adding document #0 to Dictionary(5213 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5216 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 583 documents (total 170729 corpus positions)
INFO:adding document #0 to Dictionary(5216 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5219 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 584 documents (total 171052 cor

INFO:built Dictionary(5266 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 610 documents (total 179214 corpus positions)
INFO:adding document #0 to Dictionary(5266 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5266 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 611 documents (total 179679 corpus positions)
INFO:adding document #0 to Dictionary(5266 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5269 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 612 documents (total 179966 corpus positions)
INFO:adding document #0 to Dictionary(5269 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5269 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 613 documents (total 180134 cor

INFO:built Dictionary(5327 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 639 documents (total 188618 corpus positions)
INFO:adding document #0 to Dictionary(5327 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5327 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 640 documents (total 188760 corpus positions)
INFO:adding document #0 to Dictionary(5327 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5333 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 641 documents (total 189194 corpus positions)
INFO:adding document #0 to Dictionary(5333 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5339 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 642 documents (total 189434 cor

INFO:built Dictionary(5395 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 668 documents (total 196543 corpus positions)
INFO:adding document #0 to Dictionary(5395 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...)
INFO:built Dictionary(5395 unique tokens: ['abenomics', 'accelerate', 'achieve', 'activity', 'addition']...) from 669 documents (total 197015 corpus positions)
INFO:saving dictionary mapping to /mnt/inputs/word_dictionary.txt
INFO:storing corpus in Matrix Market format to /mnt/inputs/RAM_lda_corpus.mm
INFO:saving sparse matrix to /mnt/inputs/RAM_lda_corpus.mm
INFO:PROGRESS: saving document #0
INFO:saved 669x5395 matrix, density=3.480% (125591/3609255)
INFO:saving MmCorpus index to /mnt/inputs/RAM_lda_corpus.mm.index


It took 0.07 min to process report and build lda corpus


In [12]:
load_dict_indicator = False

if load_dict_indicator:
    lda_dict = load_dictionary(os.path.join(input_folder, lda_dictionary_file))

#corpus_gen = lda_corpus_gen(xml_report_lemma_gen(dataset_folder), lda_dict)
corpus_gen = lda_corpus_gen_from_files(dataset_processed_folder, lda_dict)
save_corpus(corpus_gen, os.path.join(input_folder, corpus_file))

INFO:storing corpus in Matrix Market format to /mnt/inputs/RAM_lda_corpus.mm
INFO:saving sparse matrix to /mnt/inputs/RAM_lda_corpus.mm
INFO:PROGRESS: saving document #0
INFO:saved 669x5395 matrix, density=3.480% (125591/3609255)
INFO:saving MmCorpus index to /mnt/inputs/RAM_lda_corpus.mm.index


It took 0.07 min to process report and build lda corpus


<h4>Load LDA Dictionary and Processed Corpus<h4>

In [11]:
with open(os.path.join(input_folder, lemma_text_file), 'r', encoding='utf8') as f:
    data = json.load(f)
    lemma_corpus = data['lemma_text']
    
lda_dict = load_dictionary(os.path.join(input_folder, lda_dictionary_file))
RAM_corpus = load_corpus(os.path.join(input_folder, corpus_file))

INFO:loaded corpus index from /mnt/inputs/RAM_lda_corpus.mm.index
INFO:initializing cython corpus reader from /mnt/inputs/RAM_lda_corpus.mm
INFO:accepted corpus with 669 documents, 5395 features, 125591 non-zero entries


<h4>Train and Tuning Hyperparameters<h4>

In [23]:
def fine_tune_lda_topics(corpus, dictionary, texts, mallet_path, start=10, end=50, step=1, alpha=1):
    """
    Compute c_v coherence for various number of topics
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    end : Max num of topics

    Returns:
    -------
    best_model :
    best_coherence_values 
    n_topics : numbmber of topics
    """
    best_coherence_value = -np.inf
    best_model = None
    n_topics = None
    for num_topics in range(start, end + 1, step):
        #print('\nTraining with n_topics = {}, training sample = {}.'.format(num_topics,len(corpus)))
        np.random.seed(seed=1)
        model = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path, corpus=corpus, num_topics=num_topics, id2word=dictionary, 
                                                 alpha=alpha, optimize_interval=10, iterations = 2000, random_seed=1)
                                                 #prefix=os.path.join(model_folder,"mallet_{}_topics_".format(num_topics)))
        model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
        #print('Calculating coherence score based on {} samples.'.format(len(texts)))
        
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_value = coherencemodel.get_coherence()
        if coherence_value > best_coherence_value:
            best_coherence_value = coherence_value
            best_model = model
            n_topics = num_topics
        
        #print("\n{}: {}".format(num_topics,coherence_value))
        
    return best_model, best_coherence_value, n_topics

In [24]:
def fine_tune_lda_alpha(corpus, dictionary, texts, mallet_path, start_alpha=0.01, end_alpha=10, number=10, n_topics=10):
    """
    Compute c_v coherence for various number of topics
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts

    Returns:
    -------
    best_model
    best_coherence_values
    best_alpha 
    """
    best_coherence_value = -np.inf
    best_model = None
    best_alpha = None
    alphas = np.logspace(np.log10(start_alpha), np.log10(end_alpha), num=number)
    for alpha in alphas:
        np.random.seed(seed=1)
        model = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path, corpus=corpus, num_topics=n_topics, id2word=dictionary, 
                                                 alpha=alpha, optimize_interval=10, iterations = 2000, random_seed=1)
                                            
        model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
        
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_value = coherencemodel.get_coherence()
        if coherence_value > best_coherence_value:
            best_coherence_value = coherence_value
            best_model = model
            best_alpha = alpha
        
        #print("\n{}: {}".format(num_topics,coherence_value))
        
    return best_model, best_coherence_value, best_alpha

In [25]:
res = []
min_topics = 10
max_topics = 50
step = 1
tmp_folder = '/tmp'
num_topics = range(min_topics, max_topics + 1, step)
for n_topics in num_topics:
    model, coherence_value, alpha = fine_tune_lda_alpha(RAM_corpus, lda_dict, lemma_corpus, mallet_path,\
                                                        start_alpha=0.01, end_alpha=10, number=10,\
                                                        n_topics=n_topics)
    print("\n{} topics (best alpha: {:.2f}): {:.4f}".format(n_topics, alpha, coherence_value))
    res.append((model, coherence_value, alpha, n_topics))
    #model.save(os.path.join(model_folder,'mallet_weights_{}_{:.2f}'.format(n_topics, alpha).replace('.', '_')))
    #vis_data = gensim_vis.prepare(model,corpus=RAM_corpus, dictionary=lda_dict)
    #pyLDAvis.save_html(vis_data, os.path.join(result_folder,'mallet_{}_topics_short_run.html'.format(n_topics))) 
    if len([name for name in os.listdir(tmp_folder) if os.path.isfile(os.path.join(tmp_folder, name))]) > 5000:
        [os.remove(os.path.join(tmp_folder,f)) for f in os.listdir(tmp_folder) if f.endswith(".mallet.gz") \
                                                                                  or f.endswith(".mallet") \
                                                                                  or f.endswith(".txt")]

INFO:serializing temporary corpus to /tmp/1e4cf9_corpus.txt
INFO:converting temporary corpus to MALLET format with /mnt/packages/Mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /tmp/1e4cf9_corpus.txt --output /tmp/1e4cf9_corpus.mallet


CalledProcessError: Command '/mnt/packages/Mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /tmp/1e4cf9_corpus.txt --output /tmp/1e4cf9_corpus.mallet' returned non-zero exit status 126.

In [26]:
best_model = None
best_score = -np.inf
best_num_topics = None
n_topics = 10
to_print = []
#print("Number of Alphas is {}".format(len(res)))
for result in res:
    if result[1] > best_score:
        best_model = result[0]
        best_score = result[1]
        best_num_topics = n_topics
    to_print.append("model with {0} topics has a coherence score of {1:.4f}".format(n_topics, result[1]))
    n_topics += 1
    
with open(os.path.join(result_folder, 'coherence_score.txt'), 'w', encoding='utf8') as f:
    f.writelines('\n'.join(to_print))

In [27]:
model = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path, corpus=RAM_corpus, num_topics=37, id2word=lda_dict, 
                                                 alpha=0.46, optimize_interval=10, iterations = 2000, random_seed=1,
                                                 prefix=os.path.join(model_folder,"mallet_{}_topics_".format(37)))
best_model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)

INFO:serializing temporary corpus to /mnt/models/mallet_37_topics_corpus.txt
INFO:converting temporary corpus to MALLET format with /mnt/packages/Mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /mnt/models/mallet_37_topics_corpus.txt --output /mnt/models/mallet_37_topics_corpus.mallet


CalledProcessError: Command '/mnt/packages/Mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /mnt/models/mallet_37_topics_corpus.txt --output /mnt/models/mallet_37_topics_corpus.mallet' returned non-zero exit status 126.

In [29]:
load_model = False
model_file = 'best_model_weights_37_2019_03_21'
if load_model:
    best_model = gensim.models.wrappers.LdaMallet.load(os.path.join(model_folder,model_file))
    np.random.seed(seed=1)
    model = gensim.models.wrappers.LdaMallet(mallet_path=mallet_path, corpus=RAM_corpus, num_topics=best_model.num_topics, 
                                             id2word=lda_dict, alpha=1, optimize_interval=10, iterations = 2000, random_seed=1)
                                            
    model = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(model)
    coherencemodel = CoherenceModel(model=model, texts=lemma_corpus, dictionary=lda_dict, coherence='c_v')
    coherence_value = coherencemodel.get_coherence()
    print("Coherence Value is: {}".format(coherence_value))
    pyLDAvis.enable_notebook()
    vis_data = gensim_vis.prepare(model,corpus=RAM_corpus, dictionary=lda_dict)
    vis_data

In [60]:
pyLDAvis.enable_notebook()
vis_data = gensim_vis.prepare(best_model,corpus=RAM_corpus, dictionary=lda_dict)
vis_data

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
  relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
  relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift


In [61]:
n_words = 20
id_map = dict(zip(np.array(vis_data.topic_order)- 1  , list(range(1,n_topics+1))))
import datetime
date = datetime.datetime.now()
date = date.strftime("%Y_%m_%d")
df_d, df_t = save_results_to_excel(date, best_model, lda_dict, dataset_processed_folder, dataset_folder, best_num_topics, n_words, id_map, result_folder)

In [62]:
pyLDAvis.save_html(vis_data,  os.path.join(result_folder,'best_model_{}_topics_{}.html'.format(best_num_topics, date))) 

In [None]:
best_model.save(os.path.join(model_folder,'best_model_weights_{}_{}'.format(best_num_topics, date)))

In [13]:
df_d.iloc[4]['text']

'Protracted economic and financial volatility, especially for emerging markets  (triggered by prospective exit from UMP).\nProspects of higher interest rates in advanced economies could trigger a sustained reversal of capital flows from the region, intensifying foreign currency liquidity strains.\nPressure on international reserves and the exchange rate, with effects on balance sheets, and potentially, a reacceleration of dollarization.\nAllow the currency to depreciate and refrain from domestic demand stimulus to preserve limited international reserves and contain inflation.\nLower than anticipated emerging market growth potential  (earlier maturing of the cycle and incomplete structural reforms with spillovers to LICs and advanced economies).\nGrowth relies to a large extent on trade and FDI links with other Asian countries, particularly China, Thailand and Vietnam.\nSlowdown in trading partner growth will put pressure on the balance of payments as exports decline and domestic demand

In [17]:
n_topics = 22
n_words = 20
#np.random.seed(seed=1)
lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=RAM_corpus, num_topics=n_topics,
                                              id2word=lda_dict, alpha=0.02, optimize_interval=10, 
                                              iterations = 2000,)
                                              #prefix=os.path.join(model_folder,"mallet_{}_topics_".format(n_topics)))
lda_gensim = gensim.models.wrappers.ldamallet.malletmodel2ldamodel(lda_mallet)

INFO:serializing temporary corpus to /tmp/424e7f_corpus.txt
INFO:converting temporary corpus to MALLET format with /mnt/packages/Mallet/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input /tmp/424e7f_corpus.txt --output /tmp/424e7f_corpus.mallet
INFO:training MALLET LDA with /mnt/packages/Mallet/bin/mallet train-topics --input /tmp/424e7f_corpus.mallet --num-topics 22  --alpha 0.02 --optimize-interval 10 --num-threads 4 --output-state /tmp/424e7f_state.mallet.gz --output-doc-topics /tmp/424e7f_doctopics.txt --output-topic-keys /tmp/424e7f_topickeys.txt --num-iterations 2000 --inferencer-filename /tmp/424e7f_inferencer.mallet --doc-topics-threshold 0.0  --random-seed 0
INFO:loading assigned topics from /tmp/424e7f_state.mallet.gz
INFO:using serial LDA version on this node


In [16]:
print('calculating coherence socre for {} documents ......'.format(len(lemma_corpus)))
coherence_model_lda = CoherenceModel(model=lda_gensim, texts=lemma_corpus, dictionary=lda_dict, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)

INFO:using ParallelWordOccurrenceAccumulator(processes=7, batch_size=64) to estimate probabilities from sliding windows


calculating coherence socre for 669 documents ......


INFO:1 batches submitted to accumulate stats from 64 documents (11693 virtual)
INFO:2 batches submitted to accumulate stats from 128 documents (22462 virtual)
INFO:3 batches submitted to accumulate stats from 192 documents (34558 virtual)
INFO:4 batches submitted to accumulate stats from 256 documents (45410 virtual)
INFO:5 batches submitted to accumulate stats from 320 documents (57965 virtual)
INFO:6 batches submitted to accumulate stats from 384 documents (69956 virtual)
INFO:7 batches submitted to accumulate stats from 448 documents (80844 virtual)
INFO:8 batches submitted to accumulate stats from 512 documents (93064 virtual)
INFO:9 batches submitted to accumulate stats from 576 documents (106047 virtual)
INFO:10 batches submitted to accumulate stats from 640 documents (119000 virtual)
INFO:11 batches submitted to accumulate stats from 704 documents (124094 virtual)
INFO:serializing accumulator to return to master...
INFO:accumulator serialized
INFO:serializing accumulator to retu


Coherence Score:  0.3587213739024996


In [22]:
model, coherence_value, n = fine_tune_lda(RAM_corpus, lda_dict, lemma_corpus, mallet_path, alpha=0.02)

NameError: name 'fine_tune_lda' is not defined