In [None]:
# Jupyter notebook version of the topic-modelling example,
# adapted from the treform GitHub repository.
%pip install pyvis --user
%pip install treform --user

In [None]:
# Number of topics requested when training the topic models and used by the
# visualisation cells further down.
WANTED_TOPIC_COUNT = 6

In [None]:
import os
# Input CSV file, resolved relative to the current working directory.
CSV_PATH = os.path.join(r'.', 'result.csv')

In [None]:
from treform.topic_model.pyTextMinerTopicModel import pyTextMinerTopicModel
import treform as ptm
import tomotopy as tp
from tqdm import tqdm
#import Komoran
from treform.tokenizer import Komoran

In [None]:
class Corpus:
    """Thin container around a collection of documents.

    Attributes are ``docs`` (the collection passed in) and ``pair_map``
    (an initially empty dict that subclasses may fill with per-document metadata).
    """

    def __init__(self, textList):
        self.docs = textList
        self.pair_map = {}

    def __iter__(self):
        # Iterate over the wrapped documents directly.
        return iter(self.docs)

    def __len__(self):
        # Number of wrapped documents.
        return len(self.docs)

In [None]:
import csv
class CorpusFromFieldDelimitedFileWithYear(Corpus):
    """Corpus read from a delimited text file, pairing each document with one metadata field.

    After construction ``docs`` holds the text of every usable row and
    ``pair_map`` maps each document's position in ``docs`` to the value found
    in the ``year_index`` column of the same row.

    Args:
        file: Path of the UTF-8 encoded delimited file.
        doc_index: Zero-based column holding the document text.
        year_index: Zero-based column holding the metadata (year) value.
        delimiter: Field separator handed to ``csv.reader``. Previously this
            argument was accepted but ignored, so the file was always parsed
            as comma-separated.
    """

    def __init__(self, file, doc_index=1, year_index=2, delimiter='\t'):
        docs = []
        pair_map = {}
        # newline='' is what the csv module expects so that line breaks inside
        # quoted fields are handled by the reader itself.
        with open(file, encoding='utf-8', newline='') as ins:
            reader = csv.reader(ins, delimiter=delimiter)
            for fields in reader:
                try:
                    # Fetch both columns before storing either one: a row that
                    # has the text column but lacks the year column must not
                    # leave a document without a pair_map entry, which would
                    # shift every later index.
                    text = fields[doc_index]
                    year = fields[year_index]
                except IndexError:
                    print("out of index " + str(len(docs)))
                    continue
                pair_map[len(docs)] = year
                docs.append(text)

        self.docs = docs
        self.pair_map = pair_map

In [None]:
class StopwordFilterBeta:
    """Callable filter that removes stop words, one-character tokens and URL-like tokens.

    Args:
        stopwords: Optional iterable of stop words.
        file: Optional path of a UTF-8 text file with one stop word per line;
            its entries are added to ``stopwords``.
    """
    # NOTE(review): presumably read by the treform Pipeline to match stage
    # input/output types — confirm against the treform sources.
    IN_TYPE = [list, str]
    OUT_TYPE = [list, str]

    def __init__(self, stopwords=None, file=None):
        words = list(stopwords) if stopwords else []
        if file:
            # Use a context manager so the stop-word file is closed promptly.
            with open(file, encoding='utf-8') as handle:
                words.extend(line.strip() for line in handle)
        self.stopwords = set(words)
        # Tokens whose lowercase form starts with any of these are dropped.
        self.stopwordsPrefix = ('http', 'https', 'ftp', 'git', 'thatt')

    def __call__(self, *args, **kwargs):
        """Return the tokens of ``args[0]`` that survive the filter.

        A token is kept when it is longer than one character, its lowercase
        form is not in ``self.stopwords`` and does not start with one of the
        prefixes in ``self.stopwordsPrefix``.
        """
        prefixes = tuple(self.stopwordsPrefix)
        kept = []
        for token in args[0]:
            lowered = token.lower()
            if len(token) > 1 and lowered not in self.stopwords and not lowered.startswith(prefixes):
                kept.append(token)
        return kept

In [None]:
# Original treform sample call, kept for reference:
#corpus = ptm.CorpusFromFieldDelimitedFileWithYear(PathManager.get('../sample_data/sample_dmr_input.txt'),doc_index=2,year_index=1)

# Read the document text from column 4 and the per-document metadata from column 2 of the CSV.
corpus = CorpusFromFieldDelimitedFileWithYear(CSV_PATH, doc_index= 4, year_index=2, delimiter=',')
# Document position -> metadata value; handed to the DMR model in a later cell.
pair_map = corpus.pair_map

In [None]:
# Sanity check: number of documents read from the CSV.
len(corpus.docs)

In [None]:
# Text-preprocessing pipeline applied to every document.
# NOTE(review): the stage roles noted below are inferred from the treform class
# names (splitter -> tokenizer -> POS filter -> word selection -> stop words);
# confirm against the treform documentation.
pipeline = ptm.Pipeline(ptm.splitter.NLTK(),
                        Komoran(), # MeCab, Komoran, Okt, etc...
                        # Mecab requires manual installation! try eunjeon if you can
                        ptm.helper.POSFilter('NN*'), # Noun
                        ptm.helper.SelectWordOnly(),
                        # Stop words are read from a file next to the notebook.
                        StopwordFilterBeta(file=r'stopwordsKor_riss.txt')
                        )

In [None]:
# Run every document through the pipeline (with a progress bar), then flatten
# each document's sentences into a single token list, skipping empty strings.
result = pipeline.processCorpus(tqdm(corpus.docs))
text_data = [
    [token for sentence in doc for token in sentence if len(token) > 0]
    for doc in result
]

In [None]:
# Rebuild text_data from the pipeline output instead of appending to it.
# Appending to a text_data list that was already filled stored every document a
# second time (and once more on every re-run of this cell), so the list no
# longer lined up with the per-document indices in pair_map.
# NOTE(review): assumes `result` is a list that can be iterated again — confirm.
text_data = [
    [token for sentence in doc for token in sentence if len(token) > 0]
    for doc in tqdm(result)
]

In [None]:
# Preview the first 11 tokenised documents.
text_data[:11] # example of text data

In [None]:
# Cache the tokenised corpus on disk.
# If text_data.pickle already exists it is loaded and replaces the in-memory
# text_data; otherwise text_data is written to it when it is smaller than 1 GiB.
import sys
import os
import pickle

if os.path.isfile('text_data.pickle'):
    # SECURITY: pickle.load can execute arbitrary code — only load files that
    # this notebook (or another trusted source) produced.
    with open('text_data.pickle', 'rb') as f:
        text_data = pickle.load(f)
else:
    # sys.getsizeof(list) reports only the list object itself, not the inner
    # lists or token strings it refers to, so sum those up as well to get a
    # meaningful (approximate) size for the 1 GiB check.
    approx_bytes = sys.getsizeof(text_data) + sum(
        sys.getsizeof(doc) + sum(sys.getsizeof(token) for token in doc)
        for doc in text_data
    )
    if approx_bytes < 1 << 30:
        with open('text_data.pickle', 'wb') as f:
            pickle.dump(text_data, f)
    else:
        # Make the skip visible: later cells expect text_data.pickle to exist.
        print('text_data is too big to dump')

In [None]:
# Train (or apply) one topic model, selected by `mode`, on text_data.
# Each branch stores its model in ./test.<mode>.bin and leaves the trained
# model in `mdl` (the 'infer' branch leaves `mdl` as None).
topic_model = pyTextMinerTopicModel()
topic_number = WANTED_TOPIC_COUNT
#dominant_topic_number = 6
#if dominant_topic_number >= topic_number:
#    dominant_topic_number = topic_number - 1

mdl=None
#mode is either lda, dmr, hdp, infer, ct, visualize, etc

mode='lda'
label = ''
if mode == 'lda':
    #LDA
    print('Running LDA')
    label='LDA'
    lda_model_name = './test.lda.bin'
    mdl=topic_model.lda_model(text_data, lda_model_name, topic_number)
    print('perplexity score ' + str(mdl.perplexity))

elif mode == 'dmr':
    #DMR
    print('Running DMR')
    label='DMR'
    dmr_model_name='./test.dmr.bin'
    mdl=topic_model.dmr_model(text_data, pair_map, dmr_model_name, topic_number)
    print('perplexity score ' + str(mdl.perplexity))

elif mode == 'hdp':
    print('Running HDP')
    label='HDP'
    hdp_model_name='./test.hdp.bin'
    # hdp_model also returns the number of topics it ended up with.
    mdl, topic_num=topic_model.hdp_model(text_data, hdp_model_name)
    topic_number=topic_num
    print('perplexity score ' + str(mdl.perplexity))

elif mode == 'hlda':
    print('Running HLDA')
    label='HLDA'
    hlda_model_name = './test.hlda.bin'
    mdl=topic_model.hlda_model(text_data, hlda_model_name)
    print('perplexity score ' + str(mdl.perplexity))

elif mode == 'ct':
    print('Running CT')
    label = 'CT'
    ct_model_name = './test.ct.bin'
    # Write next to the notebook, like every other output file here; the former
    # absolute path 'D:/python_workspace/treform/...' only existed on one machine.
    save_file = './topic_network.html'
    mdl = topic_model.ct_model(text_data, ct_model_name, topic_number=topic_number, topic_network_result=save_file)

elif mode == 'infer':
    # Apply a previously saved LDA model to one unseen text.
    lda_model_name = './test.lda.bin'
    unseen_text='아사이 베리 블루베리 비슷하다'
    topic_model.inferLDATopicModel(lda_model_name, unseen_text)

else:
    print('No mode is selected')

In [None]:
# Reload the saved LDA model and compute the per-document topic table and matrix.
mode = 'lda'
model_name = f'./test.{mode}.bin'
topic_number = WANTED_TOPIC_COUNT
# Create the helper only if an earlier cell has not already defined it.
if locals().get('topic_model') is None:
    topic_model = pyTextMinerTopicModel()

# LDAModel.load returns a new model instance, so a single call is enough; the
# former follow-up call `mdl.load(model_name)` read the file a second time and
# threw the result away.
mdl = tp.LDAModel.load(model_name)
print("Model loaded")
# The below code extracts this dominant topic for each sentence
# and shows the weight of the topic and the keywords in a nicely formatted output.
df_topic_sents_keywords, matrix = pyTextMinerTopicModel().format_topics_sentences(topic_number=topic_number, mdl=mdl)

In [None]:
import sys
import pickle

# Persist both results next to the notebook; an object whose reported size is
# 1 GiB or more is skipped with a message instead.
for _obj, _path, _too_big_msg in (
    (df_topic_sents_keywords, 'df_topic_sents_keywords.pickle', 'df_topic_sents_keywords is too big to dump'),
    (matrix, 'matrix.pickle', 'matrix is too big to dump'),
):
    if sys.getsizeof(_obj) < 1<<30:
        with open(_path, 'wb') as f:
            pickle.dump(_obj, f)
    else:
        print(_too_big_msg)

In [None]:
import numpy as np
import pandas as pd
import matplotlib

def tSNE(mdl, matrix, label, topic_number=10):
    """Reduce the per-document topic weights to 2-D with t-SNE and show a bokeh scatter plot.

    Each point is one row of ``matrix`` and is coloured by the index of its
    largest weight (the dominant topic).

    Args:
        mdl: Trained topic model. Not used here; kept so existing calls keep working.
        matrix: Topic weights, one row per document, in any form accepted by
            ``pandas.DataFrame``. Missing values are treated as 0.
        label: Model name inserted into the plot title; may be empty.
        topic_number: Topic count shown in the plot title.
    """
    from bokeh.plotting import figure, show
    import matplotlib.colors as mcolors
    from sklearn.manifold import TSNE

    # Array of topic weights
    arr = pd.DataFrame(matrix).fillna(0).values

    # Dominant topic number in each doc
    topic_num = np.argmax(arr, axis=1)

    # tSNE Dimension Reduction
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)

    # One named matplotlib colour per topic index.
    mycolors = np.array(list(mcolors.cnames.values()))

    # Build the title from its parts. The previous expression applied .format()
    # to the trailing "Topics" literal only, so the "{}" placeholder was shown
    # verbatim and the topic count never appeared.
    title = " ".join(
        part for part in ("t-SNE Clustering of", str(topic_number), label, "Topics") if part
    )
    plot = figure(title=title, width=900, height=700)

    plot.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])

    show(plot)
# Attach the notebook's tSNE function to this topic_model instance so that
# later calls of topic_model.tSNE(...) run the version defined in this notebook.
topic_model.tSNE = tSNE # monkey patching

In [None]:
def visualize_topic_model(topic_model, mdl, matrix, label='', df_topic_sents_keywords=None, dominant_topic_number=7):
    """Produce every chart for a trained topic model and write them next to the notebook.

    Args:
        topic_model: Helper object providing the plotting methods called below.
        mdl: Trained topic model; ``mdl.k`` is used as the topic count.
        matrix: Per-document topic weights passed to the keyword and t-SNE plots.
        label: Suffix added to each output file name and passed to the t-SNE title.
        df_topic_sents_keywords: Per-document dominant-topic table. When omitted
            it is loaded from ``df_topic_sents_keywords.pickle`` as before.
        dominant_topic_number: Value forwarded to
            ``distribution_word_count_by_dominant_topic``.
            NOTE(review): the default 7 is the value that used to be hard-coded
            here — confirm it suits models with fewer topics.
    """
    if df_topic_sents_keywords is None:
        import pickle
        # SECURITY: pickle.load can execute arbitrary code — only load files
        # produced by a trusted run of this notebook.
        with open('df_topic_sents_keywords.pickle', 'rb') as f:
            df_topic_sents_keywords = pickle.load(f)

    # Format
    df_dominant_topic = df_topic_sents_keywords.reset_index()
    df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

    # Sometimes we want to get samples of sentences that most represent a given topic.
    # This code gets the most exemplar sentence for each topic.
    dist_result_file_ = f'./dist_doc_word_count{label}.png'
    topic_model.distribution_document_word_count(df_topic_sents_keywords, df_dominant_topic, result_file=dist_result_file_)

    # When working with a large number of documents,
    # we want to know how big the documents are as a whole and by topic.
    # Let's plot the document word counts distribution.
    dominant_result_file_ = f'./dominant_topic_word_count{label}.png'
    topic_model.distribution_word_count_by_dominant_topic(df_dominant_topic,dominant_topic_number=dominant_topic_number, result_file=dominant_result_file_)

    # Though we've already seen what are the topic keywords in each topic,
    # a word cloud with the size of the words proportional to the weight is a pleasant sight.
    # The coloring of the topics taken here is followed in the subsequent plots as well.
    topic_cloud_result_file = f'./topic_word_cloud{label}.png'
    topic_number = mdl.k
    topic_model.word_cloud_by_topic(mdl, topic_number=topic_number,topic_cloud_result_file=topic_cloud_result_file)

    topic_keyword_result_file = f'./topic_keyword{label}.png'
    # Let's plot the word counts and the weights of each keyword in the same chart.
    topic_model.word_count_by_keywords(mdl,matrix,topic_keyword_result_file=topic_keyword_result_file, topic_number=topic_number)

    topics_per_document = f'./topic_per_document{label}.png'
    topic_model.topics_per_document(mdl, start=0, end=WANTED_TOPIC_COUNT, topics_per_document=topics_per_document, topic_number=topic_number)

    # Visualize documents by tSNE; forward the label (previously always '')
    # so the plot title names the model being shown.
    topic_model.tSNE(mdl,matrix,label,topic_number=topic_number)

    visualization_file=f'./topic_visualization{label}.html'
    topic_model.make_pyLDAVis(mdl,visualization_file=visualization_file)

In [None]:
def visualize(mode, topic_number, topic_model=None, return_matrix=False):
    """Load the saved model for ``mode`` and optionally compute its document-topic matrix.

    Args:
        mode: One of ``'lda'``, ``'dmr'`` or ``'ct'``; selects ``./test.<mode>.bin``.
        topic_number: Topic count forwarded to ``format_topics_sentences``.
            It used to be overwritten by the global WANTED_TOPIC_COUNT, which
            made the argument meaningless.
        topic_model: Helper object used for the CT network plot; a new
            ``pyTextMinerTopicModel`` is created when omitted.
        return_matrix: When true, compute and return the matrix as well.

    Returns:
        ``(mdl, matrix)`` where ``matrix`` is ``None`` unless ``return_matrix`` is set.

    Raises:
        Exception: If ``mode`` is not one of the supported values.
    """
    model_name = f'./test.{mode}.bin'
    if topic_model is None:
        topic_model = pyTextMinerTopicModel()
    if model_name == './test.lda.bin':
        mdl = tp.LDAModel.load(model_name)
        print("Model loaded")
    elif model_name == './test.dmr.bin':
        mdl = tp.DMRModel.load(model_name)
        visual_result_file1= './dmr_line_graph.png'
        visual_result_file2 = './dmr_bar_graph.png'
        pyTextMinerTopicModel().visualizeDMR(mdl,visual_result1=visual_result_file1, visual_result2=visual_result_file2)
    elif model_name == './test.ct.bin':
        mdl = tp.CTModel.load(model_name)
        result_file = './topic_network.html'
        topic_model.visualize_ct_model(mdl, topic_network_result=result_file)
    else:
        raise Exception("Cannot visualize this model {}".format(model_name))
    if return_matrix:
        # Only the matrix is returned; the per-document table is discarded.
        _, matrix = pyTextMinerTopicModel().format_topics_sentences(topic_number=topic_number, mdl=mdl)
    else:
        matrix = None
    return mdl, matrix


In [None]:
# now DMR
# Train a DMR model on the cached tokens, using pair_map as per-document metadata,
# and save it to ./test.dmr.bin.

# load from 'text_data.pickle'
# SECURITY: pickle.load can execute arbitrary code — only load files produced
# by a trusted run of this notebook.
import pickle
with open('text_data.pickle', 'rb') as f:
    text_data = pickle.load(f)
# Re-read the CSV so pair_map is available even on a fresh kernel.
corpus = CorpusFromFieldDelimitedFileWithYear(CSV_PATH, doc_index=4, year_index=2, delimiter=',')
pair_map = corpus.pair_map
# NOTE(review): mode stays 'lda' although a DMR model is trained below, so a
# later visualize(mode, ...) call loads ./test.lda.bin rather than
# ./test.dmr.bin — confirm whether this is intentional.
mode = 'lda'
print('Running DMR')
label='DMR'
dmr_model_name='./test.dmr.bin'
mdl=pyTextMinerTopicModel().dmr_model(text_data, pair_map, dmr_model_name, WANTED_TOPIC_COUNT, iteration = 2500)
print('perplexity score ' + str(mdl.perplexity))

In [None]:
# now visualize
# The third positional parameter of visualize() is the topic_model helper, not
# the trained model, so `mdl` must not be passed in that slot; leaving it out
# lets visualize() create its own helper.
model, matrix = visualize(mode, WANTED_TOPIC_COUNT, return_matrix=True)

In [None]:
# Inspect the matrix returned by visualize().
matrix

In [None]:
# Write matrix to matrix.pickle when that file is missing (objects reported as
# larger than 100000000 bytes are skipped), then render all charts for the DMR run.
import pickle
import os
import sys
if not os.path.exists('matrix.pickle'):
    print('matrix does not exist, pickle')
    if sys.getsizeof(matrix) <= 100000000:
        with open('matrix.pickle', 'wb') as f:
            pickle.dump(matrix, f)
    else:
        print('matrix is too big, skip')
# Fresh helper with the notebook's tSNE attached, so the call below uses it.
topic_model = pyTextMinerTopicModel()
topic_model.tSNE = tSNE # monkey patching
visualize_topic_model(topic_model, mdl, matrix, label='DMR')