In [1]:
import sys, os
sys.path.append(os.path.abspath(os.path.join('..', 'module')))
from openTable import *
from filepath import *

import warnings
# Blanket suppression: no category or module filter is given, so every warning
# raised after this line is hidden for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries imported
# below — consider narrowing the filter once the noisy source is known.
warnings.filterwarnings('ignore')

# import gensim
from gensim.models.ldamodel import LdaModel
from gensim.models import Phrases
from gensim import corpora

from ast import literal_eval
from pickle import dump

from gensim.models.coherencemodel import CoherenceModel

import pandas as pd
from json import loads

# import spacy
from spacy.lang.id import Indonesian
# Instantiate spaCy's Indonesian language class.
nlp = Indonesian()  # use directly

# Import the stop-word set by its full path. `from spacy.lang.id import Indonesian`
# does not bind the name `spacy`, so the previous `spacy.lang.id.stop_words...`
# lookup only worked if one of the star-imports above happened to leak `spacy`.
from spacy.lang.id.stop_words import STOP_WORDS

stopwords = STOP_WORDS
# In-place union: `stopwords` is the very same set object as spaCy's STOP_WORDS,
# so these extra words are added to the shared set, not to a private copy.
stopwords |= {"nya","jurusan","jurus","the","of"}

In [6]:
def get_data():
    """Return the result of ``open_table`` for the ``BlogsEntry`` table.

    The call requests the ``entryId`` and ``content`` columns.
    NOTE(review): ``open_table`` comes from a star-import; its return type is
    presumably a pandas DataFrame (the caller renames columns and uses
    ``.content``) — confirm against the ``openTable`` module.
    """
    columns = ['entryId', 'content']
    return open_table(columns, 'BlogsEntry')

def get_best_topic(dictionary, corpus, texts, limit, start):
    """Train one LDA model per candidate topic count and return the most coherent.

    Parameters
    ----------
    dictionary : passed to ``LdaModel`` as ``id2word`` and to ``CoherenceModel``.
    corpus : passed to ``LdaModel`` as the training corpus.
    texts : passed to ``CoherenceModel`` for the ``c_v`` coherence computation.
    limit : int
        Exclusive upper bound of the topic counts to try.
    start : int
        Inclusive lower bound of the topic counts to try.

    Returns
    -------
    The trained model with the highest ``c_v`` coherence. When several topic
    counts tie, the smallest one wins.

    Raises
    ------
    ValueError
        If ``range(start, limit)`` is empty, i.e. there is nothing to evaluate.
    """
    topic_counts = range(start, limit)
    if not topic_counts:
        # Fail before any training, with a message that names the real problem
        # (previously this surfaced as max() complaining about an empty list).
        raise ValueError(
            "no topic counts to evaluate: start=%r must be smaller than limit=%r"
            % (start, limit)
        )

    best_model = None
    best_score = None

    for num_topics in topic_counts:
        # Fixed random_state keeps each candidate reproducible between runs.
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=666)
        score = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        ).get_coherence()

        # Keep only the current winner so discarded models can be freed right
        # away. Strict '>' preserves "first maximum wins" on ties.
        if best_model is None or score > best_score:
            best_model, best_score = model, score

    return best_model

def make_corpus(data, min_count=10):
    """Build token lists, a gensim dictionary and a bag-of-words corpus.

    Parameters
    ----------
    data : DataFrame-like with a ``content`` column
        Each ``content`` value must be the string form of a Python list of
        tokens; it is parsed with ``ast.literal_eval``.
    min_count : int, default 10
        Passed to ``Phrases`` as its ``min_count`` argument.

    Returns
    -------
    (mylist, dictionary, corpus)
        ``mylist`` is the list of token lists (with detected phrases appended),
        ``dictionary`` is ``corpora.Dictionary(mylist)`` and ``corpus`` is the
        ``doc2bow`` encoding of every document.

    Side effects
    ------------
    Writes ``corpus_LDA.pkl`` (pickle of ``corpus``) and calls
    ``dictionary.save('dictionary_LDA.gensim')``; both paths are relative.
    """
    # Parse every stored token list back into a real Python list.
    mylist = [literal_eval(content) for content in data['content']]

    # Detect phrases with a single Phrases pass and append them to each
    # document, keeping the original single tokens as well.
    bigram = Phrases(mylist, min_count=min_count)
    for doc in mylist:
        # Materialise the additions first so the document is never extended
        # while it is being read. Any output token containing '_' is treated as
        # a phrase — that includes input tokens that already had an underscore.
        phrases = [token for token in bigram[doc] if '_' in token]
        doc.extend(phrases)

    # Create Dictionary
    dictionary = corpora.Dictionary(mylist)

    # Term Document Frequency
    corpus = [dictionary.doc2bow(text) for text in mylist]

    # Use a context manager so the pickle file is flushed and closed
    # deterministically (the handle was previously never closed explicitly).
    with open('corpus_LDA.pkl', 'wb') as corpus_file:
        dump(corpus, corpus_file)
    dictionary.save('dictionary_LDA.gensim')

    return mylist, dictionary, corpus
    
def save_model(model):
    """Save ``model`` by calling its ``save`` method with the path ``lda.h5``.

    The path is relative, so the file lands in the current working directory.
    Returns ``None``.
    """
    target_path = 'lda.h5'
    model.save(target_path)

In [3]:
# Alternative input, kept for reference: read pre-cleaned data from CSV instead.
# data = pd.read_csv('data_berita_clean.csv')

# Fetch the table, label its two columns, then run `preprocessing` on every entry.
data = rename_column(get_data(), {0:'entryId', 1:'content'})
data['content'] = data['content'].apply(preprocessing)

# Token lists, gensim dictionary and bag-of-words corpus (also written to disk).
mylist, dictionary, corpus = make_corpus(data)

# Coherence search over range(start, limit), i.e. 3 to 50 topics inclusive.
# NOTE(review): the previous comment here read "(5)", presumably the best topic
# count from an earlier run — confirm before relying on it.
start, limit = 3, 51
best_model = get_best_topic(
    dictionary=dictionary,
    corpus=corpus,
    texts=mylist,
    limit=limit,
    start=start,
)

# Disabled coherence plot. It cannot simply be re-enabled: `coherence_values`
# is local to get_best_topic, which returns only the winning model.
# x = range(start, limit)
# plt.plot(x, coherence_values)
# plt.xlabel("Num Topics")
# plt.ylabel("Coherence score")
# plt.legend(("coherence_values"), loc='best')
# plt.show()

# Persist the selected model.
save_model(best_model)

In [8]:
# Print the top 10 keywords of every topic in the selected model.
# num_topics=-1 asks for all topics: the library default lists at most 20,
# while the search above can select a model with up to 50 topics.
topics = best_model.print_topics(num_topics=-1, num_words=10)
for topic in topics:
    print(topic)

(0, '0.009*"orang" + 0.008*"universitas" + 0.008*"ilmu" + 0.007*"kerja" + 0.007*"kuliah" + 0.007*"mahasiswa" + 0.006*"ajar" + 0.005*"milik" + 0.005*"sobat" + 0.005*"didik"')
(1, '0.010*"bahasa" + 0.008*"ajar" + 0.008*"universitas" + 0.008*"orang" + 0.007*"mahasiswa" + 0.006*"sobat" + 0.006*"kuliah" + 0.006*"ilmu" + 0.005*"kerja" + 0.005*"teknik"')
(2, '0.014*"ajar" + 0.010*"teknik" + 0.008*"orang" + 0.008*"kuliah" + 0.006*"kerja" + 0.006*"ilmu" + 0.005*"sobat" + 0.005*"sekolah" + 0.005*"universitas" + 0.005*"salah"')
(3, '0.018*"orang" + 0.010*"ajar" + 0.008*"kerja" + 0.008*"kuliah" + 0.006*"sobat" + 0.005*"teknik" + 0.005*"mahasiswa" + 0.005*"milik" + 0.005*"ilmu" + 0.005*"program"')
