In [1]:
import pandas as pd
import numpy as np
import re
import gensim
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
from wordcloud import WordCloud
from gensim import corpora, models, similarities
import pyLDAvis
from pprint import pprint
import pickle 
import os
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
from gensim.test.utils import datapath
import random
from gensim.models.ldamodel import LdaModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import MmCorpus
import mymodule

In [2]:
data = pd.read_csv('dataset/DatasetLegal.csv')
str_answer = data['answer'].astype(str)
str_answer = str_answer.map(lambda x: re.sub('[,.!?#/]', '', x))
str_question = data['question'].astype(str)
str_question = str_question.map(lambda x: re.sub('[,.!?*/]', '', x))

In [3]:
sentense_token = []
for i in range(len(str_answer)):
  sentense_token.append(str_question[i])
  sentense_token.append(str_answer[i])

train_data = sentense_token[:13062]
test_data = sentense_token[13061:]

In [4]:
word_token_answer = []
for sentense in train_data:
  word = word_tokenize(sentense, engine='newmm')
  word_token_answer.append(word)

In [5]:
stopwords = list(thai_stopwords())
read_stopwords = pd.read_csv('dataset/add_stopwords.csv')
add_stopwords = read_stopwords['stopword'].values.tolist()
processed_answer = []
for sentense in word_token_answer:
  each_sentense = []
  for word in sentense:
    if(word not in stopwords + add_stopwords):
      each_sentense.append(word)
  processed_answer.append(each_sentense)
print(processed_answer[0][:8])

['มีเรื่อง', 'ปรึกษา', 'ครอบครัว', 'สินสมรส', 'สินส่วนตัว', 'การทราบ', 'ข้อเท็จจริง', 'วิธีการ']


In [6]:
id2word = corpora.Dictionary(processed_answer)
# print(id2word)
with open('model/id2word.pkl', 'wb') as f:
    pickle.dump(id2word, f)

In [7]:
corpus = []
for text in processed_answer:
  vec = id2word.doc2bow(text)
  corpus.append(vec)
MmCorpus.serialize('model/corpus.mm', corpus)

In [8]:
num_topics = 6
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       iterations=100,
                                       chunksize=800,
                                       passes=5,
                                       alpha=0.9,
                                       eta=0.5
                                       )
with open('model/lda_model.pkl', 'wb') as f:
    pickle.dump(lda_model, f)

In [10]:
lda_model.show_topics(num_topics=6, num_words=10)

[(0,
  '0.021*"เงิน" + 0.018*"รถ" + 0.015*"จ่าย" + 0.015*"บาท" + 0.012*"สัญญา" + 0.012*"บริษัท" + 0.010*"แจ้ง" + 0.010*"เพื่อน" + 0.010*"ซื้อ" + 0.009*"ขาย"'),
 (1,
  '0.032*"ศาล" + 0.016*"ฟ้อง" + 0.015*"เจ้าหนี้" + 0.014*"ชำระหนี้" + 0.013*"ตำรวจ" + 0.011*"ลูกหนี้" + 0.010*"หนี้" + 0.010*"จำคุก" + 0.009*"บาท" + 0.008*"ทนาย"'),
 (2,
  '0.026*"สามี" + 0.024*"แม่" + 0.024*"บ้าน" + 0.021*"ลูก" + 0.020*"แฟน" + 0.020*"พ่อ" + 0.016*"ภรรยา" + 0.011*"บุตร" + 0.010*"เด็ก" + 0.009*"หย่า"'),
 (3,
  '0.057*"ที่ดิน" + 0.019*"โอน" + 0.017*"แบ่ง" + 0.017*"บ้าน" + 0.016*"สิทธิ" + 0.014*"ทรัพย์สิน" + 0.013*"มรดก" + 0.012*"ผู้จัดการมรดก" + 0.011*"ทายาท" + 0.011*"ขาย"'),
 (4,
  '0.035*"นายจ้าง" + 0.030*"บริษัท" + 0.030*"ลูกจ้าง" + 0.024*"ทำงาน" + 0.017*"จ่าย" + 0.014*"งาน" + 0.013*"สิทธิ" + 0.013*"พนักงาน" + 0.012*"เลิกจ้าง" + 0.011*"กรณี"'),
 (5,
  '0.047*"มาตรา" + 0.018*"สิทธิ" + 0.017*"โจทก์" + 0.016*"ปพพ" + 0.016*"ศาล" + 0.016*"บุตร" + 0.015*"สามี" + 0.010*"ภริยา" + 0.010*"ตามกฎหมาย" + 0.010*"จำเลย"'

In [11]:
topic_dict = {
    0 : "Contract",
    1 : "Labor",
    2 : "Undefine :)",
    3 : "Personal Right",
    4 : "Family",
    5 : "Succession",
}

with open('model/topic_dict.pkl', 'wb') as f:
    pickle.dump(topic_dict, f)


In [None]:
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()
LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(num_topics))

LDAvis_data_filepath
if 1 == 1:
    LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(num_topics) +'.html')
LDAvis_prepared

The sections below are in the process of being updated with new documents.

In [None]:
new_doc = random.choice(test_data)
print(new_doc)
test_doc = mymodule.preprocess(new_doc)

In [None]:
lda_model = LdaModel.load('model/lda_model.pkl')
new_doc_topics = lda_model.get_document_topics(test_doc)
new_doc_topics_dict = {topic_dict[topic]: prob for topic, prob in new_doc_topics}
print(new_doc_topics_dict)

In [None]:
corpus = gensim.corpora.MmCorpus('model/corpus.mm')
data = pd.read_csv('dataset/DatasetLegal.csv')
with open('model/id2word.pkl', 'rb') as f:
  id2word = pickle.load(f)
corpus_lda = lda_model[corpus]
index = similarities.MatrixSimilarity(corpus_lda, num_features=len(id2word))
sims = index[new_doc_topics]
print(type(sims))
sims_sorted = sorted(enumerate(sims), key=lambda item: -item[1])
print(f"Topic distribution for new document : {new_doc_topics}\n{new_doc}\n")
for doc_id, similarity in sims_sorted[:5]:
    print(f"Document ID: {doc_id}, Similarity score: {similarity}")
    print(data.answer[doc_id])
    print("Topic distribution for similar document : ")
    for num, dis in corpus_lda[doc_id]:
      print(f"\t({topic_dict.get(num)}, {'%.5f' %dis})")
    #print("\n")
    #print(f"Topic distribution for similar document : \n{corpus_lda[doc_id]}\n")