<a href="https://colab.research.google.com/github/PreamJ/LDA_TopicModelling_Legal/blob/main/topicmodeling_answer03_coherence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import gensim 
import gensim.corpora as corpora
from gensim import models
import pyLDAvis
from pprint import pprint
import pickle 
import os
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel

In [2]:
data = pd.read_csv('dataset/DatasetLegal.csv')
data.answer.head()

0    ดอกเบี้ยของเงินฝากสินส่วนตัวของภริยาย่อมเป็นดอ...
1    คุณสามารถร้องขอเป็นผู้จัดการมรดกของบิดาได้ส่วน...
2    เช็คลงวันที่ไม่ตรงกับวันครบกำหนดสัญญา้จะฟ้องร้...
3    คุณมีทางเลือกอยู่ประการประการแรกยื่นคำร้องเพื่...
4    คดีที่มีการฟ้องร้องแล้วและศาลอ่านคำพิพากษาเกิน...
Name: answer, dtype: object

In [3]:
str_answer = data['answer'].astype(str)
str_answer = str_answer.map(lambda x: re.sub('[,.!?#/]', '', x))
str_question = data['question'].astype(str)
str_question = str_question.map(lambda x: re.sub('[,.!?*/]', '', x))

In [4]:
sentense_token = []
for i in range(len(str_answer)):
  sentense_token.append(str_question[i])
  sentense_token.append(str_answer[i])

train_data = sentense_token[:13061]
test_data = sentense_token[13062:]

In [5]:
word_token_answer = []
for sentense in train_data:
  word = word_tokenize(sentense, engine='newmm')
  word_token_answer.append(word)

print(word_token_answer[0][:10])

['ดิฉัน', 'มีเรื่อง', 'ปรึกษา', 'เกี่ยวกับ', 'คดี', 'ครอบครัว', 'คือ', 'ว่า', 'ดิฉัน', 'พอ']


In [6]:
stopwords = list(thai_stopwords())
read_stopwords = pd.read_csv('dataset/add_stopwords.csv')
add_stopwords = read_stopwords['stopword'].values.tolist()
processed_answer = []
for sentense in word_token_answer:
  each_sentense = []
  for word in sentense:
    if(word not in stopwords + add_stopwords):
      each_sentense.append(word)
  processed_answer.append(each_sentense)
print(processed_answer[0][:8])

['มีเรื่อง', 'ปรึกษา', 'ครอบครัว', 'สินสมรส', 'สินส่วนตัว', 'การทราบ', 'ข้อเท็จจริง', 'วิธีการ']


In [8]:
id2word = corpora.Dictionary(processed_answer)
corpus = []
for text in processed_answer:
  vec = id2word.doc2bow(text)
  corpus.append(vec)

In [10]:
num_topics = range(4, 10)

alpha_values = [0.01, 0.05, 0.1, 0.5, 0.9]

beta_values = [0.01, 0.05, 0.1, 0.5, 0.9]

# Initialize lists to store coherence scores and model parameters
coherence_scores = []
topic_params = []
alpha_params = []
beta_params = []

# Loop over different combinations of topic, alpha, and beta parameter values
for t in num_topics:
    for a in alpha_values:
        for b in beta_values:
            
            # Train the LDA model with the given parameters
            lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                        id2word=id2word,
                                                        num_topics=t,
                                                        iterations=100,
                                                        chunksize=2000,
                                                        passes=10,
                                                        alpha=a,
                                                        eta=b)
            
            # Calculate coherence score using c_v coherence measure
            coherence_model_lda = CoherenceModel(model=lda_model, texts=processed_answer, dictionary=id2word, coherence='c_v')
            coherence_score = coherence_model_lda.get_coherence()
            
            # Append the coherence score and model parameters to the lists
            coherence_scores.append(coherence_score)
            topic_params.append(t)
            alpha_params.append(a)
            beta_params.append(b)

# Find the combination of parameters that yields the highest coherence score
max_score_index = coherence_scores.index(max(coherence_scores))
optimal_topic_param = topic_params[max_score_index]
optimal_alpha_param = alpha_params[max_score_index]
optimal_beta_param = beta_params[max_score_index]

print(f"Optimal number of topics: {optimal_topic_param}")
print(f"Optimal alpha value: {optimal_alpha_param}")
print(f"Optimal beta value: {optimal_beta_param}")
print(f"Coherence score: {max(coherence_scores)}")

Optimal number of topics: 8
Optimal alpha value: 0.5
Optimal beta value: 0.9
Coherence score: 0.6792171949060689


In [11]:
lda_model.log_perplexity(corpus)

-7.4842501179549