In [2]:
import pandas as pd
import numpy as np
import re
import gensim
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_stopwords
from wordcloud import WordCloud
from gensim import corpora, models, similarities
import pyLDAvis
from pprint import pprint
import pickle 
import os
import matplotlib.pyplot as plt
from gensim.models import CoherenceModel
from gensim.test.utils import datapath
import random
from gensim.models.ldamodel import LdaModel
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from gensim.corpora import MmCorpus
import csv
import pyLDAvis.gensim_models as gensimvis
from sklearn import metrics
# Seed constant for the notebook.
# NOTE(review): no cell visible here reads RANDOM_STATE — confirm it is
# actually passed to the stochastic steps it is presumably meant to control.
RANDOM_STATE = 1

In [3]:
# Raw dataset; a later cell reads data['question'] from it.
data = pd.read_csv('dataset/DatasetLegal.csv')
# Pre-computed artifacts from the preprocessing step.
# NOTE: pickle.load can execute arbitrary code — only load files you produced
# yourself or otherwise trust.
with open('preprocessing/train_corpus.pkl', 'rb') as f:
  train_corpus = pickle.load(f)
with open('preprocessing/test_corpus.pkl', 'rb') as f:
  test_corpus = pickle.load(f)
with open('preprocessing/id2word.pkl', 'rb') as f:
  id2word = pickle.load(f)
with open('preprocessing/train_data.pkl', 'rb') as f:
  train_data = pickle.load(f)
# One pickled model per topic count 5..30, keyed by that topic count.
lda_models = {}
for i in range(5,31):
  with open(f'ldamodel/lda_model_{i}.pkl', 'rb') as f:
    lda_models[i] = pickle.load(f)
# Bag-of-words vector for every entry of train_data['question'].
# assumes each entry is already a list of tokens — TODO confirm
question_corpus = []
for text in train_data['question'].values.tolist():
  vec = id2word.doc2bow(text)
  question_corpus.append(vec)
# Same conversion for train_data['answer'], kept in the same row order.
answer_corpus = []
for text in train_data['answer'].values.tolist():
  vec = id2word.doc2bow(text)
  answer_corpus.append(vec)
# Output CSV paths that accuracy() and cnfMx() append to.
accuracy_file = "evaluation/accuracy_file.csv"
cnfMx_file = 'evaluation/cnfMx_file.csv'

In [4]:
def tag(num_topics):
    """Tag every training question and answer with the topics it belongs to.

    A document is tagged with a topic when the model's probability for that
    topic is strictly greater than the uniform baseline ``1 / num_topics``.

    Args:
        num_topics: Key into ``lda_models`` selecting the model to use; also
            the length of every tag vector produced.

    Returns:
        A tuple ``(question_predict, answer_predict)``. Each is a list with
        one entry per document of ``question_corpus`` / ``answer_corpus``,
        and each entry is a list of exactly ``num_topics`` ints in which
        position ``k`` is 1 if topic ``k`` is assigned and 0 otherwise.

    Raises:
        KeyError: If no model is loaded for ``num_topics``.
    """
    lda_model = lda_models[num_topics]
    threshold = 1 / num_topics

    def _tag_corpus(corpus):
        """Return one dense 0/1 topic row per bag-of-words document."""
        tags = []
        for bow in corpus:
            # Indexing the model yields a *sparse* list of
            # (topic_id, probability) pairs: topics below the model's minimum
            # probability are omitted. Writing by topic_id into a fixed-length
            # row keeps every row num_topics long and keeps column k aligned
            # with topic k, which the per-topic metrics downstream rely on.
            row = [0] * num_topics
            for topic_id, prob in lda_model[bow]:
                if prob > threshold:
                    row[topic_id] = 1
            tags.append(row)
        return tags

    return _tag_corpus(question_corpus), _tag_corpus(answer_corpus)

In [4]:
def accuracy(num_topics, arr):
    """Append one row of per-topic accuracies, plus their mean, to ``accuracy_file``.

    Args:
        num_topics: Number of topic columns to score.
        arr: Pair of tag matrices; ``arr[0]`` holds the tags derived from the
            questions (treated as predictions) and ``arr[1]`` the tags derived
            from the answers (treated as ground truth).

    The written row contains each topic's accuracy formatted to two decimals,
    followed by the unformatted mean accuracy as its last field.
    """
    expected = np.array(arr[1])
    predicted = np.array(arr[0])

    # Score topic by topic: column k of both matrices is compared on its own.
    per_topic_scores = []
    for topic in range(num_topics):
        truth_column = [row[topic] for row in expected]
        predicted_column = [row[topic] for row in predicted]
        per_topic_scores.append(metrics.accuracy_score(truth_column, predicted_column))

    csv_row = [f"{score:.2f}" for score in per_topic_scores]
    csv_row.append(sum(per_topic_scores) / num_topics)

    # Append mode: every call adds one more row to the same file.
    with open(accuracy_file, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(csv_row)
    print("Accuracy have been written.")

In [5]:
def cnfMx(num_topics, arr):
    """Append the classification report for one tagging run to ``cnfMx_file``.

    Only the text report is persisted; no confusion matrix is written to the
    file.

    Args:
        num_topics: Accepted for symmetry with ``accuracy``; not used here.
        arr: Pair of tag matrices; ``arr[0]`` holds the tags derived from the
            questions (treated as predictions) and ``arr[1]`` the tags derived
            from the answers (treated as ground truth).
    """
    report = metrics.classification_report(arr[1], arr[0])
    # Append mode: reports from successive calls are concatenated in one file.
    with open(cnfMx_file, 'a', newline='') as csvfile:
        csvfile.write(report)
    print("Classification report has been written.")

In [None]:
# Evaluate every loaded model (topic counts 5..30): tag the documents once,
# then append the accuracy row and the classification report to their files.
# Both outputs are opened in append mode, so re-running this cell adds
# further rows instead of replacing the earlier ones.
for i in range(5,31):
    arr = tag(i)
    accuracy(i, arr)
    cnfMx(i, arr)
    print(f"report haas been written {i}")

Search evaluation: rank the training documents by their similarity to a query

In [5]:
def create_corpus(list_data):
    """Convert documents into bag-of-words vectors using ``id2word``.

    Args:
        list_data: Iterable of documents; each one is passed unchanged to
            ``id2word.doc2bow`` (presumably a list of tokens — verify against
            the caller).

    Returns:
        A list with one ``doc2bow`` result per input document, in input order.
    """
    return [id2word.doc2bow(document) for document in list_data]

In [6]:
from gensim.similarities import MatrixSimilarity
def find_sim(corpus, query, top_n=None):
    """Print the documents of ``corpus`` ranked by similarity to ``query``.

    Example inputs::

        query  = [(1, 1), (2, 1), (3, 2)]
        corpus = [[(1, 1), (2, 1), (3, 1)],
                  [(2, 1), (4, 1), (5, 1)],
                  [(1, 2), (5, 1), (6, 1)]]

    Args:
        corpus: Bag-of-words documents to search, e.g. ``train_corpus``.
        query: Bag-of-words vector of the query document.
        top_n: Optional non-negative number of best matches to print.
            ``None`` (the default) prints the whole ranking.

    The similarity index is rebuilt from ``corpus`` on every call, and
    documents are reported with 1-based numbers, best match first.
    """
    similarity_index = MatrixSimilarity(corpus)
    scores = similarity_index[query]
    # Negated key gives a descending sort while keeping ties in corpus order.
    ranked = sorted(enumerate(scores), key=lambda item: -item[1])
    # Slicing with None keeps everything, so the default output is unchanged.
    for doc_index, similarity in ranked[:top_n]:
        print(f"Document {doc_index + 1}: Similarity = {similarity:.4f}")

In [7]:
def preprocess(text):
    """Tokenise ``text`` and drop stopwords.

    Args:
        text: Raw input; it is converted with ``str()`` and all space
            characters are removed before tokenisation.

    Returns:
        List of tokens produced by ``word_tokenize`` (``newmm`` engine) that
        appear neither in ``thai_stopwords()`` nor in the ``stopword`` column
        of ``dataset/add_stopwords.csv``, in their original order.

    Note:
        The extra stopword CSV is read from disk on every call.
    """
    base_stopwords = thai_stopwords()
    extra_stopwords = pd.read_csv('dataset/add_stopwords.csv')['stopword'].values.tolist()
    # Build the lookup once, as a set: the membership test below then costs
    # O(1) per token instead of re-concatenating and scanning two lists.
    blocked = set(base_stopwords).union(extra_stopwords)

    compact_text = str(text).replace(' ', '')
    tokens = word_tokenize(compact_text, engine='newmm')
    return [word for word in tokens if word not in blocked]

In [8]:
# Build a query from the question at index label 0 of the raw dataset:
# tokenise and filter it with preprocess(), then map it to bag-of-words ids.
# Note: this rebinds `vec`, which the data-loading cell also assigns.
a = data['question'][0]
a = preprocess(a)
vec = id2word.doc2bow(a)

In [9]:
# Rank every document of train_corpus by similarity to the query vector `vec`
# and print the full ranking, best match first (document numbers are 1-based).
# NOTE(review): these statements repeat the body of find_sim(); calling
# find_sim(train_corpus, vec) would print the same lines.
# NOTE(review): assigning to `similarities` rebinds the name brought in by
# `from gensim import corpora, models, similarities` in the import cell.
similarity_index = MatrixSimilarity(train_corpus)
similarities = similarity_index[vec]
sorted_similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
for doc_index, similarity in sorted_similarities:
    print(f"Document {doc_index + 1}: Similarity = {similarity:.4f}")

Document 1: Similarity = 1.0000
Document 5626: Similarity = 0.5798
Document 7630: Similarity = 0.5507
Document 7293: Similarity = 0.5339
Document 5945: Similarity = 0.4935
Document 6711: Similarity = 0.4905
Document 7428: Similarity = 0.4802
Document 5700: Similarity = 0.4780
Document 7355: Similarity = 0.4714
Document 7217: Similarity = 0.4681
Document 6106: Similarity = 0.4640
Document 7323: Similarity = 0.4578
Document 2034: Similarity = 0.4562
Document 5809: Similarity = 0.4556
Document 7604: Similarity = 0.4547
Document 7715: Similarity = 0.4449
Document 1991: Similarity = 0.4386
Document 1788: Similarity = 0.4384
Document 1654: Similarity = 0.4379
Document 7416: Similarity = 0.4342
Document 1667: Similarity = 0.4326
Document 2346: Similarity = 0.4326
Document 8886: Similarity = 0.4202
Document 190: Similarity = 0.4200
Document 1947: Similarity = 0.4197
Document 10884: Similarity = 0.4182
Document 729: Similarity = 0.4153
Document 9154: Similarity = 0.4145
Document 8113: Similarit