In [9]:

from pythainlp.tokenize import word_tokenize
from gensim.models import KeyedVectors
import numpy as np

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

import pandas as pd

from pythainlp import word_vector
from pythainlp.corpus.common import thai_stopwords
from tqdm import tqdm
import re

from pathlib import Path



In [10]:
# --- Embedding models -------------------------------------------------------
# LTW2V vectors, read with binary=True.
model = KeyedVectors.load_word2vec_format(
    '/Users/thammasorn.h/Desktop/research-assist/model/LTW2V_v1.0-window5.bin',
    binary=True,
    unicode_errors='ignore',
)
# thai2fit_wv vectors, obtained through pythainlp's WordVector wrapper.
model_original = word_vector.WordVector(model_name="thai2fit_wv").get_model()
# Vectors stored under saved_model/, read with binary=False.
# NOTE(review): the file is named model.bin but is loaded with binary=False —
# confirm this matches how it was saved.
model_trained = KeyedVectors.load_word2vec_format(
    '/Users/thammasorn.h/Desktop/research-assist/model/saved_model/model.bin',
    binary=False,
    unicode_errors='ignore',
)

# --- Seed words -------------------------------------------------------------
seed_word_df = pd.read_csv('seed-words.csv')
seed_word_list = seed_word_df['Keyword (TH)'].tolist()

# --- Word counts, restricted to words every model knows ---------------------
df = pd.read_pickle('word_count.pickle')
raw_words = df['word']
known_to_every_model = (
    raw_words.isin(model.vocab.keys())
    & raw_words.isin(model_trained.vocab.keys())
    & raw_words.isin(model_original.index2word)
)
df = df[known_to_every_model]

# Flag stop words, then drop them. The flag column is kept on the frame
# (it is False for every surviving row).
stop_words = thai_stopwords()
df['is_stop_word'] = df['word'].apply(lambda candidate: candidate in stop_words)
df = df[~df['is_stop_word']]

def clean_word(word):
    """Return ``word`` with every non-Thai character removed.

    A character is kept only if it lies in the range ก (U+0E01) to ๙ (U+0E59),
    which includes the Thai digits ๐-๙. Everything else (Latin letters, ASCII
    digits, punctuation, whitespace, ...) is dropped. The result may be an
    empty string.
    """
    # Splitting on each unwanted character and gluing the pieces back together
    # leaves exactly the characters inside the allowed range.
    thai_pieces = re.split(r'[^ก-๙]', word)
    return ''.join(thai_pieces)

# Strip non-Thai characters from every token.
df['word'] = df['word'].apply(clean_word)
# NOTE(review): cleaning can map different raw tokens onto the same cleaned
# word, which would leave duplicate rows in `df` — confirm whether the rows
# should be de-duplicated before ranking.

# Keep only cleaned tokens that are non-empty and longer than one character.
cleaned_words = df['word']
long_enough = (cleaned_words != '') & (cleaned_words.apply(len) > 1)
df = df[long_enough]

# The cleaned form must itself be a vocabulary entry of all three models.
cleaned_words = df['word']
still_in_every_vocab = (
    cleaned_words.isin(model.vocab.keys())
    & cleaned_words.isin(model_original.index2word)
    & cleaned_words.isin(model_trained.index2word)
)
df = df[still_in_every_vocab]

# Attach each word's embedding from the trained model.
df['vector'] = df['word'].apply(model_trained.get_vector)

In [11]:
# Number of candidate words left after filtering and cleaning.
len(df)

20108

In [12]:
# For every seed word, build two neighbour tables ranked by cosine similarity:
#   * 'top_similar_words_from_model_vocab' - nearest words in the whole
#     trained-model vocabulary (gensim's own ranking).
#   * 'top_similar_words_from_dataset'     - nearest words among the rows of `df`.
# Both tables have the columns ['word', 'similarity_score'] and are stored as
# result[seed_word][table_name]. Every seed word gets an entry in `result`.
TOP_N = 30
RESULT_COLUMNS = ['word', 'similarity_score']

# Fetched once instead of once per token.
stop_words = thai_stopwords()

# Stack the per-row vectors a single time so that each seed needs only one
# vectorised cosine computation instead of one Python-level call per row.
if len(df):
    candidate_vectors = np.vstack(df['vector'].values)
else:
    candidate_vectors = np.empty((0, model_trained.vector_size), dtype=np.float32)

result = {}
for seed_word in seed_word_list:
    if seed_word in model_trained.vocab:
        # The seed is itself a vocabulary entry: use its vector directly.
        seed_vector = model_trained.get_vector(seed_word)
        top_similar_words = model_trained.most_similar(seed_word, topn=TOP_N)
    else:
        # The seed is not a single vocabulary entry: tokenise it and sum the
        # vectors of its non-stop-word tokens. Tokens the model does not know
        # are skipped (get_vector would raise KeyError for them).
        tokens = [t for t in word_tokenize(seed_word) if t not in stop_words]
        known_tokens = [t for t in tokens if t in model_trained.vocab]
        if len(known_tokens) < len(tokens):
            missing_tokens = [t for t in tokens if t not in model_trained.vocab]
            print(f'{seed_word!r}: ignoring out-of-vocabulary tokens {missing_tokens}')
        if not known_tokens:
            # An all-zero seed vector would turn every cosine score into NaN,
            # so store empty tables instead of meaningless rankings.
            print(f'{seed_word!r}: no usable tokens, storing empty tables')
            result[seed_word] = {
                'top_similar_words_from_model_vocab': pd.DataFrame(columns=RESULT_COLUMNS),
                'top_similar_words_from_dataset': pd.DataFrame(columns=RESULT_COLUMNS),
            }
            continue
        seed_vector = np.sum(
            [model_trained.get_vector(t) for t in known_tokens], axis=0
        )
        top_similar_words = model_trained.similar_by_vector(seed_vector, topn=TOP_N)

    # Cosine similarity between the seed vector and every row of `df`.
    # The column is written onto `df` itself, so after the loop it holds the
    # scores of the last seed that was scored.
    df['similarity_score'] = model_trained.cosine_similarities(
        seed_vector, candidate_vectors
    )
    top_similar_words_in_report = (
        df.sort_values('similarity_score', ascending=False).head(TOP_N)
    )
    result[seed_word] = {
        'top_similar_words_from_model_vocab': pd.DataFrame(
            top_similar_words, columns=RESULT_COLUMNS
        ),
        'top_similar_words_from_dataset': (
            top_similar_words_in_report[RESULT_COLUMNS].reset_index(drop=True)
        ),
    }

In [13]:
# Write each seed word's two neighbour tables to
# result/nb_05/trained/<seed_word>/<table_name>.csv and, in parallel, collect
# a copy of every table tagged with its seed word for the combined export.
RESULT_NAMES = ['top_similar_words_from_model_vocab', 'top_similar_words_from_dataset']
frames_by_result_name = {result_name: [] for result_name in RESULT_NAMES}

for seed_word in seed_word_list:
    Path(f'result/nb_05/trained/{seed_word}/').mkdir(exist_ok=True, parents=True)
    for result_name in RESULT_NAMES:
        table = result[seed_word][result_name]
        table.to_csv(f'result/nb_05/trained/{seed_word}/{result_name}.csv', index=False)
        # assign() returns a new frame, so the table stored in `result` is not
        # modified. Re-running this cell therefore writes the same per-seed
        # CSVs (without a 'seed_word' column) every time.
        frames_by_result_name[result_name].append(table.assign(seed_word=seed_word))

# Lists of per-seed DataFrames, one list per table kind.
top_similar_words_from_model_vocab = frames_by_result_name['top_similar_words_from_model_vocab']
top_similar_words_from_dataset = frames_by_result_name['top_similar_words_from_dataset']

In [14]:
# Collapse each list of per-seed tables into one long DataFrame.
# The two names are rebound from list to DataFrame here, so this cell expects
# the lists to have been freshly built immediately before it runs.
top_similar_words_from_dataset, top_similar_words_from_model_vocab = [
    pd.concat(per_seed_frames)
    for per_seed_frames in (
        top_similar_words_from_dataset,
        top_similar_words_from_model_vocab,
    )
]

In [15]:
# Write the two combined tables (all seed words together) next to the
# per-seed folders; the DataFrame index is not written.
for combined_table, combined_csv_path in (
    (top_similar_words_from_dataset, 'result/nb_05/trained/top_similar_words_from_dataset.csv'),
    (top_similar_words_from_model_vocab, 'result/nb_05/trained/top_similar_words_from_model_vocab.csv'),
):
    combined_table.to_csv(combined_csv_path, index=False)