In [61]:

from pythainlp.tokenize import word_tokenize
from gensim.models import KeyedVectors
import numpy as np

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

import pandas as pd

from pythainlp import word_vector
from tqdm import tqdm



# Prepare Dataset

- Read Model and Data

In [35]:
# Load the per-word table; later cells rely on its 'word' and 'is_stop_word' columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
all_words = pd.read_pickle('word_count.pickle')

In [36]:
# Load the "thai2fit_wv" word-vector model through pythainlp's WordVector wrapper.
model = word_vector.WordVector(model_name="thai2fit_wv").get_model()

- Filter out the words that are not in the word-vector model's vocabulary

In [None]:
# Build a word -> vector lookup for every word in the model's vocabulary, then
# turn it into a DataFrame indexed by word (one row per word).
# gensim 4 renamed `index2word` to `index_to_key` and raises on the old name;
# accept either so the cell runs with whichever gensim version is installed.
vocab = model.index_to_key if hasattr(model, 'index_to_key') else model.index2word
thai2dict = {word: model[word] for word in vocab}
thai2vec = pd.DataFrame.from_dict(thai2dict, orient='index')
thai2vec.head(10)

In [38]:
# Keep only the rows whose word appears in thai2vec's index (i.e. has a vector).
all_words = all_words.loc[all_words['word'].isin(thai2vec.index)]

In [None]:
# Spot check: show the row(s) for the word 'ทุจริต' after the vocabulary filter.
all_words.loc[all_words['word'].eq('ทุจริต')]

- Remove stop words (e.g. และ หรือ )

In [40]:
# Drop the rows flagged as stop words.
# Take an explicit copy: score columns are assigned onto `all_words` further
# down, and assigning onto a filtered slice triggers SettingWithCopyWarning.
all_words = all_words[~all_words['is_stop_word']].copy()

# Word2Vec

- Test Word

In [None]:
# Distance between two closely related words ('เรียน' ~ "to learn", 'ศึกษา' ~ "to study").
model.distance('เรียน','ศึกษา')

In [None]:
# Same anchor word against a less related one ('เกเร' ~ "unruly"), for comparison.
model.distance('เรียน','เกเร')

In [None]:
# 1 - cosine similarity for the 'เรียน' / 'ศึกษา' pair.
# NOTE(review): presumably equals model.distance('เรียน','ศึกษา') — compare the two outputs to confirm.
1 - model.cosine_similarities(model.get_vector('เรียน'), [model.get_vector('ศึกษา')])

In [None]:
# Cosine similarity between the sum of two component vectors ('ความคิด' + 'สร้างสรรค์')
# and the vector of the compound word 'ความคิดสร้างสรรค์' ("creativity").
model.cosine_similarities(model.get_vector('ความคิด') + model.get_vector('สร้างสรรค์'), [model.get_vector('ความคิดสร้างสรรค์')])

In [None]:
# Baseline cosine similarity: 'พระราชา' ("king") vs 'ราชินี' ("queen").
model.cosine_similarities(model.get_vector('พระราชา'), [model.get_vector('ราชินี')])

In [None]:
# Baseline cosine similarity: 'ผู้หญิง' ("woman") vs 'ราชินี' ("queen").
model.cosine_similarities(model.get_vector('ผู้หญิง'), [model.get_vector('ราชินี')])

In [None]:
# Vector-arithmetic check: cosine similarity of 'พระราชา' + 'ผู้หญิง' ("king" + "woman")
# against 'ราชินี' ("queen").
model.cosine_similarities(model.get_vector('พระราชา') + model.get_vector('ผู้หญิง'), [model.get_vector('ราชินี')])

- Set Seed Words

In [48]:
import json

# Load the seed-word definitions. Each entry is read as a mapping with a
# 'culture' name and a 'seed_words' list whose items carry a 'word' field.
# The file is opened with an explicit UTF-8 encoding (it holds Thai text and the
# platform default may differ) and is closed as soon as it has been parsed.
with open('thai_seed_words.json', 'r', encoding='utf-8') as seed_file:
    seed_word_dict = json.load(seed_file)

# Flatten every entry to {'culture': <name>, 'seed_words': [<word>, ...]}.
culture_list = [
    {
        "culture": entry['culture'],
        "seed_words": [seed['word'] for seed in entry['seed_words']],
    }
    for entry in seed_word_dict
]

In [49]:
# Score every word against each culture and store the result in a column named
# after the culture: score = 1 - (smallest model.distance to any seed word).
# The former `culture = culture_list[0]` line was dead code (the loop rebinds
# `culture` immediately) and has been removed. After the loop `culture` stays
# bound to the last entry of `culture_list`.
for culture in culture_list:
    seed_words = culture['seed_words']
    all_words[culture['culture']] = 1 - all_words['word'].apply(
        lambda word: np.min([model.distance(word, seed) for seed in seed_words])
    )

In [None]:
# Top 10 words by score for whichever culture `culture` is currently bound to —
# assuming the cells ran top to bottom, that is the last entry of `culture_list`.
all_words.sort_values(culture['culture'], ascending=False).head(10)

In [51]:
# Save the word table, including the per-culture score columns, to disk.
all_words.to_pickle('word2vec_result.pickle')

# Example Word2Vec Result

- Example top related keywords for "นวัตกรรม" (the value in column "นวัตกรรม" is 1 minus the smallest word-vector distance between the word in the "word" column and any seed word of the "นวัตกรรม" culture, so a word that is itself a seed word scores 1.0)

In [96]:
# Top 20 words by their 'นวัตกรรม' score, highest first.
all_words.sort_values('นวัตกรรม', ascending=False).head(20)

Unnamed: 0,word,cnt,is_stop_word,นวัตกรรม,ธรรมาภิบาล,คุณภาพ,เคารพ,ทีม
1446624,นวัตกรรม,31967,False,1.0,0.354299,0.319564,0.29439,0.23319
1492583,ริเริ่ม,3364,False,1.0,0.3273,0.419641,0.282573,0.433864
1468442,พัฒนา,183306,False,1.0,0.24669,0.232528,0.131518,0.305168
1459590,ประดิษฐ์,2448,False,1.0,0.161473,0.194929,0.112031,0.156401
1516139,สร้างสรรค์,12772,False,1.0,0.325576,0.351208,0.318452,0.26754
1557472,เทคโนโลยี,76878,False,1.0,0.152743,0.175186,0.107813,0.149807
1396247,คิดค้น,3575,False,0.617744,0.242781,0.336439,0.225612,0.245349
1518826,สิ่งประดิษฐ์,221,False,0.58568,0.348573,0.273846,0.353244,0.233263
1394221,ความคิดสร้างสรรค์,1854,False,0.573561,0.525808,0.411409,0.558454,0.2858
1505407,วางรากฐาน,621,False,0.571945,0.423473,0.497671,0.336743,0.379857


- Example top related keywords for "ธรรมาภิบาล"

In [97]:
# Top 20 words by their 'ธรรมาภิบาล' score, highest first.
all_words.sort_values('ธรรมาภิบาล', ascending=False).head(20)

Unnamed: 0,word,cnt,is_stop_word,นวัตกรรม,ธรรมาภิบาล,คุณภาพ,เคารพ,ทีม
1485210,ยุติธรรม,17608,False,0.231338,1.0,0.253447,0.431847,0.351741
1415749,ซื่อสัตย์,2959,False,0.325576,1.0,0.472141,0.516447,0.362181
1442748,ธรรมาภิบาล,14721,False,0.165712,1.0,0.157902,0.231245,0.146935
1491217,รับผิดชอบ,90540,False,0.3273,1.0,0.379834,0.200453,0.332295
1403920,จริยธรรม,55505,False,0.354299,1.0,0.305551,0.477577,0.348913
1576745,โปร่งใส,75994,False,0.290011,1.0,0.326442,0.343097,0.258747
1451862,น่าเชื่อถือ,3850,False,0.278831,1.0,0.376555,0.307071,0.173026
1525862,หลักวิชาการ,541,False,0.167206,0.99916,0.155584,0.222078,0.138438
1485220,ยุทธภัณฑ์,153,False,0.170214,0.999113,0.159838,0.234494,0.141927
1469296,พิธีรีตอง,7,False,0.168915,0.999072,0.159665,0.233893,0.150203


- Example top related keywords for "ทีม"

In [100]:
# Top 20 words by their 'ทีม' score, highest first.
all_words.sort_values('ทีม', ascending=False).head(20)

Unnamed: 0,word,cnt,is_stop_word,นวัตกรรม,ธรรมาภิบาล,คุณภาพ,เคารพ,ทีม
1518368,สามัคคี,2100,False,0.305168,0.362181,0.218562,0.451115,1.0
1494266,ร่วมกับ,49859,False,0.281459,0.248762,0.117017,0.182443,0.574215
1580795,ให้ความร่วมมือ,8434,False,0.434687,0.507503,0.502668,0.341481,0.567438
1404531,จับมือ,417,False,0.338608,0.263883,0.455466,0.347381,0.534257
1474677,ภูมิใจ,607,False,0.253844,0.375428,0.322116,0.330857,0.533006
1563843,เผชิญหน้า,48,False,0.238341,0.295074,0.363754,0.235718,0.525496
1394502,ความสามัคคี,1698,False,0.385146,0.446978,0.218038,0.580703,0.504299
1452388,น้ำใจ,441,False,0.286995,0.543832,0.377213,0.74743,0.504222
1489785,รวมกลุ่ม,2313,False,0.426129,0.265762,0.340673,0.227242,0.499722
1438530,ทำสัญญา,4552,False,0.414734,0.291991,0.457581,0.220137,0.496972


# Summarize Result

In [52]:
# Score a word must exceed in a culture's column to count as a matched keyword
# for that culture. Every culture currently shares the same cut-off.
threshold_dict = dict.fromkeys(
    ['นวัตกรรม', 'ธรรมาภิบาล', 'คุณภาพ', 'เคารพ', 'ทีม'],
    0.4,
)

In [53]:
# Load the entries to be scored; later cells read its 'Tokenized' and 'Symbol' columns.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df = pd.read_pickle('entriesOct.pickle')

In [None]:
# For every entry, look up the culture scores of its tokens and summarise them
# per culture:
#   n_matched_keyword_*             tokens whose score exceeds the culture's threshold
#   pct_matched_keyword_*           that count divided by the entry's token count
#   avg_score_on_keyword_*          mean score over all of the entry's tokens
#   avg_score_on_matched_keyword_*  mean score over the matched tokens only
# Tokens absent from `all_words` receive a score of 0 (left merge + fillna), so
# they count towards the token total and the overall mean.

# Only the columns needed for scoring take part in the per-entry merge.
score_lookup = all_words[['word', *threshold_dict]]

row_result_dict_list = []
for tokens in tqdm(df['Tokenized']):
    row_result_df = (
        pd.DataFrame(tokens, columns=['word'])
        .merge(score_lookup, how='left', on='word')
        .fillna(0)
    )
    n_tokens = row_result_df.shape[0]
    row_result_dict = {}
    for culture, threshold in threshold_dict.items():
        scores = row_result_df[culture]
        # Build the match mask once and reuse it for every matched-token metric.
        matched = scores > threshold
        n_matched = matched.sum()
        row_result_dict[f'n_matched_keyword_{culture}'] = n_matched
        # An entry without tokens has no meaningful share: keep it NaN (what the
        # 0/0 division produced before) without relying on a runtime warning.
        row_result_dict[f'pct_matched_keyword_{culture}'] = n_matched / n_tokens if n_tokens else np.nan
        row_result_dict[f'avg_score_on_keyword_{culture}'] = scores.mean()
        # Mean of an empty selection is NaN when nothing matched.
        row_result_dict[f'avg_score_on_matched_keyword_{culture}'] = scores[matched].mean()
    row_result_dict_list.append(row_result_dict)

In [69]:
# One row per entry (same order as `df`), one column per metric/culture pair.
result_df = pd.DataFrame(row_result_dict_list)

In [80]:
# Put the per-entry metrics next to the original entry columns. `df`'s index is
# reset so both frames line up positionally. Note that fillna(0) replaces every
# NaN in the combined frame, including any in `df`'s own columns.
summarized_result_df = pd.concat(
    [df.reset_index(drop=True), result_df],
    axis='columns',
).fillna(0)

In [81]:
# Sanity check: the three frames should share a row count, and the combined
# frame's column count should be the sum of the other two.
df.shape, result_df.shape, summarized_result_df.shape

((22278, 9), (22278, 20), (22278, 29))

- Companies that mention นวัตกรรม (innovation) the most: top 10 symbols by mean share of matched keywords per entry

In [93]:
# Rank symbols by the mean share of matched keywords for the chosen culture,
# averaged over each symbol's entries, and show the ten highest.
culture = 'นวัตกรรม'
pct_col = f'pct_matched_keyword_{culture}'
(
    summarized_result_df
    .groupby('Symbol')[pct_col]
    .mean()
    .reset_index()
    .sort_values(pct_col, ascending=False)
    .head(10)
)

Unnamed: 0,Symbol,pct_matched_keyword_นวัตกรรม
580,SAT,0.03498
198,DIF,0.033291
26,AIT,0.032246
529,PT,0.028912
369,LEE,0.026866
646,SPI,0.026192
173,CPN,0.025966
1,3K-BAT,0.024516
386,LVT,0.023946
831,VCOM,0.023798


- Example Top Related Keyword of ""