In [112]:

from pythainlp.tokenize import word_tokenize
from gensim.models import KeyedVectors
import numpy as np

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

import pandas as pd

from pythainlp import word_vector
from tqdm import tqdm



# Prepare Dataset

- Read Model and Data

In [113]:
# Load the precomputed word-frequency table. The outputs further down show
# that it carries the columns `word`, `cnt` and `is_stop_word`.
# NOTE(review): read_pickle deserializes a pickle file, which can execute
# arbitrary code -- only load 'word_count.pickle' from a trusted source.
all_words = pd.read_pickle('word_count.pickle')

In [114]:
# Pretrained Thai word-embedding model used for every similarity lookup below.
# NOTE(review): presumably a gensim KeyedVectors object (the notebook calls
# index2word / distance / get_vector / cosine_similarities on it) -- confirm
# against the installed pythainlp version.
model = word_vector.WordVector(model_name="thai2fit_wv").get_model() # load thai2fit_wv from pythainlp

- Filter out the words that are not in the word-vector dictionary

In [115]:
# Build a word -> embedding lookup covering every entry of the model vocabulary.
# gensim < 4 exposes the vocabulary list as `index2word`; gensim >= 4 renamed it
# to `index_to_key` and raises AttributeError on the old name, so accept both.
vocabulary = getattr(model, 'index_to_key', None)
if vocabulary is None:
    vocabulary = model.index2word
thai2dict = {word: model[word] for word in vocabulary}

# One row per vocabulary word, one column per embedding dimension.
thai2vec = pd.DataFrame.from_dict(thai2dict, orient='index')
thai2vec.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
ที่,0.308956,-0.097699,0.116745,0.215612,0.015768,-0.064163,0.062168,0.039649,0.86494,0.846904,...,-0.142418,0.033241,0.171581,-0.624864,-0.009358,0.449131,0.12013,-0.122195,-0.450617,-0.071318
และ,0.010751,-0.618971,0.129665,0.03546,-0.00756,0.027607,0.397824,0.026543,0.254075,0.168328,...,-0.105786,0.18093,-0.10163,0.070885,-0.037263,0.183606,-0.049088,-0.672288,-1.293044,0.592576
เป็น,-0.015736,-0.258926,0.052953,0.153728,-0.005985,-0.021081,0.041088,0.057312,1.63323,0.442729,...,-0.009408,-0.252576,-0.305512,0.372542,0.049151,0.56847,0.266586,0.4008,-0.78465,0.197369
ของ,-0.189711,-0.174774,0.171124,-0.186771,0.054294,-0.11415,-1.109456,-0.094466,-0.447015,0.042377,...,-0.168676,-0.148738,0.680404,0.097702,0.02027,0.182967,-0.083949,0.006287,-0.707434,-0.070234
มี,-0.156962,-0.231863,0.080312,0.323157,0.215695,0.055145,0.420794,0.016842,0.256759,0.832864,...,-0.044267,-0.147186,-0.105424,0.907078,0.009299,0.550953,0.139337,0.031696,-0.670379,-0.008048
ได้,-0.428813,-0.031194,0.041922,-0.036608,-0.008106,0.07647,-0.78227,0.033361,0.606864,0.44052,...,0.024458,-0.025031,0.103389,-0.078255,0.034323,0.459774,-0.748643,0.337775,-0.487408,-0.511535
"""""""""",-0.28771,0.064193,0.205076,0.146356,-0.071343,-0.039451,-1.845461,0.163763,1.018096,0.272786,...,0.051024,-0.532856,-0.131856,-0.090323,-0.058895,0.151262,-0.420358,0.055971,-0.930814,0.163908
การ,0.239587,-0.30362,0.079953,-0.453045,-0.528826,-0.161692,0.235725,-0.099673,0.691668,0.536159,...,-0.110436,-0.297495,-0.217414,0.045158,0.066647,0.190095,-0.304333,-0.724927,-0.995488,-0.716609
(,-0.120522,-0.355783,0.16818,-0.377733,-0.158624,-0.047249,0.36114,0.16146,0.913314,0.345037,...,0.116285,-0.318218,-0.356664,0.519889,0.130475,0.125772,0.101328,-0.382658,-1.205359,0.340139
),-0.086848,-0.155231,0.133015,-0.039913,0.183761,0.115142,-1.940854,-0.066565,-2.399744,0.146722,...,0.019406,-0.181474,0.099863,0.516092,0.201697,0.249139,0.252957,1.138815,-0.018209,0.232265


In [116]:
# Keep only the words that have an embedding, i.e. that appear in the
# index of the thai2vec table.
all_words = all_words.loc[all_words['word'].isin(thai2vec.index)]

In [117]:
# Spot check: show the row of a single word to confirm it survived the filter.
all_words.loc[all_words['word'].eq('ทุจริต')]

Unnamed: 0,word,cnt,is_stop_word
1439232,ทุจริต,95805,False


- Remove stop words (e.g. และ หรือ )

In [118]:
# Drop every row flagged as a stop word.
# The explicit .copy() turns the filtered result into an independent frame:
# the per-culture score columns are assigned onto all_words later in the
# notebook, and writing to a boolean-filtered slice would otherwise raise
# pandas' SettingWithCopyWarning.
all_words = all_words[~all_words['is_stop_word']].copy()

# Word2Vec

- Test Word

In [119]:
# Sanity check on two related words ('เรียน' ~ "to learn", 'ศึกษา' ~ "to study").
# The value equals 1 - cosine similarity (see the matching result two cells
# below), so a smaller number means the words are closer.
model.distance('เรียน','ศึกษา')

0.5563628673553467

In [120]:
# Contrast case: 'เรียน' ~ "to learn" against the unrelated 'เกเร' ~ "unruly".
# The distance is expected to be larger than for the related pair above.
model.distance('เรียน','เกเร')

0.8878999724984169

In [121]:
# Recompute the first distance by hand as 1 - cosine similarity of the two
# vectors; the result matches model.distance('เรียน','ศึกษา') shown earlier.
1 - model.cosine_similarities(model.get_vector('เรียน'), [model.get_vector('ศึกษา')])

array([0.5563628], dtype=float32)

In [122]:
# Vector composition test: does 'ความคิด' (~ "thought") + 'สร้างสรรค์' (~ "creative")
# land near the compound word 'ความคิดสร้างสรรค์' (~ "creativity")?
# Note this returns a similarity (higher = closer), not a distance.
model.cosine_similarities(model.get_vector('ความคิด') + model.get_vector('สร้างสรรค์'), [model.get_vector('ความคิดสร้างสรรค์')])

array([0.49381283], dtype=float32)

In [123]:
# Baseline similarity between 'พระราชา' (~ "king") and 'ราชินี' (~ "queen").
model.cosine_similarities(model.get_vector('พระราชา'), [model.get_vector('ราชินี')])

array([0.24455935], dtype=float32)

In [124]:
# Baseline similarity between 'ผู้หญิง' (~ "woman") and 'ราชินี' (~ "queen").
model.cosine_similarities(model.get_vector('ผู้หญิง'), [model.get_vector('ราชินี')])

array([0.20481385], dtype=float32)

In [125]:
# Composition test: the sum of the "king" and "woman" vectors compared with
# "queen". The result is higher than either single-word baseline above.
model.cosine_similarities(model.get_vector('พระราชา') + model.get_vector('ผู้หญิง'), [model.get_vector('ราชินี')])

array([0.29745445], dtype=float32)

- Set Seed Words

In [126]:
import json

# Load the seed-word definitions. The file contains Thai text, so decode it
# explicitly as UTF-8 instead of relying on the platform default, and use a
# context manager so the file handle is closed deterministically.
with open('thai_seed_words.json', 'r', encoding='utf-8') as seed_file:
    # Despite its name, the loaded value is iterated as a sequence of entries,
    # each holding a 'culture' name and a list of 'seed_words' records.
    seed_word_dict = json.load(seed_file)

# Flatten every entry to its culture name plus a plain list of seed-word
# strings (the 'word' field of each seed-word record).
culture_list = [
    {
        "culture": culture['culture'],
        "seed_words": [w['word'] for w in culture['seed_words']],
    }
    for culture in seed_word_dict
]

In [127]:
# `culture` is bound up front and then rebound by the loop; after the loop it
# keeps pointing at the last entry of culture_list.
culture = culture_list[0]
for culture in culture_list:
    seed_words = culture['seed_words']
    # For every vocabulary word, the distance to its closest seed word.
    nearest_seed_distance = all_words['word'].apply(
        lambda candidate: np.min([model.distance(candidate, seed) for seed in seed_words])
    )
    # Stored as a similarity: 1 minus that smallest distance, in a column
    # named after the culture.
    all_words[culture['culture']] = 1 - nearest_seed_distance

In [128]:
# Ten highest-scoring words for whichever culture the variable `culture`
# currently refers to.
(
    all_words
    .sort_values(by=culture['culture'], ascending=False)
    .head(10)
)

Unnamed: 0,word,cnt,is_stop_word,นวัตกรรม,ธรรมาภิบาล,คุณภาพ,เคารพ,ทีม
1518368,สามัคคี,2100,False,0.305168,0.362181,0.218562,0.451115,1.0
1494266,ร่วมกับ,49859,False,0.281459,0.248762,0.117017,0.182443,0.574215
1580795,ให้ความร่วมมือ,8434,False,0.434687,0.507503,0.502668,0.341481,0.567438
1404531,จับมือ,417,False,0.338608,0.263883,0.455466,0.347381,0.534257
1474677,ภูมิใจ,607,False,0.253844,0.375428,0.322116,0.330857,0.533006
1563843,เผชิญหน้า,48,False,0.238341,0.295074,0.363754,0.235718,0.525496
1394502,ความสามัคคี,1698,False,0.385146,0.446978,0.218038,0.580703,0.504299
1452388,น้ำใจ,441,False,0.286995,0.543832,0.377213,0.74743,0.504222
1489785,รวมกลุ่ม,2313,False,0.426129,0.265762,0.340673,0.227242,0.499722
1438530,ทำสัญญา,4552,False,0.414734,0.291991,0.457581,0.220137,0.496972


In [129]:
# Persist the filtered vocabulary table, including the per-culture similarity
# columns computed above, so later runs can reuse it.
all_words.to_pickle('word2vec_result.pickle')

# Example Word2Vec Result

- Example Top Related Keyword of "นวัตกรรม" (value in column "นวัตกรรม" indicates the similarity score between the word "นวัตกรรม" and the word in "word" column)

In [130]:
# Twenty words with the highest similarity score in the 'นวัตกรรม' column.
(
    all_words
    .sort_values(by='นวัตกรรม', ascending=False)
    .head(20)
)

Unnamed: 0,word,cnt,is_stop_word,นวัตกรรม,ธรรมาภิบาล,คุณภาพ,เคารพ,ทีม
1446624,นวัตกรรม,31967,False,1.0,0.354299,0.319564,0.29439,0.23319
1492583,ริเริ่ม,3364,False,1.0,0.3273,0.419641,0.282573,0.433864
1468442,พัฒนา,183306,False,1.0,0.24669,0.232528,0.131518,0.305168
1459590,ประดิษฐ์,2448,False,1.0,0.161473,0.194929,0.112031,0.156401
1516139,สร้างสรรค์,12772,False,1.0,0.325576,0.351208,0.318452,0.26754
1557472,เทคโนโลยี,76878,False,1.0,0.152743,0.175186,0.107813,0.149807
1396247,คิดค้น,3575,False,0.617744,0.242781,0.336439,0.225612,0.245349
1518826,สิ่งประดิษฐ์,221,False,0.58568,0.348573,0.273846,0.353244,0.233263
1394221,ความคิดสร้างสรรค์,1854,False,0.573561,0.525808,0.411409,0.558454,0.2858
1505407,วางรากฐาน,621,False,0.571945,0.423473,0.497671,0.336743,0.379857


- Example Top Related Keyword of "ธรรมาภิบาล"

In [131]:
# Twenty words with the highest similarity score in the 'ธรรมาภิบาล' column.
(
    all_words
    .sort_values(by='ธรรมาภิบาล', ascending=False)
    .head(20)
)

Unnamed: 0,word,cnt,is_stop_word,นวัตกรรม,ธรรมาภิบาล,คุณภาพ,เคารพ,ทีม
1485210,ยุติธรรม,17608,False,0.231338,1.0,0.253447,0.431847,0.351741
1415749,ซื่อสัตย์,2959,False,0.325576,1.0,0.472141,0.516447,0.362181
1442748,ธรรมาภิบาล,14721,False,0.165712,1.0,0.157902,0.231245,0.146935
1491217,รับผิดชอบ,90540,False,0.3273,1.0,0.379834,0.200453,0.332295
1403920,จริยธรรม,55505,False,0.354299,1.0,0.305551,0.477577,0.348913
1576745,โปร่งใส,75994,False,0.290011,1.0,0.326442,0.343097,0.258747
1451862,น่าเชื่อถือ,3850,False,0.278831,1.0,0.376555,0.307071,0.173026
1525862,หลักวิชาการ,541,False,0.167206,0.99916,0.155584,0.222078,0.138438
1485220,ยุทธภัณฑ์,153,False,0.170214,0.999113,0.159838,0.234494,0.141927
1469296,พิธีรีตอง,7,False,0.168915,0.999072,0.159665,0.233893,0.150203


- Example Top Related Keyword of "ทีม"

In [132]:
# Twenty words with the highest similarity score in the 'ทีม' column.
(
    all_words
    .sort_values(by='ทีม', ascending=False)
    .head(20)
)

Unnamed: 0,word,cnt,is_stop_word,นวัตกรรม,ธรรมาภิบาล,คุณภาพ,เคารพ,ทีม
1518368,สามัคคี,2100,False,0.305168,0.362181,0.218562,0.451115,1.0
1494266,ร่วมกับ,49859,False,0.281459,0.248762,0.117017,0.182443,0.574215
1580795,ให้ความร่วมมือ,8434,False,0.434687,0.507503,0.502668,0.341481,0.567438
1404531,จับมือ,417,False,0.338608,0.263883,0.455466,0.347381,0.534257
1474677,ภูมิใจ,607,False,0.253844,0.375428,0.322116,0.330857,0.533006
1563843,เผชิญหน้า,48,False,0.238341,0.295074,0.363754,0.235718,0.525496
1394502,ความสามัคคี,1698,False,0.385146,0.446978,0.218038,0.580703,0.504299
1452388,น้ำใจ,441,False,0.286995,0.543832,0.377213,0.74743,0.504222
1489785,รวมกลุ่ม,2313,False,0.426129,0.265762,0.340673,0.227242,0.499722
1438530,ทำสัญญา,4552,False,0.414734,0.291991,0.457581,0.220137,0.496972


# Summarize Result

In [133]:
# Minimum similarity score a word needs to count as "matched" for a culture.
# Every culture currently shares the same cut-off; switch to an explicit
# mapping if they ever need to be tuned individually.
threshold_dict = dict.fromkeys(
    ('นวัตกรรม', 'ธรรมาภิบาล', 'คุณภาพ', 'เคารพ', 'ทีม'),
    0.4,
)

In [53]:
# Load the documents to score. Later cells read its `Tokenized` column (the
# token list of each row) and group by its `Symbol` column.
# NOTE(review): read_pickle deserializes a pickle file, which can execute
# arbitrary code -- only load 'entriesOct.pickle' from a trusted source.
df = pd.read_pickle('entriesOct.pickle')

In [None]:
# Score every document: look up each token's per-culture similarity and
# condense it into four metrics per culture.

# Only `word` plus the culture score columns are needed for the lookup, so
# slice them once instead of merging the full-width all_words table (and
# zero-filling its unrelated columns) for every single row.
word_scores = all_words[['word', *threshold_dict]]

row_result_dict_list = []
for tokens in tqdm(df['Tokenized']):
    # The left join keeps every token of the document; tokens without an
    # entry in all_words come back as NaN and are scored 0.
    row_result_df = pd.DataFrame(tokens, columns=['word']).merge(word_scores, how='left', on='word').fillna(0)
    n_tokens = row_result_df.shape[0]
    row_result_dict = {}
    for culture, threshold in threshold_dict.items():
        scores = row_result_df[culture]
        # Compute the above-threshold selection once and reuse it.
        matched_scores = scores[scores > threshold]
        n_matched = len(matched_scores)
        row_result_dict[f'n_matched_keyword_{culture}'] = n_matched
        # A document with no tokens has no meaningful share; record NaN
        # explicitly instead of dividing by zero.
        row_result_dict[f'pct_matched_keyword_{culture}'] = n_matched / n_tokens if n_tokens else float('nan')
        row_result_dict[f'avg_score_on_keyword_{culture}'] = scores.mean()
        # NaN when no token clears the threshold.
        row_result_dict[f'avg_score_on_matched_keyword_{culture}'] = matched_scores.mean()
    row_result_dict_list.append(row_result_dict)

In [69]:
# One row per scored document and one column per metric/culture pair
# (the shape check below reports 20 columns).
result_df = pd.DataFrame(row_result_dict_list)

In [80]:
# Put the metrics next to the source rows. The index of df is reset so both
# frames line up positionally.
# NOTE(review): fillna(0) is applied to the whole combined frame, so missing
# values in df's own columns and "no matched keyword" averages all become 0.
summarized_result_df = (
    pd.concat([df.reset_index(drop=True), result_df], axis='columns')
    .fillna(0)
)

In [81]:
# Shapes of the input, the metrics and the combined frame: row counts should
# agree and the column counts should add up.
tuple(frame.shape for frame in (df, result_df, summarized_result_df))

((22278, 9), (22278, 20), (22278, 29))

- Companies that mention นวัตกรรม (innovation) the most: top 10 symbols by average share of matched keywords

In [93]:
# From here on `culture` is a plain culture name (a string).
culture = 'นวัตกรรม'
pct_column = f'pct_matched_keyword_{culture}'

# Average matched-keyword share per Symbol, ten highest first.
(
    summarized_result_df
    .groupby('Symbol')[pct_column]
    .mean()
    .reset_index()
    .sort_values(pct_column, ascending=False)
    .head(10)
)

Unnamed: 0,Symbol,pct_matched_keyword_นวัตกรรม
580,SAT,0.03498
198,DIF,0.033291
26,AIT,0.032246
529,PT,0.028912
369,LEE,0.026866
646,SPI,0.026192
173,CPN,0.025966
1,3K-BAT,0.024516
386,LVT,0.023946
831,VCOM,0.023798


-  Metrics Description
    - `n_matched_keyword_<culture_name>`: The number of words in the text whose similarity score for the specified cultural domain (<culture_name>) exceeds that domain's threshold, i.e. words related to its seed words.
    - `pct_matched_keyword_<culture_name>`: The share of such related words out of the total word count in the text, expressed as a fraction between 0 and 1 (not multiplied by 100).
    - `avg_score_on_keyword_<culture_name>`: The average similarity score between all words in the text and the seed words associated with the cultural domain (<culture_name>); words without a score count as 0.

In [111]:
# English labels used for the exported column names.
culture_translator = {'นวัตกรรม':'innovation', "ธรรมาภิบาล":'integrity', "คุณภาพ":"quality", "เคารพ":"respect", 'ทีม':"teamwork"}

# (metric prefix, aggregation) pairs, in the order the columns are exported:
# counts are summed per Symbol, shares and scores are averaged.
metric_aggregations = (
    ('n_matched_keyword', 'sum'),
    ('pct_matched_keyword', 'mean'),
    ('avg_score_on_keyword', 'mean'),
)

# Named aggregations: output column -> (source column, aggregation function).
named_aggregations = {
    f'{metric}_{culture_translator[name]}': (f'{metric}_{name}', how)
    for metric, how in metric_aggregations
    for name in threshold_dict
}

summarized_result_df.groupby('Symbol').agg(**named_aggregations).to_csv('summary_result.csv')