# ws 03 Keras Embedding

In [52]:
from tensorflow.keras.preprocessing.text import Tokenizer


# Three-sentence toy corpus (note the mixed case and the trailing periods).
# s1..s3 stay available individually because later cells encode them one by one.
s1, s2, s3 = (
    'CNN is good.',
    'python is easy and good.',
    'So good so happy enjoy with Python.',
)

sentences = [s1, s2, s3]
sentences

['CNN is good.',
 'python is easy and good.',
 'So good so happy enjoy with Python.']

In [53]:
# Build the vocabulary from the toy corpus. All Tokenizer options are left at
# their defaults; the outputs of the following cells show the words
# lower-cased and without punctuation.
tk = Tokenizer() 

tk.fit_on_texts(sentences)

In [54]:
# Number of texts that were passed to fit_on_texts (the 3 sentences).
tk.document_count 

3

In [55]:
# Document frequency: in how many sentences each word appears at least once
# ('so' occurs twice in s3 but is counted as 1 here).
tk.word_docs 

defaultdict(int,
            {'is': 2,
             'cnn': 1,
             'good': 3,
             'python': 2,
             'and': 1,
             'easy': 1,
             'enjoy': 1,
             'with': 1,
             'so': 1,
             'happy': 1})

In [56]:
# Total occurrences of each word over all sentences, in first-seen order
# ('so' is 2 here, unlike its document frequency of 1).
tk.word_counts  

OrderedDict([('cnn', 1),
             ('is', 2),
             ('good', 3),
             ('python', 2),
             ('easy', 1),
             ('and', 1),
             ('so', 2),
             ('happy', 1),
             ('enjoy', 1),
             ('with', 1)])

In [57]:
# The same counts exposed as a view of (word, count) pairs.
tk.word_counts.items()

odict_items([('cnn', 1), ('is', 2), ('good', 3), ('python', 2), ('easy', 1), ('and', 1), ('so', 2), ('happy', 1), ('enjoy', 1), ('with', 1)])

In [58]:
# Word -> integer id. Ids start at 1 and follow descending frequency
# ('good' is the most frequent word, so it gets id 1).
tk.word_index

{'good': 1,
 'is': 2,
 'python': 3,
 'so': 4,
 'cnn': 5,
 'easy': 6,
 'and': 7,
 'happy': 8,
 'enjoy': 9,
 'with': 10}

In [59]:
# Look up the id of a single word.
tk.word_index['is']

2

In [60]:
# 'good' has the highest count in the corpus, hence id 1.
tk.word_index['good']

1

In [61]:
# Replace every word of each sentence with its id from tk.word_index,
# giving one variable-length list of ids per sentence.
sents_enc = tk.texts_to_sequences(sentences)
sents_enc

[[5, 2, 1], [3, 2, 6, 7, 1], [4, 1, 4, 8, 9, 10, 3]]

In [62]:
# Import through tensorflow.keras like the other cells of this notebook, so
# every cell resolves to the same Keras installation.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed number of columns per row. The longest sentence has 7 tokens, so with
# max_len = 6 its last token is cut off (see truncating='post' below).
max_len = 6

# padding='post' appends 0s to rows shorter than max_len;
# truncating='post' drops tokens from the end of rows longer than max_len.
sents_pad = pad_sequences(sents_enc, maxlen=max_len,
                          padding='post', truncating='post')
print(sents_pad)

[[ 5  2  1  0  0  0]
 [ 3  2  6  7  1  0]
 [ 4  1  4  8  9 10]]


In [63]:
# Encode one sentence on its own; it is wrapped in a list because
# texts_to_sequences works on a collection of texts.
print(s1)
tk.texts_to_sequences([s1])

CNN is good.


[[5, 2, 1]]

In [64]:
# Same single-sentence encoding, this time for the second sentence.
print(s2)
tk.texts_to_sequences([s2])

python is easy and good.


[[3, 2, 6, 7, 1]]

In [65]:
# Show every sentence followed by its encoded form on the next line.
for sentence in sentences:
    print(sentence, tk.texts_to_sequences([sentence]), sep='\n')

CNN is good.
[[5, 2, 1]]
python is easy and good.
[[3, 2, 6, 7, 1]]
So good so happy enjoy with Python.
[[4, 1, 4, 8, 9, 10, 3]]


In [66]:

# Pretty-print the encoded sentences, one inner list per line.
# Joining the rows keeps the brackets balanced for any number of sentences
# (including zero); the earlier if/else chain was only correct for exactly
# three rows.
rows = [str(seq) for seq in sents_enc]
print('[' + ',\n '.join(rows) + ']')


[[5, 2, 1],
 [3, 2, 6, 7, 1],
 [4, 1, 4, 8, 9, 10, 3]]


## Decode

In [67]:
# Map ids back to words. The result is the normalised text (lower-cased, no
# punctuation), not the original sentences.
tk.sequences_to_texts(sents_enc)

['cnn is good',
 'python is easy and good',
 'so good so happy enjoy with python']

In [68]:
# Decoding the padded matrix: the 0 padding yields no word, and the third
# sentence lacks its final 'python' because it was truncated to max_len.
tk.sequences_to_texts(sents_pad)

['cnn is good', 'python is easy and good', 'so good so happy enjoy with']

In [69]:
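# word_index holds the unique tokens, with ids starting at 1.
# The vocabulary size therefore needs one extra slot for id 0, the value used for padding:
# vocab_size = len(tk.word_index)+1
# vocab_size      # number of words in the vocabulary (plus the padding id)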

## Model - Embedding Keras

In [73]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input

# +1 because the ids in tk.word_index start at 1; id 0 is the padding value,
# so the embedding table needs len(word_index) + 1 rows.
vocab_size = len(tk.word_index) + 1

print(vocab_size)

embed_len = 5  # number of dimensions of each word vector

model = Sequential()
# Declare the input shape with an explicit Input layer instead of the
# Embedding `input_length=` argument, which newer Keras versions deprecate.
# With the shape known up front the model is built, so summary() can report
# output shapes and parameter counts.
model.add(Input(shape=(max_len,), dtype='int32'))
model.add(Embedding(vocab_size, embed_len))
# NOTE(review): no fit() call appears in the cells shown, so the loss and
# optimizer chosen here are never exercised; the layer keeps its initial weights.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

11


In [74]:
# Run the padded id matrix through the Embedding layer: every id is replaced
# by its embed_len-dimensional vector. No fit() call precedes this in the
# cells shown, so these are the layer's initial (untrained) weights.
vectors = model.predict(sents_pad)

# Rounded to 3 decimals for readability only; `vectors` itself is unchanged.
print(vectors.round(3)) 

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step
[[[ 0.018 -0.021  0.033  0.002  0.022]
  [ 0.049  0.04  -0.032  0.05  -0.014]
  [ 0.002  0.015 -0.016  0.021 -0.03 ]
  [-0.049  0.036  0.017 -0.002  0.003]
  [-0.049  0.036  0.017 -0.002  0.003]
  [-0.049  0.036  0.017 -0.002  0.003]]

 [[ 0.038 -0.031  0.017 -0.001 -0.024]
  [ 0.049  0.04  -0.032  0.05  -0.014]
  [ 0.031 -0.045  0.041  0.022 -0.006]
  [-0.033  0.008 -0.041 -0.004  0.045]
  [ 0.002  0.015 -0.016  0.021 -0.03 ]
  [-0.049  0.036  0.017 -0.002  0.003]]

 [[-0.04   0.038  0.024 -0.006 -0.044]
  [ 0.002  0.015 -0.016  0.021 -0.03 ]
  [-0.04   0.038  0.024 -0.006 -0.044]
  [ 0.031  0.039  0.015 -0.015  0.02 ]
  [-0.029 -0.033  0.016  0.014  0.038]
  [ 0.024 -0.013 -0.045 -0.003 -0.006]]]


In [75]:
# (number of sentences, max_len positions, embed_len dimensions)
vectors.shape

(3, 6, 5)

In [76]:
# Vector at the first position of sentence 1 (the token 'cnn').
print(vectors[0][0].round(3))

[ 0.018 -0.021  0.033  0.002  0.022]


In [77]:
# Vector at the first position of sentence 2 (the token 'python').
vectors[1][0].round(3)

array([ 0.038, -0.031,  0.017, -0.001, -0.024], dtype=float32)

In [78]:
# Print one row per token position: sentence number, the word stored at that
# position, and its embedding vector (rounded to 3 decimals).
print('sentence   word   Vector')
print('----------------------------------------------------')
for sent_no, (token_ids, sent_vectors) in enumerate(zip(sents_pad, vectors), start=1):
    for token_id, word_v in zip(token_ids, sent_vectors):
        # Look the word up from the id actually stored at this position
        # instead of indexing into a decoded string: this avoids re-decoding
        # the whole batch on every inner iteration and keeps word and vector
        # aligned wherever the padding sits. Id 0 (padding) has no entry in
        # index_word, so it prints as a blank word column.
        word = tk.index_word.get(int(token_id), '')
        print(f'{sent_no:6} {word:>8} {word_v.round(3)}')

sentence   word   Vector
----------------------------------------------------
     1      cnn [ 0.018 -0.021  0.033  0.002  0.022]
     1       is [ 0.049  0.04  -0.032  0.05  -0.014]
     1     good [ 0.002  0.015 -0.016  0.021 -0.03 ]
     1          [-0.049  0.036  0.017 -0.002  0.003]
     1          [-0.049  0.036  0.017 -0.002  0.003]
     1          [-0.049  0.036  0.017 -0.002  0.003]
     2   python [ 0.038 -0.031  0.017 -0.001 -0.024]
     2       is [ 0.049  0.04  -0.032  0.05  -0.014]
     2     easy [ 0.031 -0.045  0.041  0.022 -0.006]
     2      and [-0.033  0.008 -0.041 -0.004  0.045]
     2     good [ 0.002  0.015 -0.016  0.021 -0.03 ]
     2          [-0.049  0.036  0.017 -0.002  0.003]
     3       so [-0.04   0.038  0.024 -0.006 -0.044]
     3     good [ 0.002  0.015 -0.016  0.021 -0.03 ]
     3       so [-0.04   0.038  0.024 -0.006 -0.044]
     3    happy [ 0.031  0.039  0.015 -0.015  0.02 ]
     3    enjoy [-0.029 -0.033  0.016  0.014  0.038]
     3     with [ 0.024 -0.013 -0.045 -0.003 -0.006]

In [79]:
# All vocabulary words, listed in id order (id 1 first).
tk.word_index.keys()


dict_keys(['good', 'is', 'python', 'so', 'cnn', 'easy', 'and', 'happy', 'enjoy', 'with'])

## ws 04, see "01 ws word2vec"

In [80]:
from gensim.models import Word2Vec

# Location of the pre-trained Thai word2vec model used in the cells below.
TNC_MODEL_URL = 'http://www.arts.chula.ac.th/ling/wp-content/uploads/TNCc5model.bin'

# SECURITY NOTE: Word2Vec.load unpickles the file it reads, and this file is
# fetched over plain http. Only run this against a source you trust; ideally
# download the file once, verify it, and load the local copy instead.
#
# Only the word vectors (.wv) are kept. NOTE: this rebinds `model`, which held
# the Keras Sequential model in the cells above; re-running those cells after
# this one will need `model` to be rebuilt first.
model = Word2Vec.load(TNC_MODEL_URL).wv

In [81]:
# Number of dimensions of each pre-trained word vector.
model.vector_size

100

In [82]:
# Number of words in the pre-trained vocabulary.
len(model.index_to_key)

31078

In [83]:
# Words closest to 'หมา' ("dog"), each with its similarity score.
model.most_similar('หมา')

[('แมว', 0.7827969193458557),
 ('สุนัข', 0.7660499811172485),
 ('ควาย', 0.7402681112289429),
 ('ลิง', 0.7233365774154663),
 ('วัว', 0.6935527920722961),
 ('กระต่าย', 0.6711753606796265),
 ('งูเห่า', 0.6605747938156128),
 ('ไก่', 0.6319968700408936),
 ('งู', 0.6309268474578857),
 ('หมาบ้า', 0.6199687123298645)]

In [84]:
# Words closest to 'อาเจียน' ("to vomit").
model.most_similar('อาเจียน')

[('คลื่นไส้', 0.7747560143470764),
 ('สำลัก', 0.7553960680961609),
 ('หน้ามืด', 0.7404484152793884),
 ('เป็นลม', 0.7189688682556152),
 ('ปวดท้อง', 0.7065293192863464),
 ('อ้วก', 0.7015395164489746),
 ('ท้องเสีย', 0.689079999923706),
 ('ตาลาย', 0.6866936683654785),
 ('ขย้อน', 0.6792916059494019),
 ('ถ่ายปัสสาวะ', 0.6768748164176941)]

In [85]:
# Five words closest to 'รับประทาน' ("to eat", formal register).
# Alternative query to try ('พ่อค้า' = "merchant"):
# model.most_similar('พ่อค้า', topn=5)
model.most_similar('รับประทาน', topn=5)

[('กิน', 0.8338596820831299),
 ('ทาน', 0.7438191175460815),
 ('ปรุง', 0.7300216555595398),
 ('ดื่ม', 0.7109257578849792),
 ('เสิร์ฟ', 0.6651374697685242)]

In [86]:
# Words closest to 'ทารก' ("infant").
model.most_similar('ทารก')

[('เด็ก', 0.635191023349762),
 ('ครรภ์', 0.5767548680305481),
 ('เด็กหญิง', 0.5542507171630859),
 ('คลอด', 0.553459882736206),
 ('ผู้ป่วย', 0.5305246114730835),
 ('ร่างกาย', 0.5298979878425598),
 ('อวัยวะ', 0.5297532081604004),
 ('ลูก', 0.5190483927726746),
 ('วัย', 0.514176607131958),
 ('มดลูก', 0.5121011734008789)]

In [87]:
# Similarity score between 'รถ' ("car") and 'ยานพาหนะ' ("vehicle").
model.similarity('รถ','ยานพาหนะ')  # leftover note: 'ฟอง' ("bubble") - purpose unclear, TODO confirm or remove

0.54875463

In [88]:
# Word pairs to compare; glosses are given for readers who do not read Thai.
pairs = [
    ('รถ', 'ยานพาหนะ'),      # car / vehicle
    ('รถ', 'เครื่องบิน'),      # car / airplane
    ('รถ', 'ข้าวโพด'),        # car / corn
    ('ปลูก', 'ข้าวโพด'),      # to plant / corn
]

# One tab-separated row per pair: both words (repr form) and their similarity
# rounded to two decimals.
row_fmt = '%r\t%r\t%.2f'
for left, right in pairs:
    score = model.similarity(left, right)
    print(row_fmt % (left, right, score))

'รถ'	'ยานพาหนะ'	0.55
'รถ'	'เครื่องบิน'	0.72
'รถ'	'ข้าวโพด'	0.13
'ปลูก'	'ข้าวโพด'	0.46


In [89]:
# Ask for the odd one out among 'หนัง' ("film"), 'เพลง' ("song"),
# 'หนังสือ' ("book") and 'ดนตรี' ("music").
model.doesnt_match(['หนัง','เพลง','หนังสือ','ดนตรี'])

'หนังสือ'