# Applications of word2vec

In [1]:
from gensim.models import word2vec
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import os

In [None]:
# 上傳資料
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part2_4.zip
!unzip -q NLP_part2_4.zip

In [2]:
# Restore the saved word2vec model from disk.
# NOTE(review): the path presumably comes from the unpacked course archive
# and the name suggests a CBOW-trained model — confirm.
MODEL_PATH = 'word2vec_model/CBOW'
model = word2vec.Word2Vec.load(MODEL_PATH)

## similarity

In [3]:
# List the 10 vocabulary entries closest to a query word.
# Swap in any other keyword from the vocabulary to explore.
model.wv.most_similar(positive=['KMT'], topn=10)

[('國民黨', 0.6417558789253235),
 ('DPP', 0.6396889686584473),
 ('kmt', 0.6338934898376465),
 ('dpp', 0.6175625324249268),
 ('民進黨', 0.5718200206756592),
 ('某黨', 0.5689526796340942),
 ('政黨', 0.5646135807037354),
 ('在野黨', 0.564574122428894),
 ('兩黨', 0.5438055992126465),
 ('黨內', 0.5420957803726196)]

In [4]:
# Analogy-style query: rank words near (KMT + 綠吱 - DPP), i.e. words that
# relate to 'KMT' the way '綠吱' relates to 'DPP'.
# Swap in other keywords from the vocabulary to explore.
model.wv.most_similar(
    positive=['KMT', '綠吱'],
    negative=['DPP'],
    topn=10,
)

[('異端', 0.38107287883758545),
 ('英雄難過', 0.37355029582977295),
 ('兵家', 0.3718773424625397),
 ('kmter', 0.3690451383590698),
 ('滅族', 0.36561936140060425),
 ('先祖', 0.364986389875412),
 ('漢奸', 0.3648254871368408),
 ('老北', 0.36260485649108887),
 ('搧動', 0.360907644033432),
 ('明末', 0.3589898347854614)]

## clustering

In [23]:
# Build a {word: corpus count} dictionary from the model vocabulary.
# gensim >= 4.0 removed `KeyedVectors.vocab` (accessing it raises); there the
# vocabulary lives in `key_to_index` and per-word counts are read with
# `get_vecattr(word, 'count')`. gensim 3.x still exposes `vocab`, whose
# entries carry a `.count` attribute, so both layouts are supported.
if hasattr(model.wv, 'key_to_index'):
    words = {word: model.wv.get_vecattr(word, 'count') for word in model.wv.key_to_index}
else:
    words = {word: vocab.count for word, vocab in model.wv.vocab.items()}

In [24]:
# Rank the (word, count) pairs by count, most frequent first, and keep the
# 10000 most frequent entries.
ranked_pairs = sorted(words.items(), key=lambda pair: pair[1], reverse=True)
top_pairs = np.array(ranked_pairs[:10000])
# Column 0 of the 2-D array holds the words; column 1 (the counts) is dropped.
words = top_pairs[:, 0]
words

array(['人', '八卦', '有沒有', ..., 'But', '出入', '發佈'],
      dtype='<U20')

In [30]:
# Look up the embedding of every selected word and stack them into a
# 2-D array with one row per word, in the same order as `words`.
vecs = np.vstack([model.wv.get_vector(word) for word in words])

In [31]:
# Group the word vectors into 50 clusters with k-means.
# k-means starts from randomly chosen centroids, so `random_state` is pinned
# to make the cluster assignment reproducible across kernel restarts.
# `n_init=10` is spelled out because the scikit-learn default changed from 10
# to 'auto' in newer releases; this keeps the 10-restart behaviour everywhere.
kmeans = KMeans(n_clusters=50, n_init=10, random_state=42)
# `cluster[i]` is the cluster id assigned to row i of `vecs`.
cluster = kmeans.fit_predict(vecs)

In [44]:
# Pair each word with its cluster id and preview the first rows.
# Building the frame from a column dict (instead of stacking two rows and
# transposing) keeps the cluster ids as an integer column rather than
# forcing every column to object dtype.
df = pd.DataFrame({'words': words, 'no. cluster': cluster})
df.head(n=5)

Unnamed: 0,words,no. cluster
0,人,40
1,八卦,47
2,有沒有,40
3,說,9
4,好,40


In [56]:
# Lay the clusters out side by side: one column per cluster id, each holding
# that cluster's words re-indexed from 0 so the columns line up at the top.
cluster_columns = []
for cluster_id, members in df.groupby('no. cluster'):
    column = members[['words']].reset_index(drop=True)
    cluster_columns.append(column.rename(columns={'words': cluster_id}))
data = pd.concat(cluster_columns, axis=1)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,冷氣,影片,政治,字,今天,歲,下,這種,錢,說,...,人,最,當時,on,旁邊,可愛,穿,八卦,新聞,日
1,停電,節目,支持,手機,剛剛,小孩,變成,喜歡,萬,想,...,有沒有,一種,名字,QQ,店,正妹,衣服,from,警察,活動
2,發電,歌,媒體,寫,每天,媽媽,社會,這是,元,一直,...,好,認為,當年,喔,附近,長,一件,Sent,陳,進行
3,反核,電視,覺青,網路,睡,小時候,發生,話,賺,注意,...,一個,來說,X,惹,家,妹子,內褲,問卦,事件,號
4,台電,唱,總統,使用,晚上,生,無法,好奇,花,幫,...,現在,關係,曾經,推,外面,臉,戴,JPTT,處理,單位
5,環保,直播,風向,電腦,昨天,年輕人,已,懂,有錢,直接,...,知道,似乎,全,幹,我家,正,t,請,表示,完成
6,限電,音樂,團體,內容,上班,家裡,未來,奇怪,出國,聽,...,真的,代表,叫做,欸,房間,年輕,穿著,一下,記者,中心
7,核能,聖,抗議,照片,回家,女兒,成,相信,賺錢,跑,...,覺得,算是,名,老婆,廁所,妹,褲子,請問,報導,未
8,缺電,結石,大運,找到,明天,爸爸,選擇,意思,房子,妹妹,...,是不是,重要,王,嘻嘻,隔壁,女孩,制服,卦,證明,計畫
9,核電,粉絲,反,資料,點,父母,所有,垃圾,億,遇到,...,應該,能力,曾,咪,進,帥哥,黑色,鄉民,網友,申請


In [None]:
# Try adjusting the KMeans parameters, e.g. what happens with 100 clusters?