# word2vec步驟分解

1.載入需要的套件<br>
2.指定辭典檔<br>
3.載入文件<br>
4.載入停止字檔<br>
5.進行分詞<br>
6.分詞結果存檔<br>
7.載入word2vec套件<br>
8.從檔案中取出句子<br>
9.向量化並建立模型<br>
10.儲存模型<br>
11.查詢相似的詞語<br>


In [2]:
#1.載入需要的套件
import requests
from lxml import etree
from io import StringIO
import jieba
import pandas as pd
import time
from jieba import posseg as pseg
import random

In [17]:
#2.指定辭典檔
jieba.set_dictionary('./mydict/dict.txt.big')
#加入自定義辭典
jieba.load_userdict('./mydict/mydict.txt')

Building prefix dict from C:\Users\leavi\Desktop\catcher_nlp\mydict\dict.txt.big ...
Loading model from cache C:\Users\leavi\AppData\Local\Temp\jieba.ue81af2ce4832b1549caa38b1b6fa9b38.cache
Loading model cost 1.383 seconds.
Prefix dict has been built successfully.


In [18]:
#3.開檔
fileAllLines = []
#避免檔案過大造成記憶體不足使用 with open 及 read()控制讀取數量
with open('./file/healthy_all.txt','r',encoding="utf-8") as fileLine:
    for line in iter(lambda: fileLine.read(1024), ''):
        #print(line)
        fileAllLines.append(line)
        
new_line = ' '.join(fileAllLines)
#new_line

In [52]:
#4.加入stop word
with open(file='./mydict/stop_words.txt', mode='r', encoding='utf-8') as file:
    stop_words = file.read().split('\n')
    
stop_words.append('\n')  ## 文章中有許多分行符號，這邊加入停用字中，可以把它拿掉
stop_words.append('\n\n')
#stop_words


In [20]:
#5.進行匯入停止字的分詞
start_time = time.time()
import jieba.posseg as pseg

seg = []
for i in range(len(fileAllLines)):
    #cut_list = list( jieba.cut(fileAllLines[i], cut_all = False) )
    #seg.append([' '.join([ item for item in cut_list if len(item) > 1 ])])
    cut_result = list(pseg.cut(fileAllLines[i], use_paddle=False))
    each_line_list = []
    for w, p in cut_result:#標註詞性
        #print("%s: %s"%(w, p))
        #if w not in stop_words and p in ['n', 'nr','ns','nt','nz','nl','ng', 't', 's','vn','v','vd','a','ad','an','d','r']:
        #上面這行加入了以詞性作為過濾的條件
        if w not in stop_words :
                #print("%s: %s"%(w, p))
            each_line_list.append(w)
        new_line = ' '.join(each_line_list)
    #print(new_line)
    seg.append(new_line)
print("--- spend %s seconds ---" % (time.time() - start_time))

print(len(seg))

--- spend 2588.9279396533966 seconds ---
10436


In [23]:
#6.分詞結果存檔
segSaveFile = 'segmentNormalDone.txt'
with open(segSaveFile, 'wb') as saveFile:
    for i in range(len(seg)):
        #words = seg[i][0].encode('utf-8')
        words = seg[i].encode('utf-8')
        if len(words) > 0:
            saveFile.write(words)
            saveFile.write('\n'.encode())

In [24]:
#7.載入word2vec 函式庫
from gensim.models import word2vec

In [25]:
#8.一行一行的從檔案中取出句子：一行一個句子，句子中的字必須預先處理好並以空白隔開
sentences = word2vec.LineSentence("segmentNormalDone.txt")

## word2vec 參數調整指南
參考本篇:https://reurl.cc/odjeVl

size=100：詞向量的維度大小，維度太小會無法有效表達詞與詞的關係，維度太大會使關係太稀疏而難以找出規則,Kaggle比賽詞向量維度常介於200-300之間

iter=5：訓練的回數，訓練過少會使得詞關係過為鬆散，訓練過度又會使得詞關係過為極端

sg=0：sg=0時以CBOW來訓練，sg=1時以Skip-gram來訓練。某些詞彙可能只出現在特定的主題當中，所以文本中可能有許多低頻詞，而在特性上，Skip-gram比CBOW通常對低頻詞有更好的訓練效果<br>
CBOW和Skip-gram的比較:https://reurl.cc/1492vp 

window=5：CBOW下決定Word2Vec一次取多少詞來預測中間詞（Skip-gram的狀況是反過來的）

workers=3：訓練用的線程數量（可以加快訓練速度）

min_count=5：出現次數大於等於min_count的詞，才會納入Word2Vec的詞典中


In [26]:
#9.訓練模型
#size: 向量維度 = 300
#sg: 0(CBOW), 1(Skip-gram)
start_time = time.time()
model = word2vec.Word2Vec(sentences, size=300, sg=1, window=10, workers=3, min_count=2)
print("--- spend %s seconds ---" % (time.time() - start_time))

model

--- spend 147.65637469291687 seconds ---


<gensim.models.word2vec.Word2Vec at 0x2063a3ab280>

In [27]:
#10.儲存訓練好的word2vec 模型
model.save("word2vecNormal.model")

model.corpus_total_words


2596601

In [14]:
#11.測試詞相似度
model.similar_by_word('每天')

  model.similar_by_word('每天')


[('天天', 0.5993406176567078),
 ('抽時間', 0.5872159004211426),
 ('混日子', 0.5863246917724609),
 ('搞壞', 0.5706547498703003),
 ('緊迫', 0.5639457702636719),
 ('第一件', 0.5604555010795593),
 ('無所事事', 0.5559400320053101),
 ('太爽了', 0.5554307699203491),
 ('每天晚上', 0.5546926259994507),
 ('治裝', 0.5546207427978516)]

In [45]:
#詞相似度
model.similar_by_word('一直')

  model.similar_by_word('一直')


[('繞圈圈', 0.5793926119804382),
 ('無時無刻', 0.5786219239234924),
 ('耍賴', 0.5743733644485474),
 ('討厭鬼', 0.5736790299415588),
 ('逼出來', 0.5722613334655762),
 ('使喚', 0.5674321055412292),
 ('前兆', 0.5667661428451538),
 ('氣不氣', 0.5665849447250366),
 ('痛處', 0.565506100654602),
 ('搞不清', 0.5642781257629395)]

In [46]:
#詞相似度
model.similar_by_word('總是')

  model.similar_by_word('總是')


[('認真思考', 0.6209027767181396),
 ('捉摸不定', 0.6182634830474854),
 ('自尋煩惱', 0.6158167123794556),
 ('磨光', 0.6137054562568665),
 ('太多太', 0.6103761792182922),
 ('一笑置之', 0.6096199154853821),
 ('簇擁', 0.6092824935913086),
 ('種下', 0.6088994145393372),
 ('混帳', 0.6081991195678711),
 ('問句', 0.6081584692001343)]

In [29]:
#詞相似度
model.similar_by_word('我')

  model.similar_by_word('我')


[('我自', 0.7645814418792725),
 ('被寵', 0.7535476684570312),
 ('你們好', 0.7526658177375793),
 ('避諱', 0.7524116039276123),
 ('送光', 0.751254677772522),
 ('床伴', 0.7510131597518921),
 ('我他', 0.7506459951400757),
 ('瞞著', 0.7503862380981445),
 ('好受', 0.7500801682472229),
 ('無地自容', 0.7480854988098145)]

In [28]:
#詞相似度
model.similar_by_word('我們')

  model.similar_by_word('我們')


[('倆', 0.5566164255142212),
 ('我倆', 0.5341084003448486),
 ('分道揚鑣', 0.5319393873214722),
 ('擦身', 0.5300780534744263),
 ('不歡而散', 0.5290753245353699),
 ('就此結束', 0.5288700461387634),
 ('分分合合', 0.5283598303794861),
 ('再見面', 0.5270602107048035),
 ('上話', 0.526908278465271),
 ('談天', 0.5268542170524597)]

In [49]:
#詞相似度
model.similar_by_word('吃藥')

  model.similar_by_word('吃藥')


[('停藥', 0.8395404815673828),
 ('止痛藥', 0.8304799795150757),
 ('止痛針', 0.8045859932899475),
 ('拿藥', 0.8018680214881897),
 ('擦藥', 0.799972653388977),
 ('腎', 0.7976259589195251),
 ('中耳炎', 0.7966483235359192),
 ('開藥', 0.7949013710021973),
 ('類固醇', 0.7904659509658813),
 ('安眠藥', 0.7859857082366943)]

In [50]:
#詞相似度
model.similar_by_word('憂鬱')

  model.similar_by_word('憂鬱')


[('易怒', 0.8091617822647095),
 ('鬱悶', 0.798853874206543),
 ('排山倒海', 0.789068877696991),
 ('焦慮症', 0.7777386903762817),
 ('頭疼', 0.7764595150947571),
 ('沈重', 0.7755311131477356),
 ('大起大落', 0.767436683177948),
 ('週期性', 0.76662278175354),
 ('心悸', 0.765610933303833),
 ('孱弱', 0.764356255531311)]

In [48]:
#詞相似度
model.similar_by_word('真的')

  model.similar_by_word('真的')


[('怕得要命', 0.705108642578125),
 ('對號入座', 0.7047827243804932),
 ('小玉', 0.7005411982536316),
 ('翻舊帳', 0.697583794593811),
 ('著看', 0.696243405342102),
 ('打趣', 0.6948140859603882),
 ('賭爛', 0.6941170692443848),
 ('夭壽', 0.6932826638221741),
 ('真要', 0.6919928789138794),
 ('自求多福', 0.6915366053581238)]

In [15]:
#詞相似度
model.similar_by_word('想')

  model.similar_by_word('想')


[('床伴', 0.6878759860992432),
 ('一大半', 0.6875162720680237),
 ('打嘴巴', 0.6872358322143555),
 ('宣泄', 0.685401201248169),
 ('安份', 0.6826120615005493),
 ('我什', 0.6822878122329712),
 ('看開一點', 0.6817077398300171),
 ('多嘴', 0.6812175512313843),
 ('了能', 0.680487871170044),
 ('把拔', 0.6802255511283875)]

In [16]:
#詞相似度
model.similar_by_word('說')

  model.similar_by_word('說')


[('超沒', 0.7028159499168396),
 ('每句', 0.6993929147720337),
 ('多嘴', 0.6971871852874756),
 ('兩三句', 0.6940829753875732),
 ('還用', 0.6927374601364136),
 ('第二句', 0.6920994520187378),
 ('委婉', 0.6907167434692383),
 ('講講', 0.6892420053482056),
 ('兩句話', 0.688892126083374),
 ('哪句', 0.6886162757873535)]

In [30]:
#詞相似度
model.similar_by_word('工作')

  model.similar_by_word('工作')


[('職場', 0.6395502090454102),
 ('薪水', 0.6197562217712402),
 ('打工', 0.6065786480903625),
 ('應徵', 0.6059691309928894),
 ('兼職', 0.6041263341903687),
 ('服務業', 0.596893846988678),
 ('生管', 0.5964556336402893),
 ('坦白說', 0.5935444235801697),
 ('統包', 0.5910071730613708),
 ('第一份', 0.5893994569778442)]

In [32]:
#詞相似度
model.similar_by_word('已經')

  model.similar_by_word('已經')


[('書蟲', 0.5230495929718018),
 ('構', 0.5222415328025818),
 ('腦袋瓜', 0.5210082530975342),
 ('愈來愈好', 0.518495500087738),
 ('_____', 0.5178402066230774),
 ('千面人', 0.5151078701019287),
 ('始料未及', 0.5147507190704346),
 ('魔咒', 0.5142943859100342),
 ('另一人', 0.5136208534240723),
 ('澈底', 0.5111826658248901)]

In [40]:
#詞相似度
model.similar_by_word('根本')

  model.similar_by_word('根本')


[('硬要', 0.5969036817550659),
 ('兩回事', 0.5935841202735901),
 ('媽會', 0.5888224244117737),
 ('不差', 0.5870436429977417),
 ('打探', 0.5856208801269531),
 ('完完全全', 0.5837949514389038),
 ('連跟', 0.5819148421287537),
 ('死死的', 0.5799906253814697),
 ('翻舊帳', 0.579847514629364),
 ('第二點', 0.578641414642334)]

In [43]:
#詞相似度
model.similar_by_word('吃')

  model.similar_by_word('吃')


[('不吃', 0.7688150405883789),
 ('飯', 0.6978113651275635),
 ('火鍋', 0.6910183429718018),
 ('牛排', 0.6874236464500427),
 ('青菜', 0.6854840517044067),
 ('好吃', 0.6800402402877808),
 ('生魚片', 0.678730845451355),
 ('漢堡', 0.6744285821914673),
 ('雞塊', 0.6730027794837952),
 ('水餃', 0.6709229946136475)]

In [45]:
#詞相似度
model.similar_by_word('無法')

  model.similar_by_word('無法')


[('難以', 0.5840492248535156),
 ('辦法', 0.5727012753486633),
 ('幾近', 0.533754825592041),
 ('動彈', 0.5314500331878662),
 ('黑與白', 0.5178382992744446),
 ('很難', 0.5153633952140808),
 ('兩端', 0.5150221586227417),
 ('出此下策', 0.5148170590400696),
 ('別無選擇', 0.5138683915138245),
 ('切斷', 0.5135929584503174)]

In [46]:
#詞相似度
model.similar_by_word('不想')

  model.similar_by_word('不想')


[('理人', 0.6152536869049072),
 ('部會', 0.5851227045059204),
 ('不要緊', 0.5810213088989258),
 ('一走了之', 0.5809998512268066),
 ('駝鳥', 0.5807380080223083),
 ('作愛', 0.5797613859176636),
 ('透漏', 0.5721461772918701),
 ('撕破臉', 0.5719670057296753),
 ('不誠實', 0.5698884129524231),
 ('湊和', 0.5694880485534668)]

In [47]:
#詞相似度
model.similar_by_word('身體')

  model.similar_by_word('身體')


[('變差', 0.6893565654754639),
 ('調養', 0.6531015634536743),
 ('糖尿病', 0.6482625007629395),
 ('強壯', 0.6443980932235718),
 ('休養', 0.6407372951507568),
 ('不適', 0.6363716125488281),
 ('體質', 0.6350127458572388),
 ('代謝', 0.631886899471283),
 ('乾嘔', 0.6309178471565247),
 ('胃腸', 0.6303087472915649)]

In [53]:
#詞相似度
model.similar_by_word('抑鬱')

  model.similar_by_word('抑鬱')


[('刻劃', 0.9095689058303833),
 ('虛華', 0.9095242023468018),
 ('由衷感謝', 0.9085531234741211),
 ('形體', 0.9070810079574585),
 ('虛榮心', 0.9069384932518005),
 ('活過', 0.9061567783355713),
 ('完滿', 0.9050050973892212),
 ('地活', 0.9038348197937012),
 ('情慾', 0.9015408754348755),
 ('一道道', 0.9014558792114258)]