<a href="https://colab.research.google.com/github/Shopping-Yuan/bert_project/blob/main/keyword_extraction/keyword_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [166]:
# 原始功能:BERT做无监督分词(由 蘇劍林 提供)
# 介绍：https://kexue.fm/archives/7476
# 我將分詞功能擴充為:檢查兩個中文詞組間的相關性，以判斷是否為連接的新詞(例如:機器 學習)

In [167]:
# Input paragraph to analyse.
text = "機器學習基本上可以稱為聖經之類的書"

In [168]:
# 安裝keras-bert函式庫以及BERT-wwm模型
!pip -q install keras-bert
!pip -q install keras-transformer
!gdown --id "1SQJZmilEwCqffyb4MbQBPlr3-N20sLnS" --output chinese_wwm_L-12_H-768_A-12.zip
!unzip  chinese_wwm_L-12_H-768_A-12.zip

Downloading...
From: https://drive.google.com/uc?id=1SQJZmilEwCqffyb4MbQBPlr3-N20sLnS
To: /content/chinese_wwm_L-12_H-768_A-12.zip
382MB [00:02, 173MB/s]
Archive:  chinese_wwm_L-12_H-768_A-12.zip
replace publish/vocab.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [170]:
# Set up the BERT tokenizer from the downloaded checkpoint's vocabulary.
from keras_bert import Tokenizer

config_path = 'publish/bert_config.json'
checkpoint_path = 'publish/bert_model.ckpt'
vocab_path = 'publish/vocab.txt'

# One vocabulary entry per line; each entry is assigned the dictionary size at
# the moment it is inserted.
token_dict = {}
with open(vocab_path, 'r', encoding='utf8') as vocab_file:
    for raw_line in vocab_file:
        token_dict[raw_line.strip()] = len(token_dict)
print("辭典長度:", len(token_dict))
tokenizer = Tokenizer(token_dict)

辭典長度: 21128


In [171]:
#預處理:將段落文字轉為簡體,濾除英文,並以標點符號切分句子(句子之間被標點隔開,前後的詞不可能是相連的新詞)
!pip install opencc-python-reimplemented
from opencc import OpenCC
import re
cc = OpenCC('t2s')
#標點符號集合
punct = "，,|:|[|]|。|、|…"
r1 = '[a-zA-Z0-9]' 
text = re.sub(r1, '', text)
text_convert = cc.convert(text)
sentence_list = re.split(punct,text_convert)



In [172]:
# Point jieba at the "big" dictionary, downloading it first when no local copy
# exists.
import os
from urllib.request import urlretrieve
import jieba.analyse

dict_name = "dict.txt.big"
dict_missing = not os.path.exists(dict_name)
if dict_missing:
    urlretrieve("https://github.com/fxsjy/jieba/raw/master/extra_dict/dict.txt.big", dict_name)
jieba.set_dictionary(dict_name)

In [173]:
# Segment every sentence with jieba and collect each pair of adjacent words into
# word_pair_list; these pairs are the candidates examined in later cells.
word_pair_list = []
for sentence in sentence_list:
    # Drop tokens that occur in `punct`, then re-segment the cleaned sentence.
    kept_tokens = [tok for tok in jieba.cut(sentence) if tok not in punct]
    sentence_cut = jieba.lcut("".join(kept_tokens))
    word_pair_list.extend(
        [left, right] for left, right in zip(sentence_cut, sentence_cut[1:])
    )

Building prefix dict from /content/dict.txt.big ...
Loading model from cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache
Loading model cost 1.789 seconds.
Prefix dict has been built successfully.


In [174]:
# Build `paragraph`, the full text to process: the converted text is segmented
# with jieba, tokens that occur in `punct` are dropped, and the rest is re-joined.
text_convert_list = jieba.lcut(text_convert)
words = [token for token in text_convert_list if token not in punct]
paragraph = "".join(words)

In [175]:
# Label preparation, step 1: mark where the target words sit in the paragraph.
# Called with elements of word_pair_list as `keyword` and `paragraph` as `content`.
def word_position(content, keyword):
    """Describe the segmented content as per-word token counts, flagging keywords.

    ``content`` is segmented with jieba. Tokens that occur in ``punct`` are
    skipped, as are tokens whose encoding holds nothing beyond the 2 ids the
    code discounts (presumably [CLS]/[SEP] — TODO confirm). Every remaining
    word contributes one entry: its token count, plus 100 when the word or its
    lowercase form is one of the keywords.

    Args:
        content: Text to segment.
        keyword: Iterable of words to flag.

    Returns:
        A numpy array with one integer per kept word.
    """
    # Local import: numpy is only imported further down in this file, after the
    # first call to this function.
    import numpy as np

    keyword_set = set(keyword)
    content_word_segment = []
    for word in jieba.lcut(content):
        if word in punct:
            continue
        token_ids, _ = tokenizer.encode(word)
        len_of_token = len(token_ids) - 2
        if len_of_token <= 0:
            continue
        # str.lower() cannot raise, so the original bare `except:` was dead code
        # and both flagged branches collapse into one test.
        is_keyword = word in keyword_set or word.lower() in keyword_set
        content_word_segment.append(100 + len_of_token if is_keyword else len_of_token)
    return np.array(content_word_segment)

In [176]:
# Sequence length fed to BERT: the paragraph's character count plus 2
# (presumably for the [CLS] and [SEP] tokens the tokenizer adds — TODO confirm).
SEQ_LEN = len(paragraph)+2

In [177]:
# Imported here because the annotation below is evaluated when the function is
# defined, which happens before the file's later numpy import.
import numpy as np


# Label preparation, step 2: expand the per-word codes from step 1 into a 0/1
# vector in which 1 marks a position that belongs to a target word.
def prepare_label(content_word_segment: np.ndarray, seq_len=None):
    """Expand per-word codes into a fixed-length 0/1 position vector.

    Args:
        content_word_segment: One integer per word. A value above 100 means
            "target word spanning value - 100 positions"; any other value is
            the number of positions of a non-target word.
        seq_len: Length of the returned vector. Defaults to the module-level
            ``SEQ_LEN`` when omitted.

    Returns:
        A float array of length ``seq_len``. Index 0 is always 0 (presumably
        the [CLS] slot — TODO confirm); the tail is zero-padded or truncated.
    """
    if seq_len is None:
        seq_len = SEQ_LEN
    # Collect the pieces and concatenate once instead of re-allocating the
    # growing array on every word.
    pieces = [np.zeros(1)]
    for n in content_word_segment:
        if n > 100:
            pieces.append(np.ones(n - 100))
        else:
            pieces.append(np.zeros(n))
    # Pad generously, then cut to the exact length.
    pieces.append(np.zeros(seq_len))
    return np.concatenate(pieces)[:seq_len]

In [178]:
# Labelling pass: every word pair yields three 0/1 vectors, marking the
# positions in `paragraph` of word 1, of both words, and of word 2 respectively.
# They are appended to position_list in that order.
position_list = []
for first_word, second_word in word_pair_list:
    for targets in ([first_word], [first_word, second_word], [second_word]):
        position_list.append(prepare_label(word_position(paragraph, targets)))

In [179]:
# Replace the token ids at flagged positions with the mask id.
def mask(ids: list, mask_terms: list, mask_id: int = 103):
    """Overwrite ``ids`` in place wherever ``mask_terms`` equals 1.

    Args:
        ids: Token ids of the paragraph; modified in place.
        mask_terms: Sequence of flags; a value equal to 1 marks a position to
            mask.
        mask_id: Id written at masked positions. The default 103 is presumably
            the [MASK] id of the loaded vocabulary — TODO confirm.

    Returns:
        The same ``ids`` list that was passed in.
    """
    for i, flag in enumerate(mask_terms):
        if flag == 1:
            ids[i] = mask_id
    return ids

In [180]:
# Prepare the two BERT inputs: the token ids (one masked copy of the paragraph
# per entry of position_list) and all-zero segment ids of the same shape.
import numpy as np
import tensorflow as tf

# Tokenise the paragraph once. Each masked variant works on its own copy because
# mask() overwrites the list it is given.
paragraph_ids, _ = tokenizer.encode(paragraph)
# Named `masked_inputs` rather than `input` so the builtin is not shadowed.
masked_inputs = [mask(list(paragraph_ids), position) for position in position_list]
input_ids = tf.convert_to_tensor(masked_inputs, dtype=tf.int32)
input_seg = tf.convert_to_tensor(np.zeros_like(masked_inputs), dtype=tf.int32)

In [181]:
# Build the BERT model from the pretrained checkpoint.
from keras_bert import load_trained_model_from_checkpoint

# Keyword options passed to the loader; seq_len matches the prepared inputs.
bert_options = {"training": False, "trainable": False, "seq_len": SEQ_LEN}
model = load_trained_model_from_checkpoint(config_path, checkpoint_path, **bert_options)

In [182]:
# Run the pretrained BERT model on every masked variant of the paragraph;
# `vectors` holds the model outputs, one row per masked variant.
vectors = model.predict([input_ids, input_seg])



In [183]:
# Relevance scores, one per word pair; filled by the scoring loop further down.
d_list = []

In [184]:
# Euclidean distance helper.
import numpy as np
def dist(x, y):
    """Return the Euclidean distance between the arrays ``x`` and ``y``."""
    delta = x - y
    squared = delta * delta
    return np.sqrt(squared.sum())

In [185]:
# Score every word pair: compare the model output at each word's positions when
# only that word is masked against the output when both words are masked, and
# store the mean of the two distances in d_list (one score per pair).
import numpy as np

threshold = 10  # kept from the original cell; not referenced elsewhere in this notebook
d_list = []  # rebuilt from scratch so re-running the cell cannot append duplicate scores
for pair_idx in range(len(position_list) // 3):
    # position_list / vectors hold three consecutive rows per pair:
    # word 1 masked, both words masked, word 2 masked.
    first_row = 3 * pair_idx
    both_row = first_row + 1
    second_row = first_row + 2
    first_mask = np.asarray(position_list[first_row]) == 1
    second_mask = np.asarray(position_list[second_row]) == 1
    len_of_token_1 = int(first_mask.sum())
    len_of_token_2 = int(second_mask.sum())
    if len_of_token_1 == 0 or len_of_token_2 == 0:
        # No position found for one of the words: score it lowest instead of
        # dividing by zero and putting NaN into the list.
        d_list.append(0.0)
        continue
    # The two masks are evaluated independently. The original `elif` left the
    # second count at zero whenever both words were identical.
    v1 = vectors[first_row][first_mask].sum(axis=0) / len_of_token_1
    v2_1 = vectors[both_row][first_mask].sum(axis=0) / len_of_token_1
    v2_2 = vectors[both_row][second_mask].sum(axis=0) / len_of_token_2
    v3 = vectors[second_row][second_mask].sum(axis=0) / len_of_token_2
    d1 = dist(v1, v2_1)
    d2 = dist(v2_2, v3)
    d_list.append((d1 + d2) / 2)

In [186]:
#讀取濾除字表
!gdown --id "1VloMIYrUdlKbPutAkmFMgQT-1vc3N9-J"
stop_words_list = []
with open('stop_words.txt', 'r') as f:
  stop_words_list = [line.strip() for line in f]
  t = f.readlines()
  for i in t:
    stop_words_list.append(cc.convert(i))

Downloading...
From: https://drive.google.com/uc?id=1VloMIYrUdlKbPutAkmFMgQT-1vc3N9-J
To: /content/stop_words.txt
  0% 0.00/2.00k [00:00<?, ?B/s]100% 2.00k/2.00k [00:00<00:00, 3.35MB/s]


In [187]:
# Pick the highest-scoring word pair in which neither word is a stop word, print
# it, and append it to new_keyword.txt.
stop_words = set(stop_words_list)
# Rank once without deleting from d_list / word_pair_list, so the cell can be
# re-run. The sort is stable, so ties keep their original order.
ranked_pairs = sorted(zip(d_list, word_pair_list), key=lambda scored: scored[0], reverse=True)
for _, pair in ranked_pairs:
    if pair[0] in stop_words or pair[1] in stop_words:
        continue
    with open("new_keyword.txt", "a", encoding="utf8") as output:
        # One record per line; without the newline successive appends ran
        # together in the file.
        output.write("%s\n" % pair)
    print(pair)
    break

['圣经', '之类']
