In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that the
# files referenced later in the notebook can be read from it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


* @file NLP基礎/NLP_part5.ipynb
  * @brief NLP基礎 模型實作 

  * 此份程式碼是以教學為目的，附有完整的架構解說。

  * @author 人工智慧科技基金會 AI 工程師 - 康文瑋
  * Email: run963741@aif.tw
  * Resume: https://www.cakeresume.com/run963741

  * 最後更新日期: 2020/11/26

# 載入套件

In [None]:
import pandas as pd
import xgboost as xgb
import pickle
import numpy as np
import os

from gensim.models import word2vec

# Switch the working directory to the course folder on the mounted Drive so
# that the relative paths used below (Data/..., word2vec_model/...) resolve.
os.chdir('/content/drive/Shared drives/類技術班教材/標準版/NLP基礎')

# 特徵前處理

我們前面已經透過許多特徵擷取的方法來萃取文本特徵，接下來，我們比較前一個章節 (NLP_part4) 所產生的特徵之間的分類表現，分別有：

* Continuous bag of words (CBOW)
* Skip-gram

In [None]:
# Load the count-feature table. The cells below use its 'label' column as the
# classification target and its row index to build the train/test split.
htl_count_feature = pd.read_csv('Data/htl_simple_count_feature.csv')

In [None]:
# Load the pickled, already-tokenised corpus (see the sample shown in the next cell).
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load this file from a trusted location.
with open('Data/htl_cutted.pickle', 'rb') as f:
  data = pickle.load(f)

In [None]:
# Peek at the first entry: one sentence represented as a list of word tokens.
data[0]

['距離',
 '川沙',
 '公路',
 '較',
 '近',
 ',',
 '但是',
 '公交',
 '指示',
 '不',
 '對',
 ',',
 '如果',
 '是',
 '"',
 '蔡陸線',
 '"',
 '的話',
 ',',
 '會',
 '非常',
 '麻煩',
 '.',
 '建議',
 '用',
 '別的',
 '路線',
 '.',
 '房間',
 '較',
 '爲',
 '簡單',
 '.']

## Continuous bag of words (CBOW)

這邊會做一個比較特別的前處理。大部分的模型都是輸入一筆資料並得到一個預測值，而這筆資料通常是一個固定長度的向量；但是當句子中的每個詞都轉成詞向量之後，整個句子會變成一個矩陣（列數隨句子的詞數而變），也就沒辦法直接丟進模型裡面。因此這邊將句子中所有詞向量做平均，形成一個句向量。

In [None]:
# Load the CBOW word2vec model saved at word2vec_model/cbow.model.
cbow_model = word2vec.Word2Vec.load("word2vec_model/cbow.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Drop the tokens that are not in the CBOW vocabulary.
data_filtered = [[w for w in tokens if w in cbow_model.wv] for tokens in data]

# Read the embedding width from the model instead of hard-coding it, so the
# zero-vector fallback below always has the same length as the word vectors.
cbow_dim = cbow_model.wv.vector_size

# Average the word vectors of every sentence to obtain one sentence vector.
cbow_avg_vector = []

for tokens in data_filtered:
    if len(tokens) == 0:
        # None of the sentence's tokens is in the vocabulary: fall back to a
        # float zero vector (an integer vector would change the dtype of the
        # stacked matrix).
        cbow_avg_vector.append(np.zeros(cbow_dim))
    else:
        # Look up every token's word vector and take the element-wise mean.
        sent_embed = np.mean([cbow_model.wv[w] for w in tokens], axis=0)
        cbow_avg_vector.append(sent_embed)

In [None]:
# Stack the per-sentence vectors into a single array (one row per sentence).
# The name is reused: before this cell it is a list, afterwards an ndarray.
cbow_avg_vector = np.array(cbow_avg_vector)

In [None]:
# 7765 sentences in total, each represented by a 128-dimensional sentence vector.
cbow_avg_vector.shape

(7765, 128)

## Skip-gram


In [None]:
# Load the skip-gram word2vec model saved at word2vec_model/skipgram.model.
skipgram_model = word2vec.Word2Vec.load("word2vec_model/skipgram.model")

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [None]:
# Drop the tokens that are not in the skip-gram vocabulary.
data_filtered = [[w for w in tokens if w in skipgram_model.wv] for tokens in data]

# Read the embedding width from the model instead of hard-coding it, so the
# zero-vector fallback below always has the same length as the word vectors.
skipgram_dim = skipgram_model.wv.vector_size

# Average the word vectors of every sentence to obtain one sentence vector.
skipgram_avg_vector = []

for tokens in data_filtered:
    if len(tokens) == 0:
        # None of the sentence's tokens is in the vocabulary: fall back to a
        # float zero vector (an integer vector would change the dtype of the
        # stacked matrix).
        skipgram_avg_vector.append(np.zeros(skipgram_dim))
    else:
        # Look up every token's word vector and take the element-wise mean.
        sent_embed = np.mean([skipgram_model.wv[w] for w in tokens], axis=0)
        skipgram_avg_vector.append(sent_embed)

In [None]:
# Stack the per-sentence vectors into a single array (one row per sentence).
# The name is reused: before this cell it is a list, afterwards an ndarray.
skipgram_avg_vector = np.array(skipgram_avg_vector)

In [None]:
# 7765 sentences in total, each represented by a 128-dimensional sentence vector.
skipgram_avg_vector.shape

(7765, 128)

# 訓練模型

## Train test split

切割訓練集以及測試集

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Stratified 80/20 split on the label column. A fixed random_state keeps the
# split, and therefore every metric reported below, identical across kernel
# restarts.
train, test = train_test_split(
    htl_count_feature,
    test_size=0.2,
    stratify=htl_count_feature['label'],
    random_state=42,
)

# Row indices of each partition. They are later used to slice the NumPy
# feature matrices positionally, which assumes htl_count_feature still has the
# default 0..n-1 index produced by read_csv.
train_idx = np.array(train.index)
test_idx = np.array(test.index)

In [None]:
# Report how many rows ended up in each partition.
for caption, row_ids in (('Training size: ', train_idx), ('Testing size: ', test_idx)):
    print(caption, row_ids.shape)

Training size:  (6212,)
Testing size:  (1553,)


In [None]:
def train_model_pipeline(data, n_estimators=500, verbose=True):
  """Train an XGBoost classifier on sentence vectors and score it on the test split.

  Relies on the notebook globals ``htl_count_feature`` (its 'label' column is
  the target), ``train_idx`` and ``test_idx``.

  Args:
    data: 2-D NumPy array with one feature row per sentence; rows are selected
      positionally with ``train_idx`` / ``test_idx``.
    n_estimators: number of boosting rounds passed to ``XGBClassifier``
      (default 500, the value previously hard-coded).
    verbose: forwarded to ``fit``; the default ``True`` prints the evaluation
      metric for every round, pass ``False`` to silence the log.

  Returns:
    Tuple ``(y_pred, report, cnfm)``: predictions for the test rows, the text
    classification report, and the confusion matrix as a labelled DataFrame.
  """
  # Slice features and labels once instead of repeating the indexing per call.
  labels = htl_count_feature['label']
  y_train = labels.loc[train_idx]
  y_test = labels.loc[test_idx]
  X_train = data[train_idx, :]
  X_test = data[test_idx, :]

  model = xgb.XGBClassifier(n_estimators=n_estimators)
  # The test split is passed as eval_set only to log the per-round error; no
  # early stopping is configured here.
  model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=verbose)
  y_pred = model.predict(X_test)

  report = classification_report(y_true=y_test, y_pred=y_pred)
  # Pin the label order so the matrix is always 2x2 and matches the hard-coded
  # row/column names, even if one class is absent from y_test and y_pred.
  cnfm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
  cnfm = pd.DataFrame(cnfm, columns=['Pred_0','Pred_1'], index=['Actual_0','Actual_1'])

  return y_pred, report, cnfm

## CBOW 

In [None]:
# Train and evaluate on the CBOW sentence vectors; keep the predictions, the
# text report and the confusion matrix for the evaluation section.
cbow_y_pred, cbow_report, cbow_cnfm = train_model_pipeline(cbow_avg_vector)

[0]	validation_0-error:0.238892
[1]	validation_0-error:0.207985
[2]	validation_0-error:0.198326
[3]	validation_0-error:0.195106
[4]	validation_0-error:0.189311
[5]	validation_0-error:0.186735
[6]	validation_0-error:0.183516
[7]	validation_0-error:0.18094
[8]	validation_0-error:0.179652
[9]	validation_0-error:0.186091
[10]	validation_0-error:0.186735
[11]	validation_0-error:0.182872
[12]	validation_0-error:0.181584
[13]	validation_0-error:0.181584
[14]	validation_0-error:0.175789
[15]	validation_0-error:0.172569
[16]	validation_0-error:0.173857
[17]	validation_0-error:0.174501
[18]	validation_0-error:0.167418
[19]	validation_0-error:0.171281
[20]	validation_0-error:0.168706
[21]	validation_0-error:0.16613
[22]	validation_0-error:0.166774
[23]	validation_0-error:0.164198
[24]	validation_0-error:0.164842
[25]	validation_0-error:0.161623
[26]	validation_0-error:0.163554
[27]	validation_0-error:0.160335
[28]	validation_0-error:0.161623
[29]	validation_0-error:0.160979
[30]	validation_0-erro

## Skip-gram

In [None]:
# Train and evaluate on the skip-gram sentence vectors; keep the predictions,
# the text report and the confusion matrix for the evaluation section.
skipgram_y_pred, skipgram_report, skipgram_cnfm = train_model_pipeline(skipgram_avg_vector)

[0]	validation_0-error:0.226658
[1]	validation_0-error:0.232453
[2]	validation_0-error:0.203477
[3]	validation_0-error:0.207341
[4]	validation_0-error:0.207985
[5]	validation_0-error:0.200258
[6]	validation_0-error:0.193818
[7]	validation_0-error:0.194462
[8]	validation_0-error:0.200258
[9]	validation_0-error:0.197038
[10]	validation_0-error:0.196394
[11]	validation_0-error:0.197038
[12]	validation_0-error:0.193818
[13]	validation_0-error:0.187379
[14]	validation_0-error:0.184804
[15]	validation_0-error:0.183516
[16]	validation_0-error:0.183516
[17]	validation_0-error:0.182872
[18]	validation_0-error:0.185448
[19]	validation_0-error:0.18094
[20]	validation_0-error:0.177077
[21]	validation_0-error:0.177077
[22]	validation_0-error:0.175789
[23]	validation_0-error:0.173213
[24]	validation_0-error:0.173857
[25]	validation_0-error:0.171925
[26]	validation_0-error:0.167418
[27]	validation_0-error:0.166774
[28]	validation_0-error:0.16613
[29]	validation_0-error:0.166774
[30]	validation_0-erro

# Evaluation

## Classification report

In [None]:
# Print both classification reports, separated by a dashed rule.
report_sections = (('CBOW: \n', cbow_report), ('Skip-gram: \n', skipgram_report))
for section_no, (heading, report_text) in enumerate(report_sections):
    if section_no > 0:
        print('-'*55)
    print(heading, report_text)

CBOW: 
               precision    recall  f1-score   support

           0       0.78      0.78      0.78       489
           1       0.90      0.90      0.90      1064

    accuracy                           0.86      1553
   macro avg       0.84      0.84      0.84      1553
weighted avg       0.86      0.86      0.86      1553

-------------------------------------------------------
Skip-gram: 
               precision    recall  f1-score   support

           0       0.80      0.79      0.79       489
           1       0.90      0.91      0.91      1064

    accuracy                           0.87      1553
   macro avg       0.85      0.85      0.85      1553
weighted avg       0.87      0.87      0.87      1553



## Confusion matrix

In [None]:
# Show both confusion matrices with rich display, separated by a dashed rule.
cnfm_sections = (('CBOW: \n', cbow_cnfm), ('Skip-gram: \n', skipgram_cnfm))
for section_no, (heading, matrix) in enumerate(cnfm_sections):
    if section_no > 0:
        print('-'*26)
    print(heading)
    display(matrix)

CBOW: 



Unnamed: 0,Pred_0,Pred_1
Actual_0,379,110
Actual_1,109,955


--------------------------
Skip-gram: 



Unnamed: 0,Pred_0,Pred_1
Actual_0,385,104
Actual_1,95,969
