In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


* @file NLP基礎/NLP_part3.ipynb
  * @brief NLP基礎 模型實作 

  * 此份程式碼是以教學為目的，附有完整的架構解說。

  * @author 人工智慧科技基金會 AI 工程師 - 康文瑋
  * Email: run963741@aif.tw
  * Resume: https://www.cakeresume.com/run963741

  * 最後更新日期: 2020/11/26

# 文本分類模型

比較三種特徵之間的分類表現

In [None]:
import pandas as pd
import xgboost as xgb
import pickle
import numpy as np
import os

# Every relative path used later in the notebook (e.g. 'Data/...') is resolved
# against this course folder on the mounted drive.
course_dir = '/content/drive/Shared drives/類技術班教材/標準版/NLP基礎'
os.chdir(course_dir)

## 特徵一：特定字符出現次數


In [None]:
# Load the per-review character-count features prepared earlier in the course.
count_feature_path = 'Data/htl_simple_count_feature.csv'
htl_count_feature = pd.read_csv(count_feature_path)

In [None]:
# Preview the first rows to check which columns were read from the CSV.
htl_count_feature.head()

Unnamed: 0,label,review,idx,count_eng_dgtl,count_chns,count_punctuation,count_question,count_xclmtn
0,1,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",0,0,42,8,0,0
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1,2,22,4,0,1
2,1,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,2,0,37,5,0,0
3,1,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,3,0,147,27,0,0
4,1,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",4,4,29,3,0,0


In [None]:
# Reorder the columns: identifier, text and target first, so that the numeric
# count features occupy every column from position 3 onwards.
cols = [
    'idx', 'review', 'label',
    'count_eng_dgtl', 'count_chns', 'count_punctuation',
    'count_question', 'count_xclmtn',
]
htl_count_feature = htl_count_feature[cols]
htl_count_feature.head()

Unnamed: 0,idx,review,label,count_eng_dgtl,count_chns,count_punctuation,count_question,count_xclmtn
0,0,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",1,0,42,8,0,0
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1,2,22,4,0,1
2,2,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,1,0,37,5,0,0
3,3,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,1,0,147,27,0,0
4,4,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",1,4,29,3,0,0


## 特徵二：詞袋模型 (Bag of words, BOW)


In [None]:
# Load the pre-computed bag-of-words matrix.
# NOTE: pickle.load can execute arbitrary code, so only open pickle files that
# come from a trusted source.
with open('Data/htl_bow.pickle', 'rb') as file:
    bow_payload = pickle.load(file)

# The pickle holds a pair; only its second element (the BOW matrix) is used.
_, bow = bow_payload
print(bow.shape)

(7765, 24222)


In [None]:
# The full BOW matrix is very wide (24222 columns, per the shape printed when it
# was loaded), which makes training slow, so keep only the 256 terms that occur
# in the most reviews.
# `(bow > 0).sum(axis=0)` counts, per term, how many reviews contain it.
# ravel() guarantees a 1-D vector even when the sum comes back as a
# 1 x n_terms matrix; without it the `[::-1][:256]` slicing would act on rows
# instead of terms.
bow_doc_freq = np.asarray((bow > 0).sum(axis=0)).ravel()
most_bow_id = bow_doc_freq.argsort()[::-1][:256]

In [None]:
# After the selection, each review's BOW vector keeps only the 256 chosen columns.
# NOTE: this rebinds `bow`; re-running the cell without reloading the pickle
# would index the already-reduced matrix with the original column ids.
bow = bow[:, most_bow_id]
print(bow.shape)

(7765, 256)


In [None]:
# Start the BOW table from the id / text / target columns.
# They are selected by name rather than by position, so `label` sits at
# column 2 (where the training function reads the target) regardless of the
# current column order of `htl_count_feature`.
bow_data = htl_count_feature.loc[:, ['idx', 'review', 'label']]

In [None]:
# Sanity check: expect one row per review and the three id / text / target columns.
print(bow_data.shape)
bow_data.head()

(7765, 3)


Unnamed: 0,idx,review,label
0,0,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",1
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1
2,2,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,1
3,3,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,1
4,4,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",1


In [None]:
# Append the selected BOW columns to the id / text / target columns.
# The left-hand frame is rebuilt from `htl_count_feature` on every run, so
# re-executing this cell cannot stack a second copy of the BOW columns onto
# `bow_data`. Giving the BOW frame the same index makes the column-wise
# concat align row by row.
bow_data = pd.concat(
    [
        htl_count_feature.loc[:, ['idx', 'review', 'label']],
        pd.DataFrame(bow, index=htl_count_feature.index),
    ],
    axis=1,
)

In [None]:
# Sanity check: the table should now hold the 3 leading columns plus the BOW columns.
print(bow_data.shape)
bow_data.head()

(7765, 259)


Unnamed: 0,idx,review,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,1,0,1,0,2,0,0,1,0,0,1,1,0,0,1,1,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,4,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 特徵三：Term frequency-inverse document frequency (tf-idf)


In [None]:
# Load the pre-computed tf-idf matrix.
# NOTE: pickle.load can execute arbitrary code, so only open pickle files that
# come from a trusted source.
with open('Data/htl_tfidf.pickle', 'rb') as file:
    tfidf_payload = pickle.load(file)

# The pickle holds a pair; only its second element (the tf-idf matrix) is used.
_, tfidf = tfidf_payload

In [None]:
# The full tf-idf matrix is too wide to train on quickly (its width is not
# printed in this notebook; presumably it matches the BOW vocabulary — confirm),
# so keep only the 256 terms that occur in the most reviews as tf-idf features.
# `(tfidf > 0).sum(axis=0)` counts, per term, how many reviews have a non-zero
# weight. ravel() guarantees a 1-D vector even when the sum comes back as a
# 1 x n_terms matrix; without it the `[::-1][:256]` slicing would act on rows
# instead of terms.
tfidf_doc_freq = np.asarray((tfidf > 0).sum(axis=0)).ravel()
most_tfidf_id = tfidf_doc_freq.argsort()[::-1][:256]

In [None]:
# After the selection, each review's tf-idf vector keeps only the 256 chosen columns.
# NOTE: this rebinds `tfidf`; re-running the cell without reloading the pickle
# would index the already-reduced matrix with the original column ids.
tfidf = tfidf[:, most_tfidf_id]
print(tfidf.shape)

(7765, 256)


In [None]:
# Start the tf-idf table from the id / text / target columns.
# They are selected by name rather than by position, so `label` sits at
# column 2 (where the training function reads the target) regardless of the
# current column order of `htl_count_feature`.
tfidf_data = htl_count_feature.loc[:, ['idx', 'review', 'label']]

In [None]:
# Sanity check: expect one row per review and the three id / text / target columns.
print(tfidf_data.shape)
tfidf_data.head()

(7765, 3)


Unnamed: 0,idx,review,label
0,0,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",1
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1
2,2,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,1
3,3,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,1
4,4,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",1


In [None]:
# Append the selected tf-idf columns to the id / text / target columns.
# The left-hand frame is rebuilt from `htl_count_feature` on every run, so
# re-executing this cell cannot stack a second copy of the tf-idf columns onto
# `tfidf_data`. Giving the tf-idf frame the same index makes the column-wise
# concat align row by row.
tfidf_data = pd.concat(
    [
        htl_count_feature.loc[:, ['idx', 'review', 'label']],
        pd.DataFrame(tfidf, index=htl_count_feature.index),
    ],
    axis=1,
)

In [None]:
# Sanity check: the table should now hold the 3 leading columns plus the tf-idf columns.
print(tfidf_data.shape)
tfidf_data.head()

(7765, 259)


Unnamed: 0,idx,review,label,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,...,216,217,218,219,220,221,222,223,224,225,226,227,228,229,230,231,232,233,234,235,236,237,238,239,240,241,242,243,244,245,246,247,248,249,250,251,252,253,254,255
0,0,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",1,0.0,1.544345,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.749651,0.0,0.0,0.0,0.0,0.0,0.0,3.047757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.439839,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.764554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1,0.0,1.544345,0.0,1.964566,0.0,0.0,0.0,0.0,2.589802,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,1,1.524351,1.544345,0.0,0.0,0.0,0.0,0.0,2.53933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,1,0.0,1.544345,0.0,3.929132,0.0,0.0,2.490711,0.0,0.0,2.682338,2.691382,0.0,0.0,2.805242,2.807594,0.0,0.0,0.0,0.0,0.0,9.197654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.764554,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.869914,0.0
4,4,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",1,0.0,0.0,0.0,0.0,2.344679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.357451,0.0,0.0,0.0,3.338772,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 訓練模型

## Train test split

切割訓練集以及測試集

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Hold out 20% of the reviews for testing, stratified on the label so both
# splits keep the same class balance. The fixed random_state makes the split,
# and therefore every score reported below, reproducible across runs.
train, test = train_test_split(
    htl_count_feature,
    test_size=0.2,
    stratify=htl_count_feature['label'],
    random_state=42,
)

# The training function selects rows with `.iloc`, so store row *positions*.
# get_indexer maps each split's index labels back to positions; for the default
# 0..n-1 index these are the same numbers as the labels.
train_idx = htl_count_feature.index.get_indexer(train.index)
test_idx = htl_count_feature.index.get_indexer(test.index)

In [None]:
# Report how many rows ended up in each split.
for split_label, split_rows in (('Training size: ', train_idx),
                                ('Testing size: ', test_idx)):
    print(split_label, split_rows.shape)

Training size:  (6212,)
Testing size:  (1553,)


In [None]:
def train_model_pipeline(data, train_rows=None, test_rows=None):
  """Fit an XGBoost classifier on one feature table and score it on the test rows.

  Args:
    data: DataFrame whose column 2 is the target and whose columns from
      position 3 onwards are the features (columns 0-1 are ignored).
    train_rows: Row positions used for fitting. Defaults to the notebook-level
      `train_idx`.
    test_rows: Row positions used for evaluation. Defaults to the
      notebook-level `test_idx`.

  Returns:
    A tuple `(y_pred, report, cnfm)`: the predictions for the test rows, the
    text classification report, and the confusion matrix as a DataFrame
    labelled Actual_0/Actual_1 by Pred_0/Pred_1.
  """
  # Fall back to the shared split so existing one-argument calls keep working.
  if train_rows is None:
    train_rows = train_idx
  if test_rows is None:
    test_rows = test_idx

  # Slice features and target once and reuse them below.
  X_train, y_train = data.iloc[train_rows, 3:], data.iloc[train_rows, 2]
  X_test, y_test = data.iloc[test_rows, 3:], data.iloc[test_rows, 2]

  model = xgb.XGBClassifier()
  # The test rows are passed as eval_set so the per-round validation error is logged.
  model.fit(X_train, y_train, eval_set=[(X_test, y_test)])
  y_pred = model.predict(X_test)

  report = classification_report(y_true=y_test, y_pred=y_pred)
  # Pin the label order so the matrix is always 2x2 and lines up with the
  # row/column names below, even if one class never appears.
  cnfm = confusion_matrix(y_true=y_test, y_pred=y_pred, labels=[0, 1])
  cnfm = pd.DataFrame(cnfm, columns=['Pred_0','Pred_1'], index=['Actual_0','Actual_1'])

  return y_pred, report, cnfm

## 特徵一：特定字符出現次數

In [None]:
# Train and evaluate using the character-count features (columns 3+ of htl_count_feature).
count_y_pred, count_report, count_cnfm = train_model_pipeline(htl_count_feature)

[0]	validation_0-error:0.291694
[1]	validation_0-error:0.291694
[2]	validation_0-error:0.292981
[3]	validation_0-error:0.290406
[4]	validation_0-error:0.290406
[5]	validation_0-error:0.289118
[6]	validation_0-error:0.289118
[7]	validation_0-error:0.288474
[8]	validation_0-error:0.289762
[9]	validation_0-error:0.289762
[10]	validation_0-error:0.289762
[11]	validation_0-error:0.289118
[12]	validation_0-error:0.289118
[13]	validation_0-error:0.289762
[14]	validation_0-error:0.289118
[15]	validation_0-error:0.288474
[16]	validation_0-error:0.290406
[17]	validation_0-error:0.285898
[18]	validation_0-error:0.287186
[19]	validation_0-error:0.28783
[20]	validation_0-error:0.28783
[21]	validation_0-error:0.286542
[22]	validation_0-error:0.286542
[23]	validation_0-error:0.288474
[24]	validation_0-error:0.28783
[25]	validation_0-error:0.287186
[26]	validation_0-error:0.287186
[27]	validation_0-error:0.287186
[28]	validation_0-error:0.287186
[29]	validation_0-error:0.28783
[30]	validation_0-error:

## 特徵二：詞袋模型 (Bag of words, BOW)

In [None]:
# Train and evaluate using the 256 selected bag-of-words columns.
bow_y_pred, bow_report, bow_cnfm = train_model_pipeline(bow_data)

[0]	validation_0-error:0.250483
[1]	validation_0-error:0.250483
[2]	validation_0-error:0.251771
[3]	validation_0-error:0.253059
[4]	validation_0-error:0.253059
[5]	validation_0-error:0.247263
[6]	validation_0-error:0.2434
[7]	validation_0-error:0.239536
[8]	validation_0-error:0.237605
[9]	validation_0-error:0.235029
[10]	validation_0-error:0.22859
[11]	validation_0-error:0.229234
[12]	validation_0-error:0.22537
[13]	validation_0-error:0.227302
[14]	validation_0-error:0.227946
[15]	validation_0-error:0.224726
[16]	validation_0-error:0.22537
[17]	validation_0-error:0.219575
[18]	validation_0-error:0.218287
[19]	validation_0-error:0.222151
[20]	validation_0-error:0.212492
[21]	validation_0-error:0.214424
[22]	validation_0-error:0.214424
[23]	validation_0-error:0.213136
[24]	validation_0-error:0.21378
[25]	validation_0-error:0.216355
[26]	validation_0-error:0.220219
[27]	validation_0-error:0.214424
[28]	validation_0-error:0.214424
[29]	validation_0-error:0.21378
[30]	validation_0-error:0.2

## 特徵三：Term frequency-inverse document frequency (tf-idf)

In [None]:
# Train and evaluate using the 256 selected tf-idf columns.
# NOTE(review): the recorded log for this run is identical to the BOW run.
# Presumably this is expected, since both tables pick columns by the same
# "appears in the most reviews" rule and tf-idf only rescales each column,
# which tree splits should be insensitive to — confirm.
tfidf_y_pred, tfidf_report, tfidf_cnfm = train_model_pipeline(tfidf_data)

[0]	validation_0-error:0.250483
[1]	validation_0-error:0.250483
[2]	validation_0-error:0.251771
[3]	validation_0-error:0.253059
[4]	validation_0-error:0.253059
[5]	validation_0-error:0.247263
[6]	validation_0-error:0.2434
[7]	validation_0-error:0.239536
[8]	validation_0-error:0.237605
[9]	validation_0-error:0.235029
[10]	validation_0-error:0.22859
[11]	validation_0-error:0.229234
[12]	validation_0-error:0.22537
[13]	validation_0-error:0.227302
[14]	validation_0-error:0.227946
[15]	validation_0-error:0.224726
[16]	validation_0-error:0.22537
[17]	validation_0-error:0.219575
[18]	validation_0-error:0.218287
[19]	validation_0-error:0.222151
[20]	validation_0-error:0.212492
[21]	validation_0-error:0.214424
[22]	validation_0-error:0.214424
[23]	validation_0-error:0.213136
[24]	validation_0-error:0.21378
[25]	validation_0-error:0.216355
[26]	validation_0-error:0.220219
[27]	validation_0-error:0.214424
[28]	validation_0-error:0.214424
[29]	validation_0-error:0.21378
[30]	validation_0-error:0.2

# Evaluation

## Classification report

In [None]:
# Print the three classification reports, separated by a horizontal rule.
report_sections = [
    ('特定字符出現次數: \n', count_report),
    ('詞袋模型: \n', bow_report),
    ('Tf-idf: \n', tfidf_report),
]
for position, (heading, report_text) in enumerate(report_sections):
    if position > 0:
        print('-'*55)
    print(heading, report_text)

特定字符出現次數: 
               precision    recall  f1-score   support

           0       0.63      0.20      0.30       489
           1       0.72      0.95      0.82      1064

    accuracy                           0.71      1553
   macro avg       0.68      0.57      0.56      1553
weighted avg       0.69      0.71      0.66      1553

-------------------------------------------------------
詞袋模型: 
               precision    recall  f1-score   support

           0       0.79      0.59      0.68       489
           1       0.83      0.93      0.88      1064

    accuracy                           0.82      1553
   macro avg       0.81      0.76      0.78      1553
weighted avg       0.82      0.82      0.82      1553

-------------------------------------------------------
Tf-idf: 
               precision    recall  f1-score   support

           0       0.79      0.59      0.68       489
           1       0.83      0.93      0.88      1064

    accuracy                           0

## Confusion matrix

In [None]:
# Show each feature set's confusion matrix under its heading, with a rule
# between consecutive sections (none after the last one).
cnfm_sections = [
    ('特定字符出現次數: \n', count_cnfm),
    ('詞袋模型: \n', bow_cnfm),
    ('Tf-idf: \n', tfidf_cnfm),
]
last_position = len(cnfm_sections) - 1
for position, (heading, matrix) in enumerate(cnfm_sections):
    print(heading)
    display(matrix)
    if position != last_position:
        print('-'*26)

特定字符出現次數: 



Unnamed: 0,Pred_0,Pred_1
Actual_0,97,392
Actual_1,56,1008


--------------------------
詞袋模型: 



Unnamed: 0,Pred_0,Pred_1
Actual_0,290,199
Actual_1,76,988


--------------------------
Tf-idf: 



Unnamed: 0,Pred_0,Pred_1
Actual_0,290,199
Actual_1,76,988
