In [None]:
# Mount Google Drive into the Colab runtime; the recorded output shows it is
# mounted at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


* @file NLP基礎/NLP_part2.ipynb
  * @brief NLP基礎 模型實作 

  * 此份程式碼是以教學為目的，附有完整的架構解說。

  * @author 人工智慧科技基金會 AI 工程師 - 康文瑋
  * Email: run963741@aif.tw
  * Resume: https://www.cakeresume.com/run963741

  * 最後更新日期: 2020/11/26

# 載入套件

In [None]:
import re
import pandas as pd
import numpy as np
import pickle
import os

from collections import Counter

# Switch the working directory to the course folder on the mounted Drive so the
# relative 'Data/...' paths used by the following cells resolve.
os.chdir('/content/drive/Shared drives/類技術班教材/標準版/NLP基礎')

In [None]:
# Load the preprocessed review table; the preview below shows the columns
# 'label', 'review' and 'idx'.
corp = pd.read_csv('Data/htl_preprocessed.csv')

In [None]:
# Sanity-check the loaded table: its dimensions, the number of rows per label
# value, and the first five rows.
print(corp.shape)
print(Counter(corp['label']))
corp.head()

(7765, 3)
Counter({1: 5322, 0: 2443})


Unnamed: 0,label,review,idx
0,1,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",0
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1
2,1,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,2
3,1,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,3
4,1,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",4


In [None]:
# Load the pre-tokenized reviews (the next cell shows that an element is a list
# of word tokens).
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# pickle files that come from a trusted source.
with open("Data/htl_cutted.pickle", "rb") as file:
    sentences = pickle.load(file)

In [None]:
# Show the token list of the first review.
print(sentences[0])

['距離', '川沙', '公路', '較', '近', ',', '但是', '公交', '指示', '不', '對', ',', '如果', '是', '"', '蔡陸線', '"', '的話', ',', '會', '非常', '麻煩', '.', '建議', '用', '別的', '路線', '.', '房間', '較', '爲', '簡單', '.']


# 特徵工程 (Feature engineering)

## 特徵一：特定字符出現次數

我們可以用一些簡單的特徵來作為句子的屬性

正規表示式 (Regular Expression): https://atedev.wordpress.com/2007/11/23/%E6%AD%A3%E8%A6%8F%E8%A1%A8%E7%A4%BA%E5%BC%8F-regular-expression/

中文正規表示式: http://blog.csdn.net/gatieme/article/details/43235791 


In [None]:
# Simple per-review character counts used as features.
# Every pattern is applied with Series.str.count, which always interprets its
# argument as a regular expression.
corp['count_eng_dgtl'] = corp['review'].str.count('[a-zA-Z0-9]')  # ASCII letters and digits
corp['count_chns'] = corp['review'].str.count('[\u4e00-\u9fff]')  # characters in U+4E00..U+9FFF (Chinese)
# Punctuation = every character that is neither a word character nor whitespace.
# The previous form, .str.replace('[\w\s]', '').str.len(), only works when
# str.replace treats the pattern as a regex; from pandas 2.0 the default is a
# literal match, which silently turns this column into the full string length.
# Counting the complement class gives the same number on every pandas version,
# and the raw string avoids the invalid '\w' escape warning.
corp['count_punctuation'] = corp['review'].str.count(r'[^\w\s]')
corp['count_question'] = corp['review'].str.count('[？?]')  # full-width and ASCII question marks
corp['count_xclmtn'] = corp['review'].str.count('[!！]')  # ASCII and full-width exclamation marks

In [None]:
# Preview the table together with the newly added count columns.
corp.head()

Unnamed: 0,label,review,idx,count_eng_dgtl,count_chns,count_punctuation,count_question,count_xclmtn
0,1,"距離川沙公路較近,但是公交指示不對,如果是""蔡陸線""的話,會非常麻煩.建議用別的路線.房間較...",0,0,42,8,0,0
1,1,商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!,1,2,22,4,0,1
2,1,早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。,2,0,37,5,0,0
3,1,賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...,3,0,147,27,0,0
4,1,"cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風",4,4,29,3,0,0


In [None]:
# Pairwise correlation matrix between the label and each of the count features
corp[[
    'label',
    'count_eng_dgtl',
    'count_chns',
    'count_punctuation',
    'count_question',
    'count_xclmtn',
]].corr()

Unnamed: 0,label,count_eng_dgtl,count_chns,count_punctuation,count_question,count_xclmtn
label,1.0,-0.074595,-0.16728,-0.165415,-0.077905,-0.173617
count_eng_dgtl,-0.074595,1.0,0.331889,0.423684,0.114137,0.159519
count_chns,-0.16728,0.331889,1.0,0.926052,0.203807,0.340433
count_punctuation,-0.165415,0.423684,0.926052,1.0,0.280138,0.401554
count_question,-0.077905,0.114137,0.203807,0.280138,1.0,0.194517
count_xclmtn,-0.173617,0.159519,0.340433,0.401554,0.194517,1.0


In [None]:
# Persist the table, now including the count features, without writing the
# DataFrame index as a column.
corp.to_csv('Data/htl_simple_count_feature.csv', index=False)

## 特徵二：詞袋模型 (Bag of words, BOW)

詞袋模型是在 1954 年由 Zellig Harris 在 Distributional Structure 這篇論文中所提出，詞袋模型是一個簡單統計詞頻的演算法，以下圖為例，首先會將所有句子進行斷詞，接著會統計句子中每個詞出現的頻率，例如 'at' 這個詞不曾在句子中出現過，次數會顯示 0，'bat' 則在句子中出現兩次。

<figure>
<center>
<img src='https://drive.google.com/uc?export=view&id=1RTXMaG-tlDSvkkYs0niZP5YZD-NEAyj1' width="500"/>
<figcaption>Turning raw text into a bag of words representation.</figcaption></center>
</figure>

Resource : https://towardsdatascience.com/from-word-embeddings-to-pretrained-language-models-a-new-age-in-nlp-part-1-7ed0c7f3dfc5

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()
# Each token list is joined with spaces so the vectorizer receives one string
# per review.
# NOTE: with its default settings CountVectorizer does not simply split on
# whitespace. Single-character tokens and punctuation are left out: '較', '近'
# and ',' occur in the first review but are missing from the vocabulary_ shown
# further down, which starts '距離', '川沙', '公路', '但是'.
bow = vectorizer.fit_transform([' '.join(x) for x in sentences])

In [None]:
# Convert the sparse count matrix returned by fit_transform into a dense NumPy
# array. The guard keeps the cell re-runnable: once `bow` is already an ndarray
# it has no .toarray() method, so the unguarded call raised AttributeError on a
# second execution.
if hasattr(bow, 'toarray'):
    bow = bow.toarray()

In [None]:
# Print the shape of the BOW matrix and display the BOW vector of the first sentence
print('bow: ', bow.shape)
bow[0]

bow:  (7765, 24222)


array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Vocabulary learned by the vectorizer: {word: column index}
vectorizer.vocabulary_

{'距離': 20690,
 '川沙': 8469,
 '公路': 4019,
 '但是': 2958,
 '公交': 3969,
 '指示': 10494,
 '如果': 7486,
 '蔡陸線': 19016,
 '的話': 16058,
 '非常': 23063,
 '麻煩': 23958,
 '建議': 9020,
 '別的': 4419,
 '路線': 20724,
 '房間': 9935,
 '簡單': 17159,
 '商務': 5861,
 '牀房': 15219,
 '整體': 11446,
 '感覺': 9683,
 '經濟': 17557,
 '實惠': 7923,
 '不錯': 2180,
 '早餐': 11729,
 '無論': 14822,
 '多少': 6820,
 '那邊': 21589,
 '食品': 23411,
 '酒店': 21686,
 '應該': 9776,
 '重視': 21800,
 '一下': 1508,
 '問題': 5931,
 '本身': 12393,
 '賓館': 20398,
 '街道': 19275,
 '不大': 2038,
 '還好': 21556,
 '北京': 4804,
 '熱心': 14930,
 '同胞': 5554,
 '很多': 9173,
 '設施': 19790,
 '介紹': 2811,
 '差不多': 8552,
 '確實': 16613,
 '加上': 4620,
 '低價位': 2983,
 '因素': 6260,
 '還是': 21559,
 '無超所值': 14827,
 '環境': 15600,
 '衚衕': 19281,
 '安靜': 7673,
 '整潔': 11440,
 '暖氣': 11999,
 '好足': 7468,
 '優勢': 3696,
 '出發': 4263,
 '步行': 13068,
 '不到': 2002,
 '分鐘': 4350,
 '可以': 5380,
 '梅蘭芳': 12651,
 '故居': 11357,
 '等等': 17066,
 '京味': 2703,
 '小衚衕': 8234,
 '北海': 4829,
 '總之': 17707,
 '推薦給': 10780,
 '節約': 17143,
 '消費': 14026,
 '自助'

In [None]:
# Save the fitted vectorizer together with the dense BOW matrix as one pickled
# two-element list.
with open("Data/htl_bow.pickle", "wb") as file:
    pickle.dump([vectorizer, bow], file)

### 使用 BOW 挑出前 10 個出現次數最高的詞

In [None]:
# Invert the vocabulary into a reverse lookup {column index: word}
id2word = {
    column: word
    for word, column in vectorizer.vocabulary_.items()
}
id2word

{20690: '距離',
 8469: '川沙',
 4019: '公路',
 2958: '但是',
 3969: '公交',
 10494: '指示',
 7486: '如果',
 19016: '蔡陸線',
 16058: '的話',
 23063: '非常',
 23958: '麻煩',
 9020: '建議',
 4419: '別的',
 20724: '路線',
 9935: '房間',
 17159: '簡單',
 5861: '商務',
 15219: '牀房',
 11446: '整體',
 9683: '感覺',
 17557: '經濟',
 7923: '實惠',
 2180: '不錯',
 11729: '早餐',
 14822: '無論',
 6820: '多少',
 21589: '那邊',
 23411: '食品',
 21686: '酒店',
 9776: '應該',
 21800: '重視',
 1508: '一下',
 5931: '問題',
 12393: '本身',
 20398: '賓館',
 19275: '街道',
 2038: '不大',
 21556: '還好',
 4804: '北京',
 14930: '熱心',
 5554: '同胞',
 9173: '很多',
 19790: '設施',
 2811: '介紹',
 8552: '差不多',
 16613: '確實',
 4620: '加上',
 2983: '低價位',
 6260: '因素',
 21559: '還是',
 14827: '無超所值',
 15600: '環境',
 19281: '衚衕',
 7673: '安靜',
 11440: '整潔',
 11999: '暖氣',
 7468: '好足',
 3696: '優勢',
 4263: '出發',
 13068: '步行',
 2002: '不到',
 4350: '分鐘',
 5380: '可以',
 12651: '梅蘭芳',
 11357: '故居',
 17066: '等等',
 2703: '京味',
 8234: '小衚衕',
 4829: '北海',
 17707: '總之',
 10780: '推薦給',
 17143: '節約',
 14026: '消費',
 1839

In [None]:
# Column-wise sum: total number of occurrences of each word over all reviews
sum_ = np.asarray(bow.sum(axis=0))

In [None]:
# Column indices of the 10 largest totals, largest first
most_sum_id = np.argsort(sum_)[::-1][:10].tolist()
# Translate those column indices back into words through the reverse lookup
features = [id2word[word_id] for word_id in most_sum_id]

print("Top 10 frequency's word index: \n", most_sum_id)
print("Top 10 frequency's word: \n", features)

Top 10 frequency's word index: 
 [21686, 9935, 12256, 2180, 13479, 9832, 3813, 13180, 9683, 11729]
Top 10 frequency's word: 
 ['酒店', '房間', '服務', '不錯', '沒有', '我們', '入住', '比較', '感覺', '早餐']


In [None]:
# Table of the top-10 word counts per review, for comparison with the raw text.
# np.ix_ selects the wanted rows (corp.idx) and the 10 columns in one indexing
# step. The chained form bow[rows, :][:, cols] first copied every column of the
# dense matrix before discarding all but 10 of them.
data = pd.DataFrame(bow[np.ix_(corp.idx.to_numpy(), most_sum_id)], columns=features)
data[:10]

Unnamed: 0,酒店,房間,服務,不錯,沒有,我們,入住,比較,感覺,早餐
0,0,1,0,0,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0,1,0
2,1,1,0,0,0,0,0,0,0,1
3,0,1,0,2,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0
6,1,0,0,1,0,0,0,1,0,1
7,1,0,0,1,0,0,0,0,0,0
8,10,2,1,0,1,0,1,0,3,0
9,1,1,0,1,0,0,0,3,0,1


In [None]:
# Show the raw reviews for comparison with the count table.
# NOTE: .loc slices by label and includes the end label, so this displays
# rows 0 through 10 (11 rows), one more than data[:10].
corp.loc[:10, 'review']

0     距離川沙公路較近,但是公交指示不對,如果是"蔡陸線"的話,會非常麻煩.建議用別的路線.房間較...
1                          商務大牀房，房間很大，牀有2m寬，整體感覺經濟實惠不錯!
2            早餐太差，無論去多少人，那邊也不加食品的。酒店應該重視一下這個問題了。房間本身很好。
3     賓館在小街道上，不大好找，但還好北京熱心同胞很多~賓館設施跟介紹的差不多，房間很小，確實挺小...
4                  cbd中心,周圍沒什麼店鋪,說5星有點勉強.不知道爲什麼衛生間沒有電吹風
5              總的來說，這樣的酒店配這樣的價格還算可以，希望他趕快裝修，給我的客人留些好的印象
6     價格比比較不錯的酒店。這次免費升級了，感謝前臺服務員。房子還好，地毯是新的，比上次的好些。早...
7                                  不錯，在同等檔次酒店中應該是值得推薦的！
8     入住麗晶，感覺很好。因爲是新酒店，的確有淡淡的油漆味，房間內較新。房間大小合適，衛生間設備齊...
9     1。酒店比較新，裝潢和設施還不錯，只是房間有些油漆味。2。早餐還可以，只是品種不是很多。3。...
10    我住的是特色標間，所謂特色，是有些類似家的感覺。寢具不是單調的白色，是條紋和大格子的，感覺很...
Name: review, dtype: object

## 特徵三：Term frequency-inverse document frequency (tf-idf)

tf-idf 是用來表示一個詞在句子中有多重要的指標，tf-idf 包含兩個要素：

* tf (term-frequency, 詞頻):
$$
tf_{i,j}=\frac{某詞出現在文章A的次數}{所有詞出現在文章A的總次數}
=\frac{n_{i,j}}{\sum_kn_{k,j}}
$$

若單獨以某一文章 A 來說，某詞的 tf 愈高表示某詞出現在文章 A 的次數越多，這個指標只單獨考慮每一篇文章，並不會考慮文章之間的關係。例如以英文來說的`the, a, ...`，或是以中文的`了, 喔, ...`，這種常常出現但是無意義的詞會被 tf 所凸顯出來，所以接下來還有第二個指標 idf 來對 tf 的缺點進行改進。

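* idf (inverse document frequency, 逆向檔案頻率):
$$
idf_i=\log\frac{文章總數}{包含某詞的文章數量}=\log\frac{N}{df_i}
$$

包含某詞的文章越少，該詞的 idf 就越高。最後將兩個指標相乘，即為該詞在文章中的 tf-idf：
$$
tfidf_{i,j}=tf_{i,j}\times idf_i
$$

注意：下方使用的 scikit-learn `TfidfVectorizer` 在預設設定下，tf 採用詞在文章中出現的原始次數 (不除以總詞數)，idf 則採用平滑後的版本 $idf_i=\ln\frac{1+N}{1+df_i}+1$，因此算出的數值會與上述公式略有不同。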

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# norm=None: the tf-idf rows are left un-normalized.
# NOTE: this rebinds `vectorizer`, replacing the CountVectorizer fitted earlier.
vectorizer = TfidfVectorizer(norm=None) 
# One space-joined string per review is passed to the vectorizer.
# NOTE(review): the vocabulary shown below contains only multi-character words,
# so the default tokenizer presumably drops single-character tokens and
# punctuation here as well. Confirm against the scikit-learn documentation.
tfidf = vectorizer.fit_transform([' '.join(x) for x in sentences])

In [None]:
# Convert the sparse tf-idf matrix returned by fit_transform into a dense NumPy
# array. The guard keeps the cell re-runnable: once `tfidf` is already an
# ndarray it has no .toarray() method, so the unguarded call raised
# AttributeError on a second execution.
if hasattr(tfidf, 'toarray'):
    tfidf = tfidf.toarray()

In [None]:
# Print the shape of the tf-idf matrix and display the tf-idf vector of the first sentence
print('tfidf: ', tfidf.shape)
tfidf[0]

tfidf:  (7765, 24222)


array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Save the fitted vectorizer together with the dense tf-idf matrix as one
# pickled two-element list.
with open("Data/htl_tfidf.pickle", "wb") as file:
    pickle.dump([vectorizer, tfidf], file)

### 使用 tf-idf 挑出前 10 個重要性最高的詞

In [None]:
# Invert the vocabulary into a reverse lookup {column index: word}
id2word = {
    column: word
    for word, column in vectorizer.vocabulary_.items()
}
id2word

{20690: '距離',
 8469: '川沙',
 4019: '公路',
 2958: '但是',
 3969: '公交',
 10494: '指示',
 7486: '如果',
 19016: '蔡陸線',
 16058: '的話',
 23063: '非常',
 23958: '麻煩',
 9020: '建議',
 4419: '別的',
 20724: '路線',
 9935: '房間',
 17159: '簡單',
 5861: '商務',
 15219: '牀房',
 11446: '整體',
 9683: '感覺',
 17557: '經濟',
 7923: '實惠',
 2180: '不錯',
 11729: '早餐',
 14822: '無論',
 6820: '多少',
 21589: '那邊',
 23411: '食品',
 21686: '酒店',
 9776: '應該',
 21800: '重視',
 1508: '一下',
 5931: '問題',
 12393: '本身',
 20398: '賓館',
 19275: '街道',
 2038: '不大',
 21556: '還好',
 4804: '北京',
 14930: '熱心',
 5554: '同胞',
 9173: '很多',
 19790: '設施',
 2811: '介紹',
 8552: '差不多',
 16613: '確實',
 4620: '加上',
 2983: '低價位',
 6260: '因素',
 21559: '還是',
 14827: '無超所值',
 15600: '環境',
 19281: '衚衕',
 7673: '安靜',
 11440: '整潔',
 11999: '暖氣',
 7468: '好足',
 3696: '優勢',
 4263: '出發',
 13068: '步行',
 2002: '不到',
 4350: '分鐘',
 5380: '可以',
 12651: '梅蘭芳',
 11357: '故居',
 17066: '等等',
 2703: '京味',
 8234: '小衚衕',
 4829: '北海',
 17707: '總之',
 10780: '推薦給',
 17143: '節約',
 14026: '消費',
 1839

In [None]:
# Average tf-idf of each word, taken over the reviews that actually contain it.
# doc_count[i] is the number of reviews in which word i has a non-zero weight.
# It is computed once because the comparison materializes a boolean array as
# large as `tfidf` itself.
doc_count = (tfidf != 0).sum(axis=0)

# Words found in fewer than 10 reviews get an average of 0, which pushes them to
# the bottom of the descending ranking built in the next cell.
# np.maximum(doc_count, 1) only prevents a 0/0 warning for a column without any
# non-zero entry; such a column is below the threshold and ends up 0 anyway.
avg = np.where(doc_count >= 10, tfidf.sum(axis=0) / np.maximum(doc_count, 1), 0)

In [None]:
# Column indices of the 10 largest average tf-idf values, largest first
most_avg_id = np.argsort(avg)[::-1][:10].tolist()
# Translate those column indices back into words through the reverse lookup
features = [id2word[word_id] for word_id in most_avg_id]

print("Top 10 tfidf's word index: \n", most_avg_id)
print("Top 10 tfidf's word: \n", features)

Top 10 tfidf's word index: 
 [8969, 22346, 7788, 8061, 20120, 8758, 11415, 13663, 8081, 13811]
Top 10 tfidf's word: 
 ['廬山', '閣下', '客棧', '尊重', '證件', '平方米', '敦煌', '泰山', '對方', '洲際']


In [None]:
# Table of the top-10 tf-idf weights per review, for comparison with the raw text.
# np.ix_ selects the wanted rows (corp.idx) and the 10 columns in one indexing
# step. The chained form tfidf[rows, :][:, cols] first copied every column of
# the dense matrix before discarding all but 10 of them.
data = pd.DataFrame(tfidf[np.ix_(corp.idx.to_numpy(), most_avg_id)], columns=features)
data[365:375]

Unnamed: 0,廬山,閣下,客棧,尊重,證件,平方米,敦煌,泰山,對方,洲際
365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
368,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
369,0.0,0.0,0.0,0.0,0.0,0.0,7.318453,0.0,0.0,0.0
370,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
371,0.0,0.0,0.0,0.0,0.0,0.0,7.318453,0.0,0.0,0.0
372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
374,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Show the raw reviews for comparison with the tf-idf table.
# NOTE: .loc slices by label and includes the end label, so this displays
# rows 365 through 375 (11 rows), one more than data[365:375].
corp.loc[365:375, 'review']

365    酒店環境還可以，距離公交車站比較近，出行方便。一般的服務員很和氣會主動向你問好，倒是看着象管...
366    前臺接待服務太差，去前臺辦手續的時候愛理不理的，，我住的是3棟，不跟前臺一棟房子，很難找，也...
367    選新興主要就是看重他的地理位置，機場大巴到門口，到西站也很近，出門逛街有地鐵，緊挨城鄉貿易中...
368     已經去過一次了，這次也是從攜程訂的房間，但是好像跟前臺溝通的不好，到了居然找不到我們的名字，奇怪
369    酒店離長途汽車站很近，走路5分鐘就到了。整體環境不錯，房間很乾淨。我住的是標準間，作爲三星級...
370    坦白說，酒店比我想象中的好。在三星酒店中屬於較好的一類。員工的服務意識很強，不象是內地城市。...
371    住的是680元一晚的標準間，我與老婆都住的很開心。我們這趟從蘭州去嘉峪關再來到敦煌，只有這家...
372                 酒店環境不錯，比較有特點，附樓比較一般。沒怎麼用服務員，感覺服務中上吧。
373      自然環境一流，賓館內部裝飾（附樓）也相當不錯，說不上豪華，但是很整潔清爽。總體感覺是寧靜寫意。
374             環境很好，花園很大，服務生態度也很好，距離機場很近很方便，不錯，下次考慮再次入住
375    離湖州市區比較遠,打車不是很方便.不過門口有公交車,班次還比較多.最好自己有車.湖景房,並不...
Name: review, dtype: object