## 搭建一個bag of words模型

---

In [1]:
import pandas as pd
import nltk
#nltk.download()
import numpy as np

# Load the tab-separated review file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are treated as ordinary text.
dataset = pd.read_csv(
    './D00_Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
# Values of the 'Review' column as a NumPy array.
corpus = dataset['Review'].values

### 從文本中取出所有單字

In [2]:
# Tokenize every review and flatten all tokens into one list (duplicates kept).
# Built directly as a comprehension rather than calling append() inside a
# throwaway comprehension, which would also allocate a useless list of None.
whole_words = [
    word
    for sentence in corpus
    for word in nltk.word_tokenize(sentence)
]
print(f'未移除重複單字時共有{len(whole_words)}個單字')

未移除重複單字時共有12684個單字


### 移除重複單字

In [3]:
# Remove duplicate tokens while keeping first-occurrence order.
# dict.fromkeys() preserves insertion order and de-duplicates in linear time;
# list(set(whole_words)) would also de-duplicate, but in arbitrary order, and
# testing `word not in words` against a growing list is quadratic.
words = list(dict.fromkeys(whole_words))
whole_words = words
print(f'共有{len(whole_words)}個單字')

共有2351個單字


### 建立字典使每一個單字有對應數值

In [4]:
# Forward mapping: token -> integer index (its position in whole_words).
word_index = {word: position for position, word in enumerate(whole_words)}
# Reverse mapping: integer index -> token.
index_word = dict(enumerate(whole_words))
# Total number of tokens indexed above.
n = len(whole_words)

In [5]:
# Print the first thirty word -> index entries, one per line, with the word
# left-aligned in a 10-character column.
print('\n'.join(
    f'{word:<10} : {index}'
    for word, index in list(word_index.items())[:30]
))

Wow        : 0
...        : 1
Loved      : 2
this       : 3
place      : 4
.          : 5
Crust      : 6
is         : 7
not        : 8
good       : 9
Not        : 10
tasty      : 11
and        : 12
the        : 13
texture    : 14
was        : 15
just       : 16
nasty      : 17
Stopped    : 18
by         : 19
during     : 20
late       : 21
May        : 22
bank       : 23
holiday    : 24
off        : 25
Rick       : 26
Steve      : 27
recommendation : 28
loved      : 29


In [6]:
# Print the first thirty index -> word entries, one per line, with the index
# left-aligned in a 2-character column.
print('\n'.join(
    f'{index!s:<2} : {word}'
    for index, word in list(index_word.items())[:30]
))

0  : Wow
1  : ...
2  : Loved
3  : this
4  : place
5  : .
6  : Crust
7  : is
8  : not
9  : good
10 : Not
11 : tasty
12 : and
13 : the
14 : texture
15 : was
16 : just
17 : nasty
18 : Stopped
19 : by
20 : during
21 : late
22 : May
23 : bank
24 : holiday
25 : off
26 : Rick
27 : Steve
28 : recommendation
29 : loved


## 轉換句子為 bag of words 形式

In [7]:
def _get_bag_of_words_vector(sentence, word_index_dic, whole_words):
    """Convert a sentence into a bag-of-words count vector.

    Parameters
    ----------
    sentence : str
        Text to encode; it is tokenized with ``nltk.word_tokenize``.
    word_index_dic : dict
        Maps each vocabulary token to its position in the output vector.
    whole_words : sequence
        The vocabulary. Only its length is used, to size the output vector.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(whole_words)``. Entry ``i`` is the
        number of times the token mapped to index ``i`` occurs in
        ``sentence``. Tokens missing from ``word_index_dic`` are ignored.
    """
    vector = np.zeros(len(whole_words))
    for word in nltk.word_tokenize(sentence):
        # A single dict lookup decides membership and yields the index;
        # scanning the vocabulary list for every token would be O(V) each time.
        position = word_index_dic.get(word)
        if position is not None:
            vector[position] += 1
    return vector

In [8]:
# Make NumPy print arrays in full instead of abbreviating them with an ellipsis.
np.set_printoptions(threshold=np.inf)

# Encode one sample review; the resulting vector is shown as the cell output.
_get_bag_of_words_vector(
    sentence='Wow... Loved this place.',
    word_index_dic=word_index,
    whole_words=whole_words,
)

array([1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.