# 斷詞

## 英文斷詞

In [21]:
# English tokenization: as the printed result shows, the sentence comes back
# as a list of lower-cased words with the punctuation removed.
from tensorflow.keras.preprocessing.text import text_to_word_sequence

english_tokens = text_to_word_sequence("Hi, I am Vanson. How are you?")
print("英文斷詞：", english_tokens)

英文斷詞： ['hi', 'i', 'am', 'vanson', 'how', 'are', 'you']


## 中文斷詞

In [12]:
# Install jieba（結巴）
!pip install jieba

# Get the Tokenization Dictionary for Traditional Chinese
import os
Dictionary_File = 'dict.txt.big'

if not os.path.isfile(Dictionary_File):
    os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + Dictionary_File)

# Get the Stop Words File for Traditional Chinese
StopWords_File = "stopWords_big5.txt"

if not os.path.isfile(StopWords_File):
    os.system('wget https://raw.githubusercontent.com/cnchi/datasets/master/' + StopWords_File)



In [13]:
import jieba

# Use the Traditional Chinese dictionary for segmentation.
jieba.set_dictionary(Dictionary_File)

# Tokenization: punctuation marks are returned as tokens of their own.
result = list(jieba.cut("我喜歡跑步，你呢？"))
print("中文斷詞（有標點）：", result)

# Remove stop words using an inline set of single characters
# (punctuation, digits, half-width and full-width spaces).
# The backslash is written as "\\" because "\(" is not a valid escape
# sequence: Python keeps the backslash but emits a warning for it. The
# resulting set of characters is unchanged.
# Copy and paste this string rather than retyping it.
stopWords = set("$!&#%\\()+-*/_,. 　?:;'\"<=>^`|~[]{}’0123456789?_“”、。《》！，：；？「」（）")
print("中文斷詞（無標點）：", [word for word in result if word not in stopWords])

# Remove stop words using the stop-word file: one entry per line, with the
# surrounding whitespace (including the trailing newline) stripped.
with open(StopWords_File, "rt", encoding="utf-8") as f:
    stopWords = {line.strip() for line in f}
print("中文斷詞（更精簡）：", [word for word in result if word not in stopWords])

Building prefix dict from /content/dict.txt.big ...
DEBUG:jieba:Building prefix dict from /content/dict.txt.big ...
Dumping model to file cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.u501edca284da514cb68b53a20324f4e3.cache
Loading model cost 1.436 seconds.
DEBUG:jieba:Loading model cost 1.436 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


中文斷詞（有標點）： ['我', '喜歡', '跑步', '，', '你', '呢', '？']
中文斷詞（無標點）： ['我', '喜歡', '跑步', '你', '呢']
中文斷詞（更精簡）： ['喜歡', '跑步']


# 文字數位化

In [14]:
# Create a Tokenizer object
from tensorflow.keras.preprocessing.text import Tokenizer

tk = Tokenizer(
        num_words=None,
        # Characters to strip from the text before splitting. These are
        # individual characters (punctuation, tab, newline), not stop words.
        # The backslash is doubled because "\]" is not a valid escape
        # sequence; the runtime string is identical to the previous one.
        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
        lower=True,
        split=' ',
        char_level=False, # whether to tokenize down to the character level
        oov_token='NiD' # out-of-vocabulary token: stands in for words missing from the mapping table (any marker string works)
    )

In [15]:
# Build the mapping table from a corpus. Fitting on a domain-specific corpus
# gives a domain-specific dictionary, which is very handy.
corpus = [
    "I love jogging, and you?",
    "I love reading!",
]
tk.fit_on_texts(corpus)

# Show the mapping table in both directions:
# first word -> number, then number -> word.
for mapping in (tk.word_index, tk.index_word):
    print(mapping)

{'NiD': 1, 'i': 2, 'love': 3, 'jogging': 4, 'and': 5, 'you': 6, 'reading': 7}
{1: 'NiD', 2: 'i', 3: 'love', 4: 'jogging', 5: 'and', 6: 'you', 7: 'reading'}


In [16]:
# Sentences used to try out the fitted tokenizer.
input_text = [
    "I love jogging!",
    "and I love reading, too!",
]

# Text -> sequences of numbers. As the printed result shows, a word that is
# not in the mapping table ("too") is encoded as the 'NiD' index.
seq = tk.texts_to_sequences(input_text)
print(seq)

# Sequences of numbers -> text.
text = tk.sequences_to_texts(seq)
print(text)

[[2, 3, 4], [5, 2, 3, 7, 1]]
['i love jogging', 'and i love reading NiD']


# 序列對齊（Sequence Alignment）

In [17]:
# Align all sequences to the same length.
from tensorflow.keras.preprocessing.sequence import pad_sequences

padding_options = dict(
    maxlen=5,            # every sequence is adjusted to five entries
    dtype="int32",
    padding="pre",       # short sequences get their filler at the front
    truncating="post",   # long sequences lose their tail (in English the important part tends to come first)
    value=0,             # filler value
)

# `seq` is the list of sequences to be aligned.
padded_seq = pad_sequences(seq, **padding_options)

print(padded_seq)

[[0 0 2 3 4]
 [5 2 3 7 1]]


# 編碼（Encoding）

In [18]:
# One-Hot Encoding
from tensorflow.keras.utils import to_categorical

print("獨熱編碼 -------------")
print(to_categorical(padded_seq))

獨熱編碼 -------------
[[[1. 0. 0. 0. 0. 0. 0. 0.]
  [1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 1. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 1. 0. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 0. 1.]
  [0. 1. 0. 0. 0. 0. 0. 0.]]]


In [19]:
# Multi-Hot Encoding
print("多熱編碼 -------------")
print(tk.texts_to_matrix(input_text))

多熱編碼 -------------
[[0. 0. 1. 1. 1. 0. 0. 0.]
 [0. 1. 1. 1. 0. 1. 0. 1.]]


In [20]:
# Word Embedding
import tensorflow as tf
from tensorflow.keras import layers

# Size the lookup table from the fitted tokenizer instead of hard-coding 8:
# one row per entry of tk.word_index (which already contains the 'NiD' token)
# plus one row for index 0, the value pad_sequences uses as filler.
# With the current corpus this evaluates to 8, exactly as before, but it stays
# correct when the corpus changes.
vocab_size = len(tk.word_index) + 1
embedding_dim = 3  # length of the vector produced for each index

emb = layers.Embedding(vocab_size, embedding_dim)

# tf.constant(): Convert immediate values into tensor
# NOTE: the layer weights are randomly initialised, so the printed numbers
# differ from run to run; only the shape is stable.
result = emb(tf.constant(padded_seq))
print("詞向量嵌入 -------------")
print(result.numpy())

詞向量嵌入 -------------
[[[ 0.01349271  0.04462766 -0.0275035 ]
  [ 0.01349271  0.04462766 -0.0275035 ]
  [-0.01734374  0.01170375 -0.01166357]
  [ 0.01787699  0.02774879 -0.02931147]
  [-0.01099999 -0.04230637  0.03668027]]

 [[ 0.04608779 -0.01086671 -0.01030164]
  [-0.01734374  0.01170375 -0.01166357]
  [ 0.01787699  0.02774879 -0.02931147]
  [ 0.01124601 -0.03104751  0.02994614]
  [ 0.03303644  0.0144292  -0.01541664]]]
