# 下載路透社資料集：內含多則新聞文本，文字已轉換為整數索引（word index）序列，且每則新聞對應到 46 種主題中的其中一種。

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

In [3]:
from tensorflow.keras.datasets import reuters

# Load the Reuters newswire dataset with the vocabulary capped at 10000 words.
# Each split comes back as a (samples, labels) pair.
train_split, test_split = reuters.load_data(num_words=10000)
tr_data, tr_label = train_split
te_data, te_label = test_split

# Report the shape of every array that was just loaded.
for caption, array in (
    ("train data shape", tr_data),
    ("train label shape", tr_label),
    ("test data shape", te_data),
    ("test label shape", te_label),
):
    print(caption, array.shape)

train data shape (8982,)
train label shape (8982,)
test data shape (2246,)
test label shape (2246,)


In [4]:
# Peek at the first training sample: its sequence of word ids and its topic id.
for caption, sample in (
    ("train 1 --- ", tr_data[0]),
    ("train Label 1 --- ", tr_label[0]),
):
    print(caption, sample)

# Word -> index dictionary shipped with the dataset.
# NOTE(review): per the Keras docs, load_data offsets word ids by `index_from`
# (default 3), so this raw index may not equal the id used for the same word
# inside tr_data -- confirm before decoding sequences with it.
index_of_word = reuters.get_word_index()
you_index = index_of_word['you']
print("you index --- ", you_index)

train 1 ---  [1, 2, 2, 8, 43, 10, 447, 5, 25, 207, 270, 5, 3095, 111, 16, 369, 186, 90, 67, 7, 89, 5, 19, 102, 6, 19, 124, 15, 90, 67, 84, 22, 482, 26, 7, 48, 4, 49, 8, 864, 39, 209, 154, 6, 151, 6, 83, 11, 15, 22, 155, 11, 15, 7, 48, 9, 4579, 1005, 504, 6, 258, 6, 272, 11, 15, 22, 134, 44, 11, 15, 16, 8, 197, 1245, 90, 67, 52, 29, 209, 30, 32, 132, 6, 109, 15, 17, 12]
train Label 1 ---  3
you index ---  1025


In [5]:
from tensorflow.keras.preprocessing import sequence

# Force every article to a fixed length of 200 word ids so that each split
# becomes a regular 2-D array of shape (n_samples, word_len).
word_len = 200
tr_data_new, te_data_new = (
    sequence.pad_sequences(articles, maxlen=word_len)
    for articles in (tr_data, te_data)
)

print("train data new shape", tr_data_new.shape)
print("test data new shape", te_data_new.shape)

train data new shape (8982, 200)
test data new shape (2246, 200)


In [7]:
# The dataset is known to contain 46 topics.
total_class = 46

# One-hot encode the integer topic labels of both splits.
tr_encode, te_encode = (
    tf.one_hot(indices=topic_ids, depth=total_class)
    for topic_ids in (tr_label, te_label)
)

In [10]:
from tensorflow import keras
from tensorflow.keras import layers

# Topic classifier: learned word embeddings -> two stacked LSTMs -> softmax
# over the `total_class` topics.
model = keras.Sequential([
    # Declare the fixed sequence length with an explicit Input instead of
    # Embedding's `input_length` argument, which is deprecated in Keras 3.
    keras.Input(shape=(word_len,), dtype="int32"),
    # input_dim must match the num_words=10000 passed to reuters.load_data.
    layers.Embedding(input_dim=10000, output_dim=200),
    # return_sequences=True so the second LSTM receives the whole sequence.
    layers.LSTM(128, dropout=0.5, return_sequences=True),
    layers.LSTM(128, dropout=0.5),
    layers.Dense(total_class, activation='softmax'),
])

In [11]:
# Training hyper-parameters.
batch, epoch = 32, 25

# Categorical cross-entropy pairs with the one-hot encoded labels.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

# Train, holding out 20% of the training data for validation;
# verbose=2 prints one summary line per epoch.
model.fit(
    x=tr_data_new,
    y=tr_encode,
    batch_size=batch,
    epochs=epoch,
    validation_split=0.2,
    verbose=2,
)

# Final score on the untouched test split.
loss, acc = model.evaluate(x=te_data_new, y=te_encode)
print('test acc:', acc)

Epoch 1/25
225/225 - 36s - 159ms/step - acc: 0.4720 - loss: 2.0635 - val_acc: 0.4363 - val_loss: 2.1181
Epoch 2/25
225/225 - 38s - 167ms/step - acc: 0.5503 - loss: 1.7486 - val_acc: 0.5771 - val_loss: 1.6808
Epoch 3/25
225/225 - 37s - 166ms/step - acc: 0.5891 - loss: 1.6158 - val_acc: 0.5838 - val_loss: 1.6759
Epoch 4/25
225/225 - 37s - 166ms/step - acc: 0.6206 - loss: 1.4981 - val_acc: 0.6016 - val_loss: 1.5385
Epoch 5/25
225/225 - 38s - 170ms/step - acc: 0.6349 - loss: 1.4233 - val_acc: 0.6038 - val_loss: 1.5684
Epoch 6/25
225/225 - 39s - 174ms/step - acc: 0.6548 - loss: 1.3294 - val_acc: 0.6372 - val_loss: 1.4198
Epoch 7/25
225/225 - 38s - 169ms/step - acc: 0.6692 - loss: 1.2646 - val_acc: 0.6366 - val_loss: 1.3867
Epoch 8/25
225/225 - 38s - 167ms/step - acc: 0.6841 - loss: 1.2012 - val_acc: 0.6672 - val_loss: 1.3474
Epoch 9/25
225/225 - 38s - 168ms/step - acc: 0.7040 - loss: 1.1277 - val_acc: 0.6644 - val_loss: 1.3600
Epoch 10/25
225/225 - 41s - 182ms/step - acc: 0.7194 - loss: 1.0

array([[   0,    0,    0, ...,  510,   17,   12],
       [   4,   96, 1043, ...,  760,   17,   12],
       [   7, 2775,   33, ...,    8,   17,   12],
       ...,
       [   0,    0,    0, ...,   11,   17,   12],
       [   0,    0,    0, ...,    8,   17,   12],
       [   4,  377,  101, ..., 1523,   17,   12]],
      shape=(2246, 200), dtype=int32)