In [1]:
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 1. 讀入深度學習套件

In [2]:
from tensorflow.keras.preprocessing import sequence # pads/truncates the inputs so they all have the same length
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding # Embedding: layer for text input; learns a dense vector per word index (a trainable lookup table, not a one-hot encoding)
from tensorflow.keras.layers import LSTM
from tensorflow.keras.datasets import imdb

### 2. 讀入數據

In [3]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words = 10000) # num_words: vocabulary size, i.e. how many of the most frequent words are kept

In [4]:
# Number of reviews in the training split.
len(x_train)

25000

In [5]:
# Number of reviews in the test split.
len(x_test)

25000

In [6]:
x_train[0] # each integer stands for a word, encoded by that word's frequency rank

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [7]:
len(x_train[0]) # number of words in review #0

218

In [8]:
len(x_train[1]) # number of words in review #1

189

In [9]:
y_train[0] # 1 = positive review

1

In [10]:
y_train[1] # 0 = negative review

0

### 3. 資料處理

In [11]:
### Although an RNN can take variable-length input, make every input the same length first ###
# maxlen = 100: reviews shorter than 100 words are padded with 0; reviews longer
# than 100 words have the excess removed.
# NOTE(review): with the default arguments Keras is documented to pad and truncate
# at the START of each sequence (so the last 100 words are kept) -- confirm.
x_train = sequence.pad_sequences(x_train, maxlen = 100)
x_test = sequence.pad_sequences(x_test, maxlen = 100)

### 4. 打造一個函數學習機

In [12]:
model = Sequential() # start from an empty model ("function learner") to which layers are added one by one

In [13]:
# Embedding(10000, 128): 10000 is the vocabulary size (the same value passed as
# num_words when loading the data); 128 is the length of the vector learned for
# each word.
# Course note: the RNN layer's activation functions do not need to be set
# explicitly; the defaults are fine.
model.add(Embedding(10000, 128))

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [14]:
model.add(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))
    # dropout: keeps the model from "memorizing the answers"; the fraction of the
    #          layer's input units that are randomly dropped at each training step
    # recurrent: refers to the state carried over from previous timesteps
    # recurrent_dropout: dropout applied to the hidden state
    # Course note: the instructor said it is actually better NOT to use 128 here
    # (i.e. not the same size as the previous layer's output).

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [15]:
model.add(Dense(1, activation = 'sigmoid')) # 1 output unit: the result is only positive vs. negative

In [16]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
    # Course note -- advantages of Adam: 1. it is faster; 2. it uses momentum
    # (keeps the update direction from changing too abruptly, so it descends steadily)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


In [17]:
model.summary()
# embedding: 10000 * 128 = 1,280,000 (a plain lookup table, so there is no bias term)
# lstm (LSTM) (None, 128): the 128 here (number of LSTM units) can be changed to another value
# lstm: 4 * 128 * (128 + 128 + 1) = 131,584
#       4   = 3 gates + 1 RNN-like block (the part that computes the cell state)
#       128 = number of units; each unit has 128 input weights + 128 recurrent weights + 1 bias
# dense: 128 + 1 (bias) = 129

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


### 5. 訓練

In [17]:
import os
# Set CUDA_VISIBLE_DEVICES to "-1", presumably to hide the GPUs so training runs on the CPU.
# NOTE(review): this runs after TensorFlow has been imported and the model built;
# whether it still takes effect depends on when TensorFlow initializes its GPU
# devices -- confirm, or move both lines to the top of the notebook, before the
# TensorFlow imports.
os.environ["CUDA_VISIBLE_DEVICES"]="-1"

In [18]:
model.fit(x_train, y_train, batch_size = 32, epochs = 10, validation_data = (x_test, y_test)) # course note: for this RNN, having a GPU or not makes no difference
# validation_data = (x_test, y_test): the error is computed on the test set;
# the test data itself is not used for training.
# NOTE(review): the "GPU makes no difference" remark comes from the course notes and
# likely depends on the layer configuration and hardware -- confirm before relying on it.
# NOTE(review): because the test set doubles as the validation set, no untouched
# hold-out data remains if these validation scores are used to tune the model.

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x20d376a3cc8>

### 6. 儲存模型參數

In [20]:
# Save the trained model to a single HDF5 (.h5) file.
model.save('myRNNmodel_classExample.h5')

In [24]:
# Save the model in two parts: the architecture as JSON and the weights as HDF5.
# The file names are kept exactly as in the original notebook (including the
# "imbd" spelling), since other code may load these files by name.
model_json = model.to_json()
# Use a context manager so the file is flushed and closed as soon as the write
# finishes, instead of leaving an open handle for the garbage collector.
with open('imbd_model_architecture.json', 'w') as json_file:
    json_file.write(model_json)
model.save_weights('imbd_model_weights.h5')