[参考](https://qiita.com/nymwa/items/4542b45837a10766890b)  
言語処理100本ノック 2020 第9章: RNN, CNN   
@nymwa

# 80

In [1]:
import re
import spacy
import tensorflow as tf
from collections import Counter

In [2]:
nlp = spacy.load('en')
categories = ['b', 't', 'e', 'm']
category_names = ['business', 'science and technology', 'entertainment', 'health']

In [3]:
def tokenize(x):
    x = re.sub(r'\s+', ' ', x)
    x = nlp.make_doc(x)
    x = [d.text for d in x]
    return x

def read_feature_dataset(filename):
    with open(filename) as f:
        dataset = f.read().splitlines()
    dataset = [line.split('\t') for line in dataset]
    t_index = [categories.index(line[0]) for line in dataset]
    dataset_t = []
    for index in t_index:
        label = [0]*4
        label[index] = 1
        dataset_t.append(label)
    dataset_x = [tokenize(line[1]) for line in dataset]
    return dataset_x, dataset_t

In [4]:
train_x, train_t = read_feature_dataset('data/train.txt')
valid_x, valid_t = read_feature_dataset('data/valid.txt')
test_x, test_t = read_feature_dataset('data/test.txt')

In [5]:
counter = Counter([
    x
    for sent in train_x
    for x in sent
])

vocab_in_train = [
    token
    for token, freq in counter.most_common()
    if freq > 1
]
len(vocab_in_train)

9700

In [6]:
vocab_in_train

['-',
 "'",
 'to',
 ',',
 '...',
 "'s",
 'in',
 'on',
 'UPDATE',
 ':',
 'as',
 'of',
 'for',
 'The',
 'US',
 'To',
 'the',
 'and',
 '$',
 '"',
 'In',
 'Of',
 'at',
 'a',
 '(',
 ')',
 'With',
 'Is',
 'For',
 'A',
 'And',
 'with',
 'after',
 'New',
 '?',
 'Kardashian',
 ';',
 'On',
 'China',
 'up',
 'by',
 'Kim',
 'After',
 'says',
 '1',
 'At',
 'is',
 'STOCKS',
 '!',
 'Fed',
 'From',
 'new',
 "n't",
 'ECB',
 '2',
 'from',
 'her',
 'Wall',
 'It',
 'shares',
 'Says',
 'FOREX',
 'data',
 'First',
 'About',
 'Miley',
 'Cyrus',
 'Euro',
 'CEO',
 'over',
 'West',
 'Dollar',
 'You',
 'St',
 'she',
 'May',
 'bln',
 'Chris',
 'Over',
 'Ukraine',
 'Will',
 'Stocks',
 'More',
 'Kanye',
 'As',
 'Be',
 'Up',
 'Are',
 'Google',
 'be',
 '.',
 'Justin',
 'Bieber',
 'off',
 'Billion',
 'I',
 'Star',
 'profit',
 'euro',
 '2014',
 'GLOBAL',
 'Bank',
 'are',
 'Time',
 'out',
 'RPT',
 'but',
 'Not',
 'How',
 'sales',
 'deal',
 'it',
 'more',
 'That',
 'UK',
 'that',
 'Day',
 'Gold',
 'pct',
 'What',
 'Apple

In [7]:
vocab_list = ['[UNK]'] + vocab_in_train
vocab_dict = {x:n for n, x in enumerate(vocab_list)}

In [8]:
vocab_dict

{'[UNK]': 0,
 '-': 1,
 "'": 2,
 'to': 3,
 ',': 4,
 '...': 5,
 "'s": 6,
 'in': 7,
 'on': 8,
 'UPDATE': 9,
 ':': 10,
 'as': 11,
 'of': 12,
 'for': 13,
 'The': 14,
 'US': 15,
 'To': 16,
 'the': 17,
 'and': 18,
 '$': 19,
 '"': 20,
 'In': 21,
 'Of': 22,
 'at': 23,
 'a': 24,
 '(': 25,
 ')': 26,
 'With': 27,
 'Is': 28,
 'For': 29,
 'A': 30,
 'And': 31,
 'with': 32,
 'after': 33,
 'New': 34,
 '?': 35,
 'Kardashian': 36,
 ';': 37,
 'On': 38,
 'China': 39,
 'up': 40,
 'by': 41,
 'Kim': 42,
 'After': 43,
 'says': 44,
 '1': 45,
 'At': 46,
 'is': 47,
 'STOCKS': 48,
 '!': 49,
 'Fed': 50,
 'From': 51,
 'new': 52,
 "n't": 53,
 'ECB': 54,
 '2': 55,
 'from': 56,
 'her': 57,
 'Wall': 58,
 'It': 59,
 'shares': 60,
 'Says': 61,
 'FOREX': 62,
 'data': 63,
 'First': 64,
 'About': 65,
 'Miley': 66,
 'Cyrus': 67,
 'Euro': 68,
 'CEO': 69,
 'over': 70,
 'West': 71,
 'Dollar': 72,
 'You': 73,
 'St': 74,
 'she': 75,
 'May': 76,
 'bln': 77,
 'Chris': 78,
 'Over': 79,
 'Ukraine': 80,
 'Will': 81,
 'Stocks': 82,
 'Mo

In [9]:
def sent_to_ids(sent):
    return tf.constant([vocab_dict[x if x in vocab_dict else '[UNK]'] for x in sent], dtype=tf.int64)

In [10]:
print(train_x[0])
print(sent_to_ids(train_x[0]).numpy())

['White', 'House', 'Science', 'Fair', 'Will', 'Focus', 'On', 'Girls', 'In', 'STEM']
[ 482  454 3026 1563   81 2095   38  941   21    0]


In [11]:
def dataset_to_ids(dataset):
    return [sent_to_ids(x) for x in dataset]

In [12]:
train_s = dataset_to_ids(train_x)
valid_s = dataset_to_ids(valid_x)
test_s = dataset_to_ids(test_x)

# 81

In [13]:
max_len = 0
for x in train_s:
    if x.shape[0] > max_len:
        max_len = x.shape[0]
for x in valid_s:
    if x.shape[0] > max_len:
        max_len = x.shape[0]
max_len

25

In [14]:
def padding_zero(data, max_len):
    ret = []
    for i in range(len(data)):
        zero = [0] * (max_len - data[i].shape[0])
        ret.append(tf.concat([data[i], zero], 0))
    return tf.stack(ret)

In [15]:
train_s = padding_zero(train_s, max_len)
train_t = tf.stack(train_t)
valid_s = padding_zero(valid_s, max_len)
valid_t = tf.stack(valid_t)
test_s = padding_zero(test_s, max_len)
test_t = tf.stack(test_t)

In [16]:
train_s.shape

TensorShape([10684, 25])

In [17]:
train_t.shape

TensorShape([10684, 4])

In [18]:
def loss(t, y):
    return tf.keras.backend.mean(tf.keras.losses.categorical_crossentropy(t,y))
def acc(t,y):
    return tf.keras.backend.mean(tf.keras.metrics.categorical_accuracy(t,y))

In [19]:
class RNN81(tf.keras.Model):
    def __init__(self, v_size, e_size, h_size, c_size, dropout=0.2):
        super().__init__()
        self.emb = tf.keras.layers.Embedding(
            input_dim=v_size, 
            output_dim=e_size,
            mask_zero=True
        )
        self.rnn = tf.keras.layers.LSTM(
            h_size
        )
        self.out = tf.keras.layers.Dense(
            c_size
        )

    def call(self, x):
        x = self.emb(x)
        x = self.rnn(x)
        y = tf.nn.softmax(self.out(x))
        return y

In [20]:
model = RNN81(len(vocab_dict), 300, 50, 4)
model.compile(optimizer=tf.keras.optimizers.SGD(learning_rate=0.1),
              loss=loss,
              metrics=['accuracy']
             )
model.build(train_s.shape)
model.summary()

Model: "rn_n81"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  2910300   
_________________________________________________________________
lstm (LSTM)                  multiple                  70200     
_________________________________________________________________
dense (Dense)                multiple                  204       
Total params: 2,980,704
Trainable params: 2,980,704
Non-trainable params: 0
_________________________________________________________________


In [21]:
y = model(test_s)
t = test_t
acc(t, y).numpy()

0.19910179

# 82

In [22]:
!rm -r ./log/82

rm: ./log/82: No such file or directory


In [25]:
tb_cb = tf.keras.callbacks.TensorBoard(
    log_dir='./log/82',
    histogram_freq=1,
    write_images=True
)

In [26]:
%reload_ext tensorboard
%tensorboard --logdir log/82 --bind_all --reload_multifile true

Reusing TensorBoard on port 6007 (pid 35563), started 0:00:35 ago. (Use '!kill 35563' to kill it.)

In [27]:
history = model.fit(train_s,train_t,
                    epochs=10, 
                    batch_size=128,
                    validation_data=(valid_s, valid_t),
                    verbose=1,
                    callbacks=[tb_cb]
                   )

Train on 10684 samples, validate on 1336 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
print('学習データでの正解率 :', acc(train_t, model(train_s)).numpy())
print('評価データでの正解率 :', acc(test_t, model(test_s)).numpy())

学習データでの正解率 : 0.7921191
評価データでの正解率 : 0.78443116
