In [None]:
# Python 2/3 compatible print(); this must remain the first statement of the cell.
from __future__ import print_function
import os, sys
# Put the parent directory on sys.path (once) so modules that live one level
# above this notebook can be imported - presumably the local KerasTools
# package imported below; confirm against the repository layout.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

# CUDA device selection is configured through the environment *before* keras
# is imported. NOTE(review): "-1" presumably hides every GPU so the notebook
# runs on CPU only - confirm this is intended.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    
import keras
import KerasTools as KT
import numpy as np

In [None]:
# Settings shared by the cells below.
max_features = 10000  # Top most frequent words to consider
maxlen = 200          # Cut texts after this number of words

print('Load data...')
# load_data returns a (train, test) pair, each of which is a (data, labels) pair.
train_split, test_split = keras.datasets.reuters.load_data(num_words=max_features)
train_data, train_labels = train_split
test_data, test_labels = test_split

In [None]:
def vectorize_sequences(sequences, dimension=10000):
    """Multi-hot encode a collection of integer index sequences.

    Args:
        sequences: sized iterable of index sequences; every index must be a
            valid column index, i.e. smaller than ``dimension``.
        dimension: width of the encoded vectors (number of columns).

    Returns:
        A float array of shape ``(len(sequences), dimension)`` in which
        ``out[i, j]`` is 1.0 when index ``j`` occurs in ``sequences[i]`` and
        0.0 otherwise. Repeated indices still yield a single 1.0.
    """
    multi_hot = np.zeros((len(sequences), dimension))
    # Each ``row`` is a view into ``multi_hot``, so assigning through it
    # fills the matrix in place.
    for row, word_ids in zip(multi_hot, sequences):
        row[word_ids] = 1.
    return multi_hot
# Multi-hot encode the word-index sequences and one-hot encode the labels.
# The vector width is tied to max_features (the num_words used when the data
# was loaded) rather than to vectorize_sequences' hard-coded default, so the
# matrices keep matching the model's input_shape=(max_features,) if the
# vocabulary size is changed.
x_train = vectorize_sequences(train_data, dimension=max_features)
y_train = keras.utils.to_categorical(train_labels)
x_test = vectorize_sequences(test_data, dimension=max_features)
y_test = keras.utils.to_categorical(test_labels)
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)
print('x_test shape:', x_test.shape)
print('y_test shape:', y_test.shape)

In [None]:
# Alternative input representation for the (commented-out) Embedding model:
# integer sequences padded/truncated to maxlen.
#
# The padded arrays get their own names. This cell used to assign them to
# x_train / x_test, which replaced the multi-hot matrices with
# (samples, maxlen) arrays; the Dense model is built with
# input_shape=(max_features,), so running the notebook top to bottom then fed
# it inputs of the wrong shape.
print('Pad sequences (samples x time)')
x_train_seq = keras.preprocessing.sequence.pad_sequences(train_data, maxlen=maxlen)
x_test_seq = keras.preprocessing.sequence.pad_sequences(test_data, maxlen=maxlen)
# The labels do not depend on the input encoding; they are recomputed here
# only so that this cell is usable on its own.
y_train = keras.utils.to_categorical(train_labels)
y_test = keras.utils.to_categorical(test_labels)
print('x_train_seq shape:', x_train_seq.shape)
print('y_train shape:', y_train.shape)
print('x_test_seq shape:', x_test_seq.shape)
print('y_test shape:', y_test.shape)

In [None]:
import collections

# Per-class weights inversely proportional to class frequency:
#   weight[label] = n_samples / (n_classes * count[label])
mapping = collections.Counter(train_labels)
weights = {}
for k, occurrences in mapping.items():
    weights[k] = float(len(train_labels)) / float(len(mapping) * occurrences)

In [None]:
print('Build model...')
# Two-layer dense classifier on the multi-hot vectors: a 46-unit ReLU hidden
# layer followed by a 46-way softmax output.
#
# Alternatives that were tried and are kept here for reference only:
#   keras.layers.Embedding(max_features, 16, input_shape=(maxlen, ), mask_zero=False)
#   keras.layers.Flatten()
#   keras.layers.Dense(64, activation='relu')   # extra hidden layer before the softmax
model = keras.models.Sequential([
    keras.layers.Dense(46, activation='relu', input_shape=(max_features,)),
    keras.layers.Dense(46, activation='softmax'),
])

model.compile(optimizer=keras.optimizers.RMSprop(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
print('Train...')
# Exploratory run: 10% of the training data is held out for validation and
# the per-class weights are applied to the loss.
history = model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=25,
    validation_split=0.1,
    class_weight=weights,
)

In [None]:
# Plot the metrics recorded by the exploratory fit above (the dict stored on
# the History object). NOTE(review): plot_history comes from the local
# KerasTools package; what exactly it draws is not visible here.
KT.plot_history(history.history)

In [None]:
print('Build and train final model...')
# Fresh model with the same architecture as the exploratory one, trained on
# the full training set (no validation split) for a fixed number of epochs
# and then scored on the test set.
#
# Alternatives kept for reference only:
#   keras.layers.Embedding(max_features, 16, input_shape=(maxlen, ), mask_zero=False)
#   keras.layers.Flatten()
#   keras.layers.Dense(64, activation='relu')
model = keras.models.Sequential([
    keras.layers.Dense(46, activation='relu', input_shape=(max_features,)),
    keras.layers.Dense(46, activation='softmax'),
])

model.compile(optimizer=keras.optimizers.RMSprop(),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

final_epochs = 8
model.fit(
    x_train,
    y_train,
    batch_size=256,
    epochs=final_epochs,
    class_weight=weights,
)
# With a single 'accuracy' metric, evaluate() yields the loss and the accuracy.
test_loss, test_acc = model.evaluate(x_test, y_test)

In [None]:
# Attach the final model's test results (and the epoch count it was trained
# for) to the exploratory run's history dict, then plot everything together.
# Note that this modifies history.history in place.
history.history.update({
    'test_loss': test_loss,
    'test_acc': test_acc,
    'epochs': final_epochs,
})
KT.plot_history(history.history)

In [None]:
# Persist the final trained model to "reuters.h5", a path relative to the
# notebook's working directory.
model.save("reuters.h5")

In [None]:
# Build the index -> word lookup. Word ranks are shifted by 3 because the
# indices 0, 1 and 2 are reserved for the padding, start-of-sequence and
# unknown-word markers.
raw_word_index = keras.datasets.reuters.get_word_index()
word_index = {v+3:k for k,v in raw_word_index.items()}
word_index.update({0: '-PAD-', 1: '-START-', 2: '-UNK-'})

# Reconstruct test data entry as string
entry = 301
# Fall back to the '-UNK-' *token* for indices missing from the lookup. The
# previous fallback was the integer 2, which made str.join raise TypeError as
# soon as an unmapped index was encountered.
unknown_token = word_index[2]
print(" ".join(word_index.get(w, unknown_token) for w in test_data[entry]))

# Predict on a one-sample batch (the slice keeps the leading batch axis).
result = model.predict(x_test[entry:entry+1])
# NOTE(review): decode_predictions is a KerasTools helper; the loop below
# assumes it returns, per sample, (class, label, score) tuples - confirm.
prediction = KT.datasets.decode.decode_predictions('reuters', result, top=5)

separator = '-' * 32
print(separator)
print(" Real newswire category: {}".format(test_labels[entry]))
print(separator)
print(" Prediction:")
print(separator)
print("{0:>6} | {1:>12} | {2:>6} ".format("Class", "Label", "Score"))
print(separator)
for p in prediction[0]:
    print("{0:>6} | {1:>12} | {2:>6.2f}% ".format(*p))
print(separator)