In [None]:
import os, sys
import keras
import numpy as np
import collections
import statistics

import tensorflow as tf
import tensorflow.keras as keras

# Raise the TensorFlow logger threshold to ERROR so lower-severity messages are not shown.
tf.get_logger().setLevel('ERROR')

In [None]:
# Step 1

# Load the Reuters dataset: 11,228 newswires from Reuters, labeled over 46 topics.
#
# It is split into a training set and a test set of 8982 / 2246 entries.
# The words of every newswire are tokenized into an integer encoding.

train_split, test_split = keras.datasets.reuters.load_data()
train_data, train_labels = train_split
test_data, test_labels = test_split

# Shapes of both splits, followed by the length of the longest training entry
print(train_data.shape)
print(test_data.shape)
print(max(map(len, train_data)))

In [None]:
# Step 2

# Show the raw data of training set entry 0.
# train_data:   tokenized newswire entries
# train_labels: category of each newswire

print("Raw training entry No 0: {}".format(train_data[0]))
# A bare `print` is only a name lookup in Python 3 and prints nothing;
# it has to be called to emit the blank separator line.
print()
print("Raw training label No 0: '{}'".format(train_labels[0]))

In [None]:
# Step 3

# Use the tokenizer word encoding dictionary to reconstruct the original newswire text.
#
# Note: Tokens 0, 1, and 2 are reserved for 'padding', 'start of sequence', and 'unknown word',
# so the indices of the raw dictionary are shifted by 3 to line up with the dataset tokens.

raw_word_index = keras.datasets.reuters.get_word_index()
# Despite its name, word_index maps token id -> word (the inverse direction of raw_word_index).
word_index = {v+3:k for k,v in raw_word_index.items()}
word_index[0] = '-PAD-'
word_index[1] = '-START-'
word_index[2] = '-UNK-'

# Reconstruct train data entry as string
entry = 202
print("Newswire category: {}".format(train_labels[entry]))
# The fallback for unmapped token ids must be a string: str.join raises
# TypeError if the generator yields the integer 2 instead of the '-UNK-' word.
print(" ".join(word_index.get(w, '-UNK-') for w in train_data[entry]))

In [None]:
# Step 4

# Select the newswire texts that carry a given class label
# and print all of them

category = 5

# (label, position) pairs for every entry of the training set
train_elabels = [(c, i) for i,c in enumerate(train_labels)]
# Positions of the training entries whose label equals `category`
cat = [position for label, position in train_elabels if label == category]
print("Number of entries for category {}: {}".format(category, len(cat)))
print()
for c in cat:
    # Fall back to the '-UNK-' word (a string) for unmapped token ids; an integer
    # fallback would make str.join raise TypeError.
    print(" ".join(word_index.get(w, '-UNK-') for w in train_data[c]))
    print()

In [None]:
# Step 5

# Per-class summary over all 46 classes: number of documents in the train and
# test sets, and the mean length of the training documents.

# Class names, positioned by class index
mapping = ['cocoa','grain','veg-oil','earn','acq','wheat','copper','housing','money-supply',
           'coffee','sugar','trade','reserves','ship','cotton','carcass','crude','nat-gas',
           'cpi','money-fx','interest','gnp','meal-feed','alum','oilseed','gold','tin',
           'strategic-metal','livestock','retail','ipi','iron-steel','rubber','heat','jobs',
           'lei','bop','zinc','orange','pet-chem','dlr','gas','silver','wpi','hog','lead']

# Number of documents per class label in each split
train_count = collections.Counter(train_labels)
test_count = collections.Counter(test_labels)

# Mean entry length of the training documents, one value per class index
flat_train_labels = train_labels.flatten()
total_words = []
for class_idx in range(46):
    doc_lengths = [len(doc) for doc in train_data[flat_train_labels == class_idx]]
    total_words.append(statistics.mean(doc_lengths))

# Two header lines followed by one row per class
header_top = "{:5s} {:20s} {:5s} {:5s}  {:7s}"
header_bottom = "{:5s} {:20s} {:5s}  {:5s} {:7s}"
row_format = "{:5d} {:20s} {:5d} {:5d}   {:6.2f}"

print(header_top.format(" ", " ", "Nr of", "docs", "Mean nr of words"))
print(header_bottom.format("Index", "Class name", "train", "test", "in train set"))
for i in range(46):
    print(row_format.format(i, mapping[i], train_count[i], test_count[i], total_words[i]))