In [1]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classification of newsgroup messages into 20 different categories).
GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)
20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


# --- Filesystem layout -------------------------------------------------
# An empty BASE_DIR makes every path below relative to the working directory.
BASE_DIR = ''
# Directory expected to contain glove.6B.100d.txt.
GLOVE_DIR = os.path.join(BASE_DIR)
# Root of the 20 Newsgroup corpus (one sub-directory per newsgroup).
TEXT_DATA_DIR = os.path.join(BASE_DIR, '20_newsgroup')

# --- Preprocessing / model hyperparameters -----------------------------
EMBEDDING_DIM = 100         # width of each word vector (matches the 100d GloVe file)
MAX_NUM_WORDS = 20000       # vocabulary cap handed to the Tokenizer
MAX_SEQUENCE_LENGTH = 1000  # every document is padded/truncated to this many tokens
VALIDATION_SPLIT = 0.2      # fraction of samples held out for validation

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

# embeddings_index: key is the word, value is its 100-dimensional GloVe
# vector stored as a float32 numpy array.
embeddings_index = {}

# The GloVe text file is UTF-8. Without an explicit encoding, Python 3 decodes
# with the platform's locale encoding, which fails (or silently mis-decodes
# non-ASCII tokens) on systems whose default is not UTF-8. Python 2's built-in
# open() has no `encoding` parameter, so the keyword is only passed on
# Python 3 -- the same version guard the text-loading cell uses.
glove_open_args = {} if sys.version_info < (3,) else {'encoding': 'utf-8'}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), **glove_open_args) as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <v100>", whitespace separated.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [3]:
# second, prepare text samples and their labels
print('Processing text dataset')
texts = []  # message bodies, one string per post
labels_index = {}  # newsgroup directory name -> integer class id
labels = []  # class id of each entry in `texts`, in the same order

# Python 2's open() accepts no encoding keyword; on Python 3 the posts are
# decoded as latin-1.
open_kwargs = {'encoding': 'latin-1'} if sys.version_info >= (3,) else {}

for group_name in sorted(os.listdir(TEXT_DATA_DIR)):
    group_dir = os.path.join(TEXT_DATA_DIR, group_name)
    if not os.path.isdir(group_dir):
        continue
    # Class ids are handed out in sorted directory order, starting at 0.
    group_id = len(labels_index)
    labels_index[group_name] = group_id
    for post_name in sorted(os.listdir(group_dir)):
        # Only files whose names are purely numeric are treated as posts.
        if not post_name.isdigit():
            continue
        with open(os.path.join(group_dir, post_name), **open_kwargs) as post_file:
            body = post_file.read()
        header_end = body.find('\n\n')  # skip header
        if header_end > 0:
            body = body[header_end:]
        texts.append(body)
        labels.append(group_id)

print('Found %s texts.' % len(texts))
print(texts[:2])
print(len(labels_index))
print(labels_index.items())
print(len(labels))
print(labels[:1002])

Processing text dataset
Found 19997 texts.
['\n\nArchive-name: atheism/resources\nAlt-atheism-archive-name: resources\nLast-modified: 11 December 1992\nVersion: 1.0\n\n                              Atheist Resources\n\n                      Addresses of Atheist Organizations\n\n                                     USA\n\nFREEDOM FROM RELIGION FOUNDATION\n\nDarwin fish bumper stickers and assorted other atheist paraphernalia are\navailable from the Freedom From Religion Foundation in the US.\n\nWrite to:  FFRF, P.O. Box 750, Madison, WI 53701.\nTelephone: (608) 256-8900\n\nEVOLUTION DESIGNS\n\nEvolution Designs sell the "Darwin fish".  It\'s a fish symbol, like the ones\nChristians stick on their cars, but with feet and the word "Darwin" written\ninside.  The deluxe moulded 3D plastic fish is $4.95 postpaid in the US.\n\nWrite to:  Evolution Designs, 7119 Laurel Canyon #4, North Hollywood,\n           CA 91605.\n\nPeople in the San Francisco Bay area can get Darwin Fish from Lynn Gold -

In [4]:
# Display the running interpreter version. The text-loading cell branches on
# this value to decide whether an `encoding` keyword is passed to open().
sys.version_info

sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)

In [5]:
# finally, vectorize the text samples into a 2D integer tensor
#
# NOTE: this cell rebinds `labels` (list of class ids -> one-hot array) and
# reorders `data`/`labels`. Re-running it on its own would one-hot encode the
# already encoded labels; re-run the text-loading cell first.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
print(sequences[:2])
print(len(sequences[0]))
print(len(sequences[1]))

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad/truncate every sequence to exactly MAX_SEQUENCE_LENGTH token ids.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
print(data[:2])
print(len(data[0]))
print(len(data[1]))

print(np.asarray(labels).shape)
# One-hot encode the integer class ids.
labels = to_categorical(np.asarray(labels))
print(labels.shape)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
print(indices[:2])
# Shuffle with a dedicated, seeded generator so the train/validation split is
# identical on every run and numpy's global random state is left untouched.
SPLIT_SEED = 42
np.random.RandomState(SPLIT_SEED).shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

# Slice with an explicit split index. Negative slicing breaks when
# num_validation_samples == 0: data[:-0] is empty and data[-0:] is the whole
# array, which would swap the roles of the two sets.
split_at = data.shape[0] - num_validation_samples
x_train = data[:split_at]
y_train = labels[:split_at]
x_val = data[split_at:]
y_val = labels[split_at:]
print(x_train.shape)

[[1237, 273, 1213, 1439, 1071, 1213, 1237, 273, 1439, 192, 2515, 348, 2964, 779, 332, 28, 45, 1628, 1439, 2516, 3, 1628, 2144, 780, 937, 29, 441, 2770, 8854, 4601, 7969, 11979, 5, 12806, 75, 1628, 19, 229, 29, 1, 937, 29, 441, 2770, 6, 1, 118, 558, 2, 90, 106, 482, 3979, 6602, 5375, 1871, 12260, 1632, 17687, 1828, 5101, 1828, 5101, 788, 1, 8854, 4601, 96, 4, 4601, 5455, 64, 1, 751, 563, 1716, 15, 71, 844, 24, 20, 1971, 5, 1, 389, 8854, 744, 1023, 1, 7762, 1300, 2912, 4601, 8, 73, 1698, 6, 1, 118, 558, 2, 1828, 5101, 16500, 13447, 73, 1261, 10982, 170, 66, 6, 1, 869, 2235, 2544, 534, 34, 79, 8854, 4601, 29, 6603, 3388, 264, 1505, 535, 49, 12, 343, 66, 60, 155, 2, 6603, 1043, 1, 427, 8, 73, 1698, 618, 4601, 417, 1628, 632, 11716, 4602, 814, 1628, 691, 3, 1, 467, 2163, 3, 2266, 7491, 5, 48, 15, 40, 135, 378, 8, 1, 467, 6359, 30, 101, 90, 1781, 5, 115, 101, 417, 1628, 632, 17061, 1448, 4317, 45, 860, 73, 1611, 2455, 3343, 467, 7491, 13132, 5814, 1301, 1781, 1, 467, 9477, 667, 11716, 323, 1

[[  58  576    3 ...    4  930 2050]
 [ 221   31  972 ... 2932  552  324]]
1000
1000
(19997,)
(19997, 20)
Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)
[0 1]
(15998, 1000)


In [6]:
print('Preparing embedding matrix.')

# One row per token id; rows start as zeros and are overwritten only for
# tokens that have a pre-trained vector, so unknown words stay all-zeros.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for token, token_id in word_index.items():
    # Ids at or beyond the vocabulary cap have no row in the matrix.
    if token_id < MAX_NUM_WORDS:
        pretrained = embeddings_index.get(token)
        if pretrained is not None:
            embedding_matrix[token_id] = pretrained

# Wrap the matrix in an Embedding layer. trainable=False keeps the
# pre-trained vectors fixed during training.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

Preparing embedding matrix.


In [10]:
print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
print(sequence_input.shape)
embedded_sequences = embedding_layer(sequence_input)
print(embedded_sequences.shape)

# Each sample (one article) is a 1000 x 100 matrix: 1000 words, each
# represented by a 100-dimensional vector.
# Every Conv1D applies 128 filters spanning 5 consecutive words, so the first
# one maps 1000 x 100 -> 996 x 128 (1000 - 5 + 1); the following
# MaxPooling1D(5) shrinks the length to 996 // 5 = 199.
x = embedded_sequences
for stage in range(2):
    x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
    print(x.shape)
    x = MaxPooling1D(pool_size=5)(x)
    print(x.shape)

# Last convolution, then collapse the sequence axis to one vector per sample.
x = Conv1D(filters=128, kernel_size=5, activation='relu')(x)
print(x.shape)
x = GlobalMaxPooling1D()(x)
print(x.shape)

# Classification head: hidden dense layer, then softmax over the newsgroups.
x = Dense(128, activation='relu')(x)
print(x.shape)
preds = Dense(len(labels_index), activation='softmax')(x)
print(preds.shape)

model = Model(sequence_input, preds)
model.compile(
    loss='categorical_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model.fit(
    x=x_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(x_val, y_val),
)


Training model.
(?, 1000)
(?, 1000, 100)
(?, 996, 128)
(?, 199, 128)
(?, 195, 128)
(?, 39, 128)
(?, 35, 128)
(?, 128)
(?, 128)
(?, 20)
Train on 15998 samples, validate on 3999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0xb231d1908>