In [1]:
'''This script loads pre-trained word embeddings (the 300-dimensional
vectors stored in sgns.sogou.word.txt) into a frozen Keras Embedding layer,
and uses it to train a 1D-convnet text classification model on the labelled
texts in train.csv.

The code appears to be adapted from the Keras example that pairs GloVe
embeddings with the 20 Newsgroup dataset (classification of newsgroup
messages into 20 different categories); some names, such as GLOVE_DIR,
still reflect that origin.
GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)
20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

import csv
import os
import sys

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant


# Root folder for all input files; '' resolves to the current working directory.
BASE_DIR = ''
# Folder that contains the pre-trained word-vector file.
GLOVE_DIR = os.path.join(BASE_DIR)
# NOTE: despite the name, this is the path of the CSV file itself
# (it is passed straight to open()), not a directory.
TEXT_DATA_DIR = os.path.join(BASE_DIR, 'train.csv')
# Every token sequence is padded/truncated to this length (pad_sequences maxlen).
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap passed to the Tokenizer; also bounds the embedding matrix rows.
MAX_NUM_WORDS = 20000
# Width of one word vector; must match the vectors in the embedding file.
EMBEDDING_DIM = 300
# Fraction of the samples held out for validation.
VALIDATION_SPLIT = 0.2

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [30]:
# First, build an index mapping each word of the embedding file
# to its embedding vector.

print('Indexing word vectors.')

# embeddings_index: key is the word, value is its EMBEDDING_DIM-dimensional
# float32 vector.
embeddings_index = {}
skipped_lines = 0
# Decode explicitly as UTF-8 so the non-ASCII words are read the same way on
# every platform instead of depending on the locale's default encoding.
# NOTE(review): assumes the vector file is UTF-8 encoded - confirm.
with open(os.path.join(GLOVE_DIR, 'sgns.sogou.word.txt'),
          encoding='utf-8') as f:
    for line in f:
        values = line.split()
        # A valid record is one word followed by exactly EMBEDDING_DIM numbers.
        # Anything else (a blank line, or e.g. a word2vec-style
        # "<count> <dim>" header) is skipped; otherwise it would be stored as
        # a vector of the wrong length.
        if len(values) != EMBEDDING_DIM + 1:
            skipped_lines += 1
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))
if skipped_lines:
    print('Skipped %s malformed lines.' % skipped_lines)

Indexing word vectors.
Found 364181 word vectors.


In [31]:
# Second, read the text samples and their integer labels from the CSV file.
# The first column is taken as the text and the last column as the label id.
print('Processing text dataset')
texts = []  # list of text samples
labels = []  # list of label ids
# newline='' is what the csv module expects from the file object, and
# 'utf-8-sig' transparently drops a leading BOM so the header check still works.
# NOTE(review): assumes train.csv is UTF-8 encoded - confirm.
with open(TEXT_DATA_DIR, newline='', encoding='utf-8-sig') as f:
    # csv.reader handles quoted fields, so a text containing commas stays in
    # one column instead of being cut at its first comma.
    for row in csv.reader(f):
        if not row:
            # Blank line (e.g. a trailing newline at the end of the file).
            continue
        if row[0] == 'text':
            # Header row.
            continue
        texts.append(row[0])
        labels.append(int(row[-1]))
print('Found %s texts.' % len(texts))

Processing text dataset
Found 120 texts.


In [33]:
# Sanity check: show the integer label ids parsed from the CSV file.
print(labels)

[5, 2, 2, 2, 3, 3, 5, 4, 2, 5, 1, 6, 2, 4, 5, 2, 4, 3, 2, 3, 6, 2, 1, 6, 2, 2, 1, 1, 5, 2, 4, 1, 5, 2, 6, 3, 4, 1, 2, 5, 6, 6, 4, 2, 6, 5, 6, 1, 2, 6, 5, 4, 2, 6, 1, 3, 6, 3, 1, 3, 5, 4, 2, 4, 6, 1, 1, 3, 6, 3, 6, 4, 4, 6, 6, 3, 2, 1, 1, 3, 5, 4, 3, 5, 5, 3, 6, 5, 5, 6, 6, 5, 5, 6, 1, 4, 1, 3, 4, 4, 4, 4, 4, 3, 3, 1, 5, 3, 1, 5, 5, 3, 1, 2, 2, 1, 4, 1, 4, 3]


In [34]:
# Finally, vectorize the text samples into a 2D integer tensor.
# NOTE: this cell rebinds `labels` (to a one-hot matrix) and `data` (to the
# shuffled order), so re-run the dataset-loading cell before re-running it.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.asarray(labels))

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# Split the data into a training set and a validation set.
# A dedicated, seeded generator makes the shuffle (and therefore the split)
# reproducible without touching NumPy's global random state.
split_rng = np.random.RandomState(42)
indices = np.arange(data.shape[0])
split_rng.shuffle(indices)
data = data[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# Slice with a positive boundary: `data[:-n]` would be empty (and `data[-n:]`
# the whole array) when n == 0, i.e. for very small datasets.
num_training_samples = data.shape[0] - num_validation_samples

x_train = data[:num_training_samples]
y_train = labels[:num_training_samples]
x_val = data[num_training_samples:]
y_val = labels[num_training_samples:]
print(x_train.shape)

Found 2615 unique tokens.
Shape of data tensor: (120, 1000)
Shape of label tensor: (120, 7)
(96, 1000)


In [36]:
print('Preparing embedding matrix.')

# Prepare the embedding matrix: row i holds the pre-trained vector of the word
# whose tokenizer index is i. The `+ 1` leaves room for index 0, which is not
# assigned to any word by the Keras Tokenizer.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words not found in the embedding index keep an all-zeros row.
    # Vectors of the wrong length are skipped as well: NumPy would otherwise
    # silently broadcast e.g. a 1-element vector across the whole row.
    if (embedding_vector is not None
            and embedding_vector.shape == (EMBEDDING_DIM,)):
        embedding_matrix[i] = embedding_vector

# Load the pre-trained word embeddings into an Embedding layer.
# Note that trainable=False keeps the embeddings fixed during training.
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Preparing embedding matrix.


In [41]:
print('Training model.')

# Size the output layer from the one-hot label tensor instead of hard-coding
# it, so the model keeps matching the data if the set of classes changes.
num_classes = y_train.shape[1]

# Train a 1D convnet with global max pooling.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
print(sequence_input.shape)
embedded_sequences = embedding_layer(sequence_input)
print(embedded_sequences.shape)

# Each sample enters the convnet as a MAX_SEQUENCE_LENGTH x EMBEDDING_DIM
# matrix (1000 tokens, 300 dimensions per token with the current constants).
# The shapes in the trailing comments are for those constants.
hidden_layers = [
    # 128 filters of width 5 spanning all embedding channels:
    # 1000 x 300 -> 996 (1000 - 5 + 1) x 128
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),                    # 996 // 5 -> 199 x 128
    Conv1D(128, 5, activation='relu'),  # 199 - 5 + 1 -> 195 x 128
    MaxPooling1D(5),                    # 195 // 5 -> 39 x 128
    Conv1D(128, 5, activation='relu'),  # 39 - 5 + 1 -> 35 x 128
    GlobalMaxPooling1D(),               # max over the time axis -> 128
    Dense(128, activation='relu'),
]
x = embedded_sequences
for layer in hidden_layers:
    x = layer(x)
    # Print the tensor shape after every layer to follow the size reduction.
    print(x.shape)
preds = Dense(num_classes, activation='softmax')(x)
print(preds.shape)

model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

model.fit(x_train, y_train,
          batch_size=8,
          epochs=100,
          validation_data=(x_val, y_val))


Training model.
(?, 1000)
(?, 1000, 300)
(?, 996, 128)
(?, 199, 128)
(?, 195, 128)
(?, 39, 128)
(?, 35, 128)
(?, 128)
(?, 128)
(?, 7)
Train on 96 samples, validate on 24 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100

KeyboardInterrupt: 