In [11]:
import os

# Root of the extracted aclImdb archive; only the 'train' split is read here.
imdb_dir = 'IMDB/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

# Walk the 'neg' and 'pos' sub-folders: every .txt file contributes one text
# and one label (0 for 'neg', 1 for 'pos').
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # sorted() makes the load order deterministic; os.listdir order is arbitrary.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # Explicit encoding avoids depending on the platform default
            # (assumes the review files are UTF-8 -- TODO confirm for your copy).
            # The `with` block closes the handle even if read() raises.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)
                



In [12]:
## Tokenize the data
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100  # every review is padded/truncated to 100 tokens
training_samples = 200  # train on 200 samples
validation_samples = 10000  # validate on 10000 samples
max_words = 10000  # only consider the 10000 most frequent words in the dataset
shuffle_seed = 42  # fixed seed so the train/validation split is reproducible

# Fit the vocabulary on the raw texts and turn each text into a list of word indices.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=maxlen)

labels = np.asarray(labels)
print('Shape of data tensor:{}'.format(data.shape))
print('Shape of label tensor:{}'.format(labels.shape))

# The samples were loaded class by class (all 'neg' first, then all 'pos'),
# so shuffle before splitting. A local, seeded generator keeps the split
# identical across runs without touching NumPy's global random state.
rng = np.random.RandomState(shuffle_seed)
indices = rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# First `training_samples` rows for training, the next `validation_samples` for validation.
x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples: training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]


Found 88582 unique tokens.
Shape of data tensor:(25000, 100)
Shape of label tensor:(25000,)


#### 从 https://nlp.stanford.edu/projects/glove/ 下载 glove.6B.zip 并解压到 `IMDB/glove.6B` 目录；其中的 glove.6B.100d.txt 文件包含 400 000 个单词的 100 维嵌入向量。

In [13]:
glove_dir = 'IMDB/glove.6B'

# Map each word to its embedding vector (a float32 NumPy array).
# Each line of the file is parsed as: the word, then its whitespace-separated coefficients.
embeddings_index = {}
# Explicit encoding avoids a UnicodeDecodeError on platforms whose default
# codec is not UTF-8 (assumes the GloVe text file is UTF-8 -- TODO confirm).
# The `with` block closes the file even if parsing a line raises.
with open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found {} word vectors.'.format(len(embeddings_index)))


Found 400000 word vectors.


In [14]:
## Build the GloVe embedding matrix

embedding_dim = 100  # length of each embedding vector

# Row i holds the vector of the word whose tokenizer index is i.
# Rows of words that have no GloVe vector stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector


In [15]:
# Define the model
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# Embedding -> Flatten -> Dense(32, relu) -> Dense(1, sigmoid)
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Load the GloVe matrix into the first layer (the Embedding layer) and freeze it.
embedding_layer = model.layers[0]
embedding_layer.set_weights([embedding_matrix])
embedding_layer.trainable = False



In [17]:
# Compile and train
model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

# Keep the returned History object; its per-epoch metrics are needed for plotting.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_val, y_val),
)

# Persist the trained weights to disk.
model.save_weights('pre_trained_glove_model.h5')


Train on 200 samples, validate on 10000 samples
Epoch 1/10


 32/200 [===>..........................] - ETA: 3s - loss: 0.7437 - acc: 0.5312



Epoch 2/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.5821 - acc: 0.6875



Epoch 3/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.4894 - acc: 0.7188



Epoch 4/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.4153 - acc: 0.9375



Epoch 5/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.2327 - acc: 0.9062



Epoch 6/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.5111 - acc: 0.7188



Epoch 7/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.0949 - acc: 1.0000



Epoch 8/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.0634 - acc: 1.0000



Epoch 9/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.0630 - acc: 1.0000



Epoch 10/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.0369 - acc: 1.0000



In [19]:
## Plot the results
import matplotlib.pyplot as plt

# Per-epoch metrics recorded by model.fit().
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x axis

# Accuracy: dots for training, solid line for validation.
# Explicit figure/axes objects avoid relying on pyplot's implicit "current figure".
fig_acc, ax_acc = plt.subplots()
ax_acc.plot(epochs, acc, 'bo', label='Training acc')
ax_acc.plot(epochs, val_acc, 'b', label='Validation acc')
ax_acc.set_title('Training and validation accuracy')
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy')
ax_acc.legend()

# Loss: same layout, on its own figure.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'bo', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

plt.show()
