In [1]:
import os

imdb_dir = '/home/steve/PycharmProjects/AI_Learning/Python深度学习/data/aclImdb/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname[-4:] == '.txt':
            f = open(os.path.join(dir_name, fname))
            texts.append(f.read())
            f.close()
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)


In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

max_len = 100  # 在100个单词后截断评论
training_samples = 200  # 在200个样本上训练
validation_samples = 10000  # 在10000个样本上进行验证
max_words = 10000  # 只考虑数据集中前10000个最常见的单词

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=max_len)

labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:training_samples]
y_train = labels[:training_samples]
x_val = data[training_samples:training_samples + validation_samples]
y_val = labels[training_samples:training_samples + validation_samples]


Found 88582 unique tokens.


Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [4]:
glove_dir = '/home/steve/PycharmProjects/AI_Learning/Python深度学习/data/glove.6B'

embedding_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embedding_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embedding_index))


Found 400000 word vectors.


In [12]:
word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'br': 7,
 'in': 8,
 'it': 9,
 'i': 10,
 'this': 11,
 'that': 12,
 'was': 13,
 'as': 14,
 'for': 15,
 'with': 16,
 'movie': 17,
 'but': 18,
 'film': 19,
 'on': 20,
 'not': 21,
 'you': 22,
 'are': 23,
 'his': 24,
 'have': 25,
 'he': 26,
 'be': 27,
 'one': 28,
 'all': 29,
 'at': 30,
 'by': 31,
 'an': 32,
 'they': 33,
 'who': 34,
 'so': 35,
 'from': 36,
 'like': 37,
 'her': 38,
 'or': 39,
 'just': 40,
 'about': 41,
 "it's": 42,
 'out': 43,
 'has': 44,
 'if': 45,
 'some': 46,
 'there': 47,
 'what': 48,
 'good': 49,
 'more': 50,
 'when': 51,
 'very': 52,
 'up': 53,
 'no': 54,
 'time': 55,
 'she': 56,
 'even': 57,
 'my': 58,
 'would': 59,
 'which': 60,
 'only': 61,
 'story': 62,
 'really': 63,
 'see': 64,
 'their': 65,
 'had': 66,
 'can': 67,
 'were': 68,
 'me': 69,
 'well': 70,
 'than': 71,
 'we': 72,
 'much': 73,
 'been': 74,
 'bad': 75,
 'get': 76,
 'will': 77,
 'do': 78,
 'also': 79,
 'into': 80,
 'people': 81,
 'other': 82,
 '

In [5]:
len(embedding_index['the'])
# 即词向量的维度
embedding_dim = 100

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [6]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487001,  0.72812003, ..., -0.1459    ,
         0.82779998,  0.27061999],
       [-0.071953  ,  0.23127   ,  0.023731  , ..., -0.71894997,
         0.86894   ,  0.19539   ],
       ...,
       [ 0.83069998,  0.036287  ,  0.33054   , ..., -0.64990997,
         0.18334   ,  0.27353999],
       [ 1.15460002,  0.74304003,  0.17122   , ...,  0.46252999,
         0.24988   ,  0.27217999],
       [ 0.34391001, -0.32934999, -0.30296001, ...,  0.45280001,
         1.00779998, -0.41260999]])

In [7]:
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 100)          1000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                320032    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 1,320,065
Trainable params: 1,320,065
Non-trainable params: 0
_________________________________________________________________


In [8]:
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False


In [9]:
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train, epochs=10, batch_size=32,
                    validation_data=(x_val, y_val))

model.save_weights('/home/steve/PycharmProjects/AI_Learning/Python深度学习/model/pre_trained_glove_model.h5')


Train on 200 samples, validate on 10000 samples
Epoch 1/10


 32/200 [===>..........................] - ETA: 7s - loss: 0.7892 - acc: 0.5312





Epoch 2/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.6409 - acc: 0.6250



Epoch 3/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.3926 - acc: 0.8750



Epoch 4/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.1839 - acc: 1.0000



Epoch 5/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.1672 - acc: 0.9688



Epoch 6/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.1506 - acc: 1.0000



Epoch 7/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.4370 - acc: 0.7188



Epoch 8/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.0416 - acc: 1.0000



Epoch 9/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.0328 - acc: 1.0000



Epoch 10/10
 32/200 [===>..........................] - ETA: 0s - loss: 0.0225 - acc: 1.0000



In [10]:
test_dir = os.path.join(imdb_dir, 'test')
labels = []
texts = []

for label_type in ['neg', 'pos']:
    dir_name = os.path.join(test_dir, label_type)
    for fname in sorted(os.listdir(dir_name)):
        f = open(os.path.join(dir_name, fname))
        texts.append(f.read())
        f.close()
        if label_type == 'neg':
            labels.append(0)
        else:
            labels.append(1)

sequences = tokenizer.texts_to_sequences(texts)
x_test = pad_sequences(sequences, maxlen=max_len)
y_test = np.asarray(labels)


In [12]:
model.load_weights('/home/steve/PycharmProjects/AI_Learning/Python深度学习/model/pre_trained_glove_model.h5')
model.evaluate(x_test, y_test)


   32/25000 [..............................] - ETA: 2s

 1024/25000 [>.............................] - ETA: 1s

 1760/25000 [=>............................] - ETA: 1s

 2656/25000 [==>...........................] - ETA: 1s

 3744/25000 [===>..........................] - ETA: 1s

 4832/25000 [====>.........................] - ETA: 1s







































[1.1375577805900574, 0.50472]