https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

https://github.com/fchollet/keras/blob/master/examples/pretrained_word_embeddings.py

In [7]:
from __future__ import print_function
import os,sys
import numpy as np
from tqdm import tqdm
np.random.seed(1337)

In [8]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

In [9]:
# Filesystem layout: both the GloVe vectors and the corpus live under BASE_DIR.
BASE_DIR = 'data'
GLOVE_DIR = BASE_DIR + '/glove.6B/'
TEXT_DATA_DIR = BASE_DIR + '/20_newsgroup/'
# Preprocessing / model settings used by the cells below.
MAX_SEQ_LEN = 1000 # length passed to pad_sequences and to the Embedding layer
MAX_NB_WORDS = 20000 # top 20k most freq words
EMBEDDING_DIM = 100 # width of one embedding vector (the glove.6B.100d file is loaded)
VALIDATION_SPLIT = 0.2 # fraction of the samples held out for validation

## Build index mapping

In [10]:
# Parse the GloVe text file into a {word: vector} lookup table.
# Every non-empty line is "<word> <v1> <v2> ..." separated by whitespace.
embedding_index = {}  # maps each word to its embedding vector
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in tqdm(f):
        vals = line.split()
        if not vals:
            # A blank line (e.g. a trailing newline at the end of the file)
            # has no word to index; skip it instead of failing on vals[0].
            continue
        word = vals[0]
        # float32 needs half the memory of the default 'float' (float64),
        # which matters for a table with hundreds of thousands of rows.
        embedding_index[word] = np.asarray(vals[1:], dtype='float32')
print('found %d word vectors.' % len(embedding_index))

400000it [00:07, 56653.86it/s]

found 400000 word vectors.





## Prepare text samples

In [12]:
# Walk TEXT_DATA_DIR: every sub-directory is one category, and every file in
# it whose name is all digits is one document of that category.
texts = []  # text bodies
labels_index = {}  # maps label name to label id
labels = []  # label ids, parallel to `texts`

# On Python 3 open() decodes with the locale's default encoding and may raise
# UnicodeDecodeError on bytes that are invalid in it.  latin-1 maps every
# byte to a character, so decoding with it can never fail.  Python 2's open()
# takes no `encoding` argument and returns raw bytes, so nothing is passed.
open_kwargs = {} if sys.version_info < (3,) else {'encoding': 'latin-1'}

for categ_name in tqdm(sorted(os.listdir(TEXT_DATA_DIR))):
    path = os.path.join(TEXT_DATA_DIR, categ_name)
    if not os.path.isdir(path):
        # Stray files next to the category folders are not categories; they
        # must neither consume a label id nor be passed to os.listdir().
        continue
    label_id = len(labels_index)
    labels_index[categ_name] = label_id
    for fname in sorted(os.listdir(path)):
        if fname.isdigit():
            fpath = os.path.join(path, fname)
            with open(fpath, **open_kwargs) as f:
                texts.append(f.read())
            labels.append(label_id)
print('found %d texts' % len(texts))

100%|██████████| 20/20 [00:00<00:00, 87.74it/s]

found 19997 texts





## vectorize text into 2d integer tensor

In [13]:
# Fit a tokenizer on the whole corpus and convert every document to a list
# of integer word ids.  nb_words=MAX_NB_WORDS asks the tokenizer to restrict
# itself to the most frequent words (exact cut-off: see the Keras Tokenizer
# documentation -- TODO confirm).
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
seqs = tokenizer.texts_to_sequences(texts) # turn article into list of ids
word_index = tokenizer.word_index # maps word to id

In [14]:
print(word_index['play'], word_index['plays']) # no stemming has been applied
# That is fine, because the GloVe vocabulary has not been stemmed either:
# print(embedding_index['play'], embedding_index['plays'])

665 3990


In [15]:
# Compare the size of the tokenizer's word index with the requested cap.
print('found %s unique tokens.'%len(word_index))
# NOTE: word_index can be far larger than MAX_NB_WORDS (214909 vs 20000 in
# the recorded run).  Presumably the nb_words cap is applied only when texts
# are converted to id sequences, not to word_index itself -- confirm against
# the Keras Tokenizer documentation.
print(MAX_NB_WORDS)

found 214909 unique tokens.
20000


In [16]:
# Bring every sequence to exactly MAX_SEQ_LEN ids (maxlen is the fixed target
# length, not the length of the longest sequence), so that `data` is a
# rectangular 2D integer array of shape (num_texts, MAX_SEQ_LEN).
data = pad_sequences(sequences=seqs, maxlen=MAX_SEQ_LEN)

# one-hot encoding for labels
labels = to_categorical(np.asarray(labels))
print('shape of data tensor:', data.shape)
print('shape of label tensor:', labels.shape)

shape of data tensor: (19997, 1000)
shape of label tensor: (19997, 20)


In [17]:
# Split data: shuffle samples and labels with one shared permutation, then
# hold out the last VALIDATION_SPLIT fraction of the samples for validation.
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data, labels = data[indices], labels[indices]
validset_sz = int(VALIDATION_SPLIT*data.shape[0])
# Slice at a non-negative boundary.  The negative-index form is wrong when
# validset_sz is 0 (tiny dataset or VALIDATION_SPLIT == 0): `a[:-0]` is empty
# and `a[-0:]` is the whole array, which would swap the two sets.
trainset_sz = data.shape[0] - validset_sz
X_train, Y_train = data[:trainset_sz], labels[:trainset_sz]
X_val, Y_val = data[trainset_sz:], labels[trainset_sz:]

In [18]:
# Embedding matrix: row i holds the GloVe vector of the word whose tokenizer
# id is i.  Rows for words that have no GloVe vector stay all-zero.
nb_words = min(len(word_index), MAX_NB_WORDS)
embedding_matrix = np.zeros(shape=(nb_words + 1, EMBEDDING_DIM))
for word, i in tqdm(word_index.items()):
    # Ids above MAX_NB_WORDS are skipped (in the seqs we only keep the most
    # frequent 20k words).
    if i <= MAX_NB_WORDS:
        embedding_vec = embedding_index.get(word)
        if embedding_vec is not None:
            embedding_matrix[i] = embedding_vec

100%|██████████| 214909/214909 [00:00<00:00, 2062388.44it/s]


## training model

In [20]:
# Build the classifier with the functional API: each layer object is called
# on the output tensor of the previous layer.
sequence_input = Input(shape=(MAX_SEQ_LEN,), dtype='int32') # instantiate a Keras tensor

# Embedding layer initialised from the pre-computed embedding_matrix.
embedding_layer = Embedding(input_dim=nb_words+1,output_dim=EMBEDDING_DIM,
          weights=[embedding_matrix],
          input_length=MAX_SEQ_LEN, # Length of input sequences
          trainable=False #keep the embeddings fixed
         )
embedded_seqs = embedding_layer(sequence_input)

# Two convolution + max-pooling stages over the embedded sequence.
x = Conv1D(128, 5, activation='relu')(embedded_seqs)
x = MaxPooling1D(5)(x)

x = Conv1D(128, 5, activation='relu')(x)
x = MaxPooling1D(5)(x)

# Flatten() creates a layer object; calling that object on `x` applies it.
# This is the functional-API counterpart of `model.add(Flatten())` in the
# Sequential version further down.
x = Flatten()(x) 

x = Dense(128, activation='relu')(x)

# Output layer: one softmax unit per category in labels_index.
preds = Dense( output_dim=len(labels_index), activation='softmax' )(x)

# A Model is defined by its input tensor and its output tensor.
model = Model(input=sequence_input, output=preds)

In [21]:
# Configure training: categorical cross-entropy matches the one-hot labels
# produced by to_categorical; accuracy ('acc') is reported as a metric.
model.compile(loss='categorical_crossentropy',
             optimizer='rmsprop',
             metrics=['acc'])

In [27]:
# Train for 2 epochs with batches of 128 samples; (X_val, Y_val) is passed as
# the validation set.
model.fit(X_train, Y_train, validation_data=(X_val, Y_val),
         nb_epoch=2, batch_size=128)
# Disabled: evaluate the trained model on the validation set.
# model.evaluate(X_val, Y_val)

Train on 15998 samples, validate on 3999 samples
Epoch 1/2
Epoch 2/2


[0.16258775247770954, 0.94173543363489132]

## use `Sequential` instead of `Model`

之前用的是`Model`的方式构造, 需要指定input和output, 然后层与层之间用: 

    next_layer_out = layer_type(layer_param)(last_layer_out)

的方式进行连接. 这种由一堆layer依次堆叠而成的模型就是sequential模型('linear stack of layers').

另外这种模型完全可以用`Sequential()`代替, 只要不断`add`即可:

    model = Sequential()
    model.add( layer_type(layer_param) ) # the 1st layer needs an input_dim param
    model.add( layer_type(layer_param) )
    ...

In [28]:
from keras.models import Sequential

In [29]:
# The same network as the functional `model`, built by adding layers to a
# Sequential container one at a time.
model2 = Sequential()

model2.add(embedding_layer)
model2.add(Conv1D(128, 5, activation='relu'))
model2.add(MaxPooling1D(5))
model2.add(Conv1D(128, 5, activation='relu'))
model2.add(MaxPooling1D(5))
model2.add(Flatten())
# Hidden dense layer: the functional `model` has it between Flatten and the
# softmax output, so it is required here for the two architectures to match.
model2.add(Dense(128, activation='relu'))
model2.add(Dense(output_dim=len(labels_index), activation='softmax'))

In [30]:
# Same training configuration as the functional `model`.
model2.compile(loss='categorical_crossentropy',
             optimizer='rmsprop',
             metrics=['acc'])

In [32]:
# Train the Sequential model with the same data, epoch count and batch size
# as the functional `model`.
model2.fit(X_train, Y_train, validation_data=(X_val, Y_val),
         nb_epoch=2, batch_size=128)

Train on 15998 samples, validate on 3999 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff296947a10>

以上用`add`的写法还有更简单的形式: 直接在`Sequential()`里传入一个list, 里面是要添加的layer. 

    model = Sequential([
        layer_type(layer_param), # the 1st layer needs an input_dim param
        layer_type(layer_param),
        ...
        ])

In [40]:
# The same network again, this time passing the full list of layers to the
# Sequential constructor instead of calling add() repeatedly.
model3 = Sequential([
        embedding_layer,
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Conv1D(128, 5, activation='relu'),
        MaxPooling1D(5),
        Flatten(),
        # Hidden dense layer: the functional `model` has it between Flatten
        # and the softmax output, so it is required here for the
        # architectures to match.
        Dense(128, activation='relu'),
        Dense(len(labels_index), activation='softmax')
    ])
# Same training configuration and schedule as the functional `model`.
model3.compile(loss='categorical_crossentropy',
             optimizer='rmsprop',
             metrics=['acc'])
model3.fit(X_train, Y_train, validation_data=(X_val, Y_val),
         nb_epoch=2, batch_size=128)

Train on 15998 samples, validate on 3999 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ff2960c4c50>