### IMDB Movie reviews sentiment classification
A dataset of IMDB movie reviews labeled by sentiment (positive/negative): 25,000 reviews for training and 25,000 for testing.

Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). For convenience, words are indexed by overall frequency in the dataset, so that for instance the integer "3" encodes the 3rd most frequent word in the data. This allows for quick filtering operations such as: "only consider the top 10,000 most common words, but eliminate the top 20 most common words".

As a convention, "0" does not stand for a specific word, but instead is used to encode any unknown word.

In [1]:
'''
#Train a recurrent convolutional network on the IMDB sentiment classification task.
Gets to 0.8498 test accuracy after 2 epochs. 41 s/epoch on K520 GPU.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.datasets import imdb

Using TensorFlow backend.


In [2]:
# Embedding
max_features = 20000   # vocabulary cap: passed as num_words to imdb.load_data and as the Embedding input size
maxlen = 100           # every review is padded/truncated to this many tokens
embedding_size = 128   # length of each word vector produced by the Embedding layer

# Convolution
kernel_size = 5        # Conv1D window width, in tokens
filters = 64           # number of Conv1D output channels
pool_size = 4          # MaxPooling1D window (and, by default, stride)

# LSTM
lstm_output_size = 70  # number of LSTM units

# Training
batch_size = 30        # also reused by the exercise cells at the end of the notebook
epochs = 2             # also reused by the exercise cells at the end of the notebook

# Note: results are sensitive to batch_size. The author reports that
# 2 epochs are enough here because the dataset is small.

In [41]:
# Load the IMDB reviews, keeping only the `max_features` most frequent words.
# Reviews are variable-length lists of word indexes (for instance
# len(x_train[1]) = 189 and len(x_train[24999]) = 153), which is why the
# shape displayed at the end has a single axis.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), len(x_test), sep='\n')  # number of train / test reviews
x_train.shape

25000
25000


(25000,)

In [42]:
# Show one raw (un-padded) review: a plain list of integer word indexes.
print(x_train[1])

[1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463, 4369, 5012, 134, 26, 4, 715, 8, 118, 1634, 14, 394, 20, 13, 119, 954, 189, 102, 5, 207, 110, 3103, 21, 14, 69, 188, 8, 30, 23, 7, 4, 249, 126, 93, 4, 114, 9, 2300, 1523, 5, 647, 4, 116, 9, 35, 8163, 4, 229, 9, 340, 1322, 4, 118, 9, 4, 130, 4901, 19, 4, 1002, 5, 89, 29, 952, 46, 37, 4, 455, 9, 45, 43, 38, 1543, 1905, 398, 4, 1649, 26, 6853, 5, 163, 11, 3215, 10156, 4, 1153, 9, 194, 775, 7, 8255, 11596, 349, 2637, 148, 605, 15358, 8003, 15, 123, 125, 68, 2, 6853, 15, 349, 165, 4362, 98, 5, 4, 228, 9, 43, 2, 1157, 15, 299, 120, 5, 120, 174, 11, 220, 175, 136, 50, 9, 4373, 228, 8255, 5, 2, 656, 245, 2350, 5, 4, 9837, 131, 152, 491, 18, 2, 32, 7464, 1212, 14, 9, 6, 371, 78, 22, 625, 64, 1382, 9, 8, 168, 145, 23, 4, 1690, 15, 16, 4, 1355, 5, 28, 6, 52, 154, 462, 33, 89, 78, 285, 16, 145, 95]


In [43]:
# Raw reviews differ in length, which is why they are padded further down.
print(len(x_train[24999]))
print(len(x_train[1]))

153
189


In [44]:
# Tokens 90..98 of the raw review 1, kept for comparison with the padded
# version printed later in the notebook.
print(x_train[1][90:99])

[11, 3215, 10156, 4, 1153, 9, 194, 775, 7]


In [45]:
# Bring every review to exactly `maxlen` (= 100) tokens so each split becomes
# a rectangular (samples, maxlen) integer array. With the Keras defaults
# (padding='pre', truncating='pre') longer reviews keep only their LAST
# `maxlen` tokens and shorter ones are zero-filled at the front.
print('Pad sequences (samples x time)')
x_train, x_test = [sequence.pad_sequences(split, maxlen=maxlen)
                   for split in (x_train, x_test)]
print(x_train.shape, x_test.shape, sep='\n')

Pad sequences (samples x time)
(25000, 100)
(25000, 100)


In [46]:
# After padding, review 1 has exactly `maxlen` tokens (it had 189 before).
print(len(x_train[1]))

100


In [47]:
# Padded review 1: it now starts at raw token 89 (value 163), i.e. the first
# 89 of the original 189 tokens were dropped and the last 100 were kept.
print(x_train[1])

[  163    11  3215 10156     4  1153     9   194   775     7  8255 11596
   349  2637   148   605 15358  8003    15   123   125    68     2  6853
    15   349   165  4362    98     5     4   228     9    43     2  1157
    15   299   120     5   120   174    11   220   175   136    50     9
  4373   228  8255     5     2   656   245  2350     5     4  9837   131
   152   491    18     2    32  7464  1212    14     9     6   371    78
    22   625    64  1382     9     8   168   145    23     4  1690    15
    16     4  1355     5    28     6    52   154   462    33    89    78
   285    16   145    95]


In [53]:
# CNN + LSTM sentiment classifier, declared as an ordered layer list.
# Hyper-parameters come from the configuration cell:
#   max_features = 20000, maxlen = 100, embedding_size = 128,
#   kernel_size = 5, filters = 64, pool_size = 4, lstm_output_size = 70.
# Per-sample shapes (as reported by model.summary()):
#   (100,) -> Embedding (100, 128) -> Conv1D (96, 64) -> MaxPooling1D (24, 64)
#   -> LSTM (70,) -> Dense + sigmoid (1,)
model = Sequential([
    # One `embedding_size`-long vector per word index of the padded review.
    Embedding(max_features, embedding_size, input_length=maxlen),
    Dropout(0.25),
    # 'valid' padding with stride 1 shortens the sequence by kernel_size - 1.
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=pool_size),
    LSTM(lstm_output_size),
    # Single unit squashed by a sigmoid: score for the positive class.
    Dense(1),
    Activation('sigmoid'),
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

print('Train...')
# The test split doubles as validation data during training and is then
# evaluated once more below.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2
Test score: 0.33556033623814585
Test accuracy: 0.8554400205612183


In [55]:
# How the parameter counts in the table below come about:
#   Embedding: 2560000 = 20000 * 128             (max_features * embedding_size)
#   Conv1D:      41024 = 64 * (5 * 128 + 1)      (filters * (kernel_size * embedding_size + bias))
#   LSTM:        37800 = 4 * 70 * (64 + 70 + 1)  (4 gates * units * (inputs + units + bias))
#   Dense:          71 = 70 + 1                  (weights + bias)
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 96, 64)            41024     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 24, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 70)                37800     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 71        
_________________________________________________________________
activation_1 (Activation)    (None, 1)                

#### <font color='red'>Question 1: What is the difference between a CNN and an LSTM?</font>

#### <font color='red'>Question 2: Can we use Conv1D layers to build and train a CNN?</font>

### How to Use Word Embedding Layers for Deep Learning with Keras
- Word Embedding
- Keras Embedding Layer

Adapted from https://machinelearningmastery.com/use-word-embedding-layers-deep-learning-keras/

In [118]:
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

In [119]:
# Stand-alone Embedding layer example: a 200-word vocabulary, 32-dimensional
# word vectors, inputs of 50 tokens. `e` is not used by the cells that follow.
e = Embedding(200, 32, input_length=50)

In [120]:
# Ten tiny example "documents" used to demonstrate the Embedding layer.
docs = ['Well done!',
        'Good work',
        'Great effort',
        'nice work',
        'Excellent!',
        'Weak',
        'Poor effort!',
        'not good',
        'poor work',
        'Could have done better.']
# define class labels, aligned one-to-one with `docs`
# (1 = positive for the first five, 0 = negative for the last five).
# Built with `array`, the name this notebook imports from numpy; the previous
# `np.array` raised NameError on a fresh kernel because `np` is never imported.
labels = array([1,1,1,1,1,0,0,0,0,0])
labels.shape

(10,)

In [121]:
# integer encode the documents
# one_hot maps each word to an integer via hashing (see the Keras docs), so
# the mapping is not guaranteed to be unique: two different words may share
# an index.
vocab_size = 100
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# The encoded documents have different lengths (1 to 4 integers here);
# the hashing space (vocab_size) is 100.

[[57, 66], [2, 39], [60, 72], [5, 39], [4], [67], [18, 72], [34, 2], [18, 39], [75, 54, 66, 79]]


In [122]:
# Pad every encoded document to `max_length` integers. padding='post'
# appends the zeros AFTER the real tokens, as the printed matrix shows.
max_length = 6
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# All samples now have the same length (max_length = 6).

[[57 66  0  0  0  0]
 [ 2 39  0  0  0  0]
 [60 72  0  0  0  0]
 [ 5 39  0  0  0  0]
 [ 4  0  0  0  0  0]
 [67  0  0  0  0  0]
 [18 72  0  0  0  0]
 [34  2  0  0  0  0]
 [18 39  0  0  0  0]
 [75 54 66 79  0  0]]


In [123]:
# define the model
model = Sequential()
# vocab_size (100) indexes * 8-dimensional vectors = 800 embedding parameters
model.add(Embedding(vocab_size, 8, input_length = max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
# NOTE: model.summary() prints the table itself and returns None, so the
# surrounding print() adds the trailing "None" line seen in the output.
print(model.summary())
# The output of the Embedding layer is a 6x8 matrix (max_length x 8);
# the Flatten layer unrolls it into a 48-element vector (6 * 8 = 48).

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 6, 8)              800       
_________________________________________________________________
flatten_7 (Flatten)          (None, 48)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 49        
Total params: 849
Trainable params: 849
Non-trainable params: 0
_________________________________________________________________
None


In [124]:
# fit the model
# verbose=0 silences the per-epoch log; the History object returned by fit()
# is what the notebook displays as this cell's output.
model.fit(padded_docs, labels, epochs=50, verbose=0)

<keras.callbacks.callbacks.History at 0x1ce077ba5c8>

In [125]:
# evaluate the model
# NOTE: this scores the same ten documents the model was trained on, so the
# number is a training accuracy, not an estimate of generalization.
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))  # shown as a percentage

Accuracy: 89.999998


In [126]:
# Sigmoid scores for the training documents, one row per document.
predictions = model.predict(padded_docs)

In [129]:
# Scores for the first nine documents (the tenth is not shown); compare them
# with labels[:9] in the next cell.
predictions[:9]

array([[0.5415491 ],
       [0.55799013],
       [0.5300171 ],
       [0.55127287],
       [0.5217832 ],
       [0.4837768 ],
       [0.47924432],
       [0.45817178],
       [0.5027293 ]], dtype=float32)

In [130]:
# True labels for the same nine documents whose scores are displayed above.
labels[:9]

array([1, 1, 1, 1, 1, 0, 0, 0, 0])

### Exercise 1
    Design an LSTM model with the Keras functional API on the Keras Reuters newswire dataset

In [None]:
from keras.preprocessing import sequence
from keras.layers import Conv1D, MaxPooling1D,LSTM,Dense, Dropout, Activation,Input, Embedding
from keras.datasets import imdb
from keras.models import Model

In [None]:
# Load and pad the data for the exercise.
# vocabulary cap, word-vector length, tokens kept per review
max_features, embedding_size, maxlen = 20000, 128, 100

(raw_train, y_train), (raw_test, y_test) = imdb.load_data(num_words=max_features)

# Pad/truncate each split to `maxlen` tokens -> arrays of shape (samples, maxlen).
x_train = sequence.pad_sequences(raw_train, maxlen=maxlen)
x_test = sequence.pad_sequences(raw_test, maxlen=maxlen)

# Left disabled by the author; presumably only needed if the model consumes
# the indexes directly as (samples, maxlen, 1) instead of going through an
# Embedding layer -- confirm before enabling.
#x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
#x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)

print(x_train.shape, x_test.shape, sep='\n')

In [None]:
# Symbolic input for the functional model: one padded review of 100 word
# indexes (100 is the `maxlen` used when padding the data).
inputs_shape = Input(shape = (100,))

###############################
##### insert your code here ###
###############################
# The inserted layers must start from `inputs_shape` and bind the final
# tensor to the name `output`; the Model(...) call below fails with a
# NameError until that is done.

model = Model(inputs=inputs_shape,outputs=output)

model.summary()

In [None]:
# Same loss / optimizer / metric as the Sequential model earlier in the
# notebook. binary_crossentropy presumably expects the exercise model to end
# in a single sigmoid unit as well -- confirm against the layers you insert.
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Only announces training; the actual fit() call lives in the next cell.
print('Train...')

In [None]:
# `batch_size` and `epochs` are not defined in the exercise cells: they come
# from the configuration cell near the top of the notebook (30 and 2), so
# that cell must have been run in the current kernel first.
# The test split is used as validation data, as in the Sequential example.
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))