In [14]:
# Reference: https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py
from keras.preprocessing import sequence
from keras.models import Sequential, Model, load_model, model_from_yaml
from keras.layers import Dense, Embedding
from keras.layers import LSTM, SpatialDropout1D
from keras.layers import Dense, Dropout, Flatten, Activation
from keras import backend as K
from keras.datasets import imdb
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)
import numpy as np
import pandas as pd

from skater.core.local_interpretation.dnni.deep_interpreter import DeepInterpreter
from skater.core.visualizer.text_relevance_visualizer import build_visual_explainer
from skater.util.dataops import convert_dataframe_to_dict, show_in_notebook

In [2]:
# Keras initializes all of its variables in whichever TensorFlow session is registered
# with the backend, so build one explicitly and hand it over.
sess = tf.Session()
K.set_session(session=sess)

In [3]:
# --- Training schedule ---
n_epoch = 3
batch_size = 32

# --- Vocabulary / sequence limits ---
max_features = 20000  # passed as `num_words` when loading the dataset
maxlen = 80  # cut the texts after this number of words (among top max_features most common words)

### Load the Dataset
#### IMDB dataset: 
##### 1. http://ai.stanford.edu/~amaas//data/sentiment/
##### 2. http://ai.stanford.edu/~ang/papers/acl11-WordVectorsSentimentAnalysis.pdf ( Section 4.1 )

In [4]:
# IMDB sentiment data: 50,000 reviews split evenly (25,000 train / 25,000 test).
# Dataset details: https://keras.io/datasets/#imdb-movie-reviews-sentiment-classification
# INDEX_FROM is the offset added to every word index; get_raw_txt applies the same
# offset when decoding sequences back to text.
INDEX_FROM = 3
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(
    index_from=INDEX_FROM,
    num_words=max_features,
)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

Loading data...
25000 train sequences
25000 test sequences


In [5]:
# https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
# Reading raw text: fetch the IMDB word index so that encoded reviews can be decoded back into words.
# NOTE(review): presumably a dict of word -> integer index (get_raw_txt iterates it via .items()) - confirm.
word_to_id = imdb.get_word_index()

def get_raw_txt(word_id_dict, input_data, index_from=None):
    """Decode a sequence of word ids back into a space-separated string.

    Parameters
    ----------
    word_id_dict : dict
        Mapping of word -> index *before* the offset is applied. It is only read,
        never modified.
    input_data : iterable of int
        Encoded review (word ids, including the reserved ids 0/1/2).
    index_from : int, optional
        Offset that was added to every word index when the data was encoded.
        When omitted, the notebook-level ``INDEX_FROM`` constant is used, which
        keeps existing two-argument calls working unchanged.

    Returns
    -------
    str
        The decoded tokens joined by single spaces. Reserved ids are rendered as
        ``<PAD>`` (0), ``<START>`` (1) and ``<UNK>`` (2). Ids that are not present
        in the mapping are rendered as ``<UNK>`` instead of raising ``KeyError``.
    """
    if index_from is None:
        # Fall back to the offset used when the dataset was loaded in this notebook.
        index_from = INDEX_FROM

    # Build the inverse (id -> word) lookup directly, shifting each index by the offset.
    id_to_word = {idx + index_from: word for word, idx in word_id_dict.items()}
    # Reserved ids are registered last so they always win over shifted word ids.
    id_to_word.update({0: "<PAD>", 1: "<START>", 2: "<UNK>"})

    # Unmapped ids (e.g. the gap between the reserved ids and the first shifted
    # word id) are shown as unknown rather than aborting the decode.
    return ' '.join(id_to_word.get(_id, "<UNK>") for _id in input_data)

# Decode one training review (index 20) to sanity-check the id -> word mapping,
# then report how many space-separated tokens it contains.
r_t = get_raw_txt(word_to_id, x_train[20])
print(r_t, "\n", sep="")
print("Length: {}".format(r_t.count(' ') + 1))

<START> shown in australia as <UNK> this incredibly bad movie is so bad that you become <UNK> and have to watch it to the end just to see if it could get any worse and it does the storyline is so predictable it seems written by a high school dramatics class the sets are pathetic but marginally better than the <UNK> and the acting is wooden br br the infant <UNK> seems to have been stolen from the props cupboard of <UNK> <UNK> there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money

Length: 129


In [6]:
# Bring every review to exactly `maxlen` tokens so both splits become 2-D arrays
# (samples x time), as the shapes printed below confirm.
print('Pad sequences (samples x time)')
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Pad sequences (samples x time)
x_train shape: (25000, 80)
x_test shape: (25000, 80)


In [7]:
# Decode the same training review again, this time from the padded array, to show
# what the model actually receives after the sequence was cut to `maxlen` tokens.
r_t_r = get_raw_txt(word_to_id, x_train[20])
print(r_t_r, "\n", sep="")
print("Length: {}".format(r_t_r.count(' ') + 1))

dramatics class the sets are pathetic but marginally better than the <UNK> and the acting is wooden br br the infant <UNK> seems to have been stolen from the props cupboard of <UNK> <UNK> there didn't seem to be a single original idea in the whole movie br br i found this movie to be so bad that i laughed most of the way through br br malcolm mcdowell should hang his head in shame he obviously needed the money

Length: 80


### Build an LSTM model using word-embeddings

In [16]:
print('Build a model...')
# Embedding -> spatial dropout -> LSTM -> single-unit dense -> sigmoid.
# The Dense layer and the sigmoid Activation are kept as two separate layers on purpose:
# the interpretation cell later reads the pre-activation output via `layers[-2]`.
model = Sequential([
    Embedding(input_dim=max_features, output_dim=128, input_length=maxlen),
    SpatialDropout1D(0.4),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(1),
    Activation('sigmoid'),
])

Build a model...


In [17]:
# Binary sentiment target -> binary cross-entropy loss; accuracy is tracked as the metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

### Train a model

In [18]:
print('Train...')
# The held-out test split is passed as validation data, so the per-epoch validation
# metrics are computed on the test split.
model.fit(
    x=x_train,
    y=y_train,
    epochs=n_epoch,
    batch_size=batch_size,
    validation_data=(x_test, y_test),
)

Train...
Train on 25000 samples, validate on 25000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f6be6141080>

### Persist the model for future use

In [19]:
# The trained model is persisted as two files, both tagged with the epoch count:
#   model_lstm_<n_epoch>.yaml - architecture, serialized as YAML
#   model_lstm_<n_epoch>.h5   - weights, serialized as HDF5
model_yaml = model.to_yaml()
with open("model_lstm_{}.yaml".format(n_epoch), mode="w") as yaml_file:
    yaml_file.write(model_yaml)

model.save_weights("model_lstm_{}.h5".format(n_epoch))
print("Saved model to disk")

Saved model to disk


### Load the saved model

In [20]:
# load the model
# Switch Keras to test mode (learning phase 0) before the graph is rebuilt from disk.
K.set_learning_phase(0)
# Read the architecture with a context manager so the file handle is closed even if
# read() raises.
with open('model_lstm_{}.yaml'.format(n_epoch), 'r') as yaml_file:
    loaded_model_yaml = yaml_file.read()
loaded_model = model_from_yaml(loaded_model_yaml)
# load weights into new model
loaded_model.load_weights("model_lstm_{}.h5".format(n_epoch))
print("Loaded model from disk")

Loaded model from disk


#### Summarize the Model

In [21]:
# Print the layer-by-layer architecture and parameter counts of the reloaded model.
loaded_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 80, 128)           2560000   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 80, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
_________________________________________________________________
activation_3 (Activation)    (None, 1)                 0         
Total params: 2,691,713
Trainable params: 2,691,713
Non-trainable params: 0
_________________________________________________________________


### Evaluating the model's performance (e.g., accuracy)

In [24]:
# A model rebuilt from YAML has to be compiled again before `evaluate` can be called.
# Loss is binary cross-entropy ("score" below); accuracy is the reported metric.
loaded_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Test split first, then train split (same order as the progress output).
score_test, acc_test = loaded_model.evaluate(x_test, y_test, batch_size=batch_size)
score_train, acc_train = loaded_model.evaluate(x_train, y_train, batch_size=batch_size)

print("\n\n")
print('Train score:', score_train)
print('Train accuracy:', acc_train)
print("\n")
print('Test score:', score_test)
print('Test accuracy:', acc_test)



Train score: 0.1477368958902359
Train accuracy: 0.95164


Test score: 0.391812992708683
Test accuracy: 0.83748


### Let's ask Skater to help us interpret the model

In [27]:
# Explain a single test prediction with Skater's DeepInterpreter.
# Learning phase 0 puts Keras in test mode before the model graph is rebuilt.
K.set_learning_phase(0)
with DeepInterpreter(session=K.get_session()) as di:
    print("learning_phase {}".format(K.learning_phase()))
    # The model is re-created from disk inside the interpreter context.
    # NOTE(review): presumably the graph must be built under this context for the
    # relevance computation to work - confirm against the Skater documentation.
    # File names are derived from n_epoch so they always match what the save cell wrote.
    with open('model_lstm_{}.yaml'.format(n_epoch), 'r') as yaml_file:
        loaded_model_yaml = yaml_file.read()

    loaded_model = model_from_yaml(loaded_model_yaml)
    # load weights into new model
    loaded_model.load_weights("model_lstm_{}.h5".format(n_epoch))
    print("Load model from disk")

    # Input data: one test review (index 1) and its label, each wrapped in a batch of size 1
    xs = np.array([x_test[1]])
    ys = np.array([y_test[1]])

    print('Predicted class : {}'.format(loaded_model.predict_classes(xs)))
    print('Ground Truth: {}'.format(ys))

    # First layer is the Embedding: its input takes word ids, its output the embedded sequence.
    embedding_tensor = loaded_model.layers[0].output
    input_tensor = loaded_model.layers[0].input

    # Evaluate the embedding for this review; relevance is attributed to these embedded
    # values rather than to the integer word ids.
    embedding_out = di.session.run(embedding_tensor, {input_tensor: xs})
    # Using Integrated Gradient for computing feature relevance.
    # Target: output of layers[-2] (the Dense layer, i.e. before the sigmoid) scaled by the label.
    # Attribution input: the input tensor of layers[1], the layer that follows the Embedding.
    relevance_scores = di.explain('ig', loaded_model.layers[-2].output * ys,
                                  loaded_model.layers[1].input, embedding_out, use_case='txt')

learning_phase 0
Load model from disk

2018-06-12 05:33:16,184 - IntegratedGradients - INFO - Executing operations to compute relevance using Integrated Gradient



Predicted class : [[1]]
Ground Truth: [1]


In [28]:
# Retrieve the text
# Decode the explained test review (index 1) back into words; note that this rebinds
# `r_t`, which earlier held a decoded training review.
r_t = get_raw_txt(word_to_id, x_test[1])
print(r_t)

as he spouts the one liners out i also like the scenes with <UNK> at the beginning find her very sexy when she's wearing all that fetish gear i can't be the only one surely i personally think bride of chucky is a fantastic film total entertainment from start to finish great humour horror in equal measure at only 85 minutes long it never becomes boring or dull a personal favourite of mine watch it as soon as you can


In [29]:
# Collapse the relevance scores to one scalar per feature (token).
# The relevance is computed over the embedding vector, so each row is averaged
# across its columns to obtain a single coefficient per row.
# Despite the `_df` suffix, the row-wise mean of a DataFrame is a pandas Series.
per_token_relevance = pd.DataFrame(relevance_scores[0])
relevance_scores_df = per_token_relevance.mean(axis="columns")
relevance_scores_df.describe()

count    80.000000
mean      0.000108
std       0.002709
min      -0.011811
25%      -0.000399
50%       0.000355
75%       0.001011
max       0.007102
dtype: float64

#### Visualize the results

In [35]:
# Build Skater's visual explanation from the decoded review text and its per-token relevance scores.
# NOTE(review): judging by the logged output, this saves feature_relevance.png and rendered.html
# in the working directory - confirm against the Skater documentation.
build_visual_explainer(r_t, relevance_scores_df, highlight_oov=True, file_name="rendered",
                       title="GroundTruth: {}\n".format(y_test[1]), enable_plot=True)

2018-06-12 05:36:20,480 - skater.core.visualizer.text_relevance_visualizer - INFO - Rank order feature relevance based on input created and saved as feature_relevance.png
2018-06-12 05:36:20,481 - skater.core.visualizer.text_relevance_visualizer - INFO - Relevance plot name: feature_relevance.png
2018-06-12 05:36:20,502 - skater.core.visualizer.text_relevance_visualizer - INFO - Visual Explainer built, use show_in_notebook to render in Jupyter style Notebooks: rendered.html


In [36]:
# Display the HTML file written by the previous cell inline in the notebook.
show_in_notebook('./rendered.html')

2018-06-12 05:36:21,624 - skater.util.dataops - INFO - File Name: ./rendered.html
