### Exploring NLP Models with Skater

In this example, we'll train a couple of types of models, and use Skater, LIME, and ipywidgets to interactively explore model behavior.

### Install Deps

In [1]:
# Install the notebook's dependencies with the system pip (each command runs under sudo).
# Only keras is pinned (2.0.6); every other package resolves to whatever version is
# current at install time, so results may differ between environments.
!sudo pip install --upgrade np_utils
!sudo pip install --upgrade theano
!sudo pip install --upgrade tensorflow
!sudo pip install keras==2.0.6
!sudo pip install spacy

Collecting np_utils
  Downloading np_utils-0.5.3.4.tar.gz (56kB)
[K    100% |################################| 61kB 3.2MB/s ta 0:00:011
[?25hRequirement already up-to-date: numpy>=1.0 in /usr/local/lib/python2.7/dist-packages (from np_utils)
Collecting future>=0.16 (from np_utils)
  Downloading future-0.16.0.tar.gz (824kB)
[K    100% |################################| 829kB 1.5MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: np-utils, future
  Running setup.py bdist_wheel for np-utils ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/5d/be/b5/e223535ec3efb733df6afc2518d90d398bbe759e665683b025
  Running setup.py bdist_wheel for future ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/c2/50/7c/0d83b4baac4f63ff7a765bd16390d2ab43c93587fac9d6017a
Successfully built np-utils future
Installing collected packages: future, np-utils
  Found existing installation: future 0.15.2
    Uninstalling future-0.15.2:
      Successfully uninstalled futu

  Found existing installation: tensorflow 1.1.0
    Uninstalling tensorflow-1.1.0:
      Successfully uninstalled tensorflow-1.1.0
  Found existing installation: pbr 3.0.1
    Uninstalling pbr-3.0.1:
      Successfully uninstalled pbr-3.0.1
Successfully installed backports.weakref-1.0.post1 bleach-1.5.0 html5lib-0.9999999 markdown-2.6.9 pbr-3.1.1 protobuf-3.4.0 tensorflow-1.3.0 tensorflow-tensorboard-0.1.8 wheel-0.30.0
Collecting keras==2.0.6
  Downloading Keras-2.0.6.tar.gz (228kB)
[K    100% |################################| 235kB 3.1MB/s ta 0:00:01
Building wheels for collected packages: keras
  Running setup.py bdist_wheel for keras ... [?25ldone
[?25h  Stored in directory: /root/.cache/pip/wheels/c2/80/ba/2beab8c2131e2dcc391ee8a2f55e648af66348115c245e0839
Successfully built keras
Installing collected packages: keras
  Found existing installation: Keras 2.0.8
    Uninstalling Keras-2.0.8:
      Successfully uninstalled Keras-2.0.8
Successfully installed keras-2.0.6


In [3]:
### Restart kernel
# (presumably so that the packages installed by the previous cell are the ones imported - confirm)
from __future__ import absolute_import

# Download spaCy's 'en' model, which the later spacy.load('en') call relies on.
# NOTE(review): the recorded output of this cell shows the command failing with
# "No module named spacy.__main__" - confirm that the installed spaCy version
# supports `python -m spacy download`.
!python -m spacy download en

/usr/bin/python: No module named spacy.__main__; 'spacy' is a package and cannot be directly executed


### Load SpaCy Language Model and Dataset

In [4]:
import spacy
import warnings
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
import numpy as np

# Silence library warnings for the rest of the notebook. This is done before the
# spaCy model is loaded so that warnings raised while loading are hidden as well.
warnings.filterwarnings('ignore')
nlp = spacy.load('en')

# Seed for the train/holdout split. Without it every run draws a different holdout
# set, so the classification reports of the models below are not comparable
# between runs.
SPLIT_SEED = 0

# Documents (docs) and their class labels (y); dataset.target_names holds the
# human-readable class names used in the reports further down.
dataset = fetch_20newsgroups()
docs = dataset.data
y = dataset.target

# Hold out 30% of the documents for evaluation.
docs_train, docs_test, y_train, y_test = train_test_split(
    docs, y, test_size=.3, random_state=SPLIT_SEED)

### Model 1: Pretrained Word Embeddings

We will use SpaCy's pretrained word embeddings as document representations, and feed these representations into a gradient boosting classifier.

In [22]:
#gimme vectors
from sklearn.preprocessing import FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from spacy.tokens.doc import Doc
from sklearn.metrics import classification_report
import six

def doc2vec(x):
    """Map a document, or a collection of documents, to spaCy vectors.

    x: text/bytes string, or a list, tuple or numpy array of documents
        A single string is run through the module-level ``nlp`` model and the
        ``.vector`` of the resulting doc is returned. A collection is converted
        element by element (each element is first coerced to text) and the
        results are stacked into one numpy array.

    Raises ValueError for any other input type.
    """
    text_types = (six.binary_type, six.string_types)
    if isinstance(x, text_types):
        # parse / entity / tag are switched off; only the vector is used here.
        return nlp(x, parse=False, entity=False, tag=False).vector

    # Exact type match on purpose: subclasses of these containers are rejected.
    if type(x) not in (list, tuple, np.ndarray):
        raise ValueError("Unrecognized Input")

    vectors = [doc2vec(six.text_type(item)) for item in x]
    return np.array(vectors)

# Pipeline: raw text -> document vector (transformer) -> class prediction (model).
transformer = FunctionTransformer(func=doc2vec, validate=False)
model = GradientBoostingClassifier(n_estimators=50)
pipeline = make_pipeline(transformer, model)
pipeline.fit(docs_train, y_train)

# Classification report on the holdout documents.
holdout_predictions = pipeline.predict(docs_test)
print(classification_report(y_test,
                            holdout_predictions,
                            target_names=dataset.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.45      0.46      0.46       140
           comp.graphics       0.44      0.35      0.39       181
 comp.os.ms-windows.misc       0.39      0.49      0.44       163
comp.sys.ibm.pc.hardware       0.39      0.32      0.35       184
   comp.sys.mac.hardware       0.38      0.43      0.40       172
          comp.windows.x       0.50      0.53      0.51       186
            misc.forsale       0.64      0.67      0.65       182
               rec.autos       0.64      0.53      0.58       201
         rec.motorcycles       0.49      0.59      0.54       178
      rec.sport.baseball       0.63      0.64      0.63       171
        rec.sport.hockey       0.70      0.67      0.68       170
               sci.crypt       0.63      0.66      0.64       169
         sci.electronics       0.49      0.52      0.51       181
                 sci.med       0.76      0.72      0.74       177
         

### Model 2: CNN 
In this model, we convert text to a list of padded lists of word IDs, to be used in an embedding lookup table. The embeddings will be trained as part of a CNN implemented with Keras.

In [8]:

from sklearn.datasets import fetch_20newsgroups
from spacy.tokens import Doc
import spacy
from spacy.matcher import Matcher
from spacy.attrs import ORTH, IS_PUNCT
from collections import Counter
from functools import partial

class TextProcesser(object):
    """Convert raw documents into fixed-length sequences of integer token IDs.

    A vocabulary is built from a corpus at construction time. Afterwards the
    instance can be called on an iterable of texts and returns a 2-D numpy
    array with one row of ``max_len`` IDs per text.

    IDs 1-4 are reserved markers (padding, out-of-vocabulary, start, end).
    Vocabulary words are numbered from ``END_VAL + 1`` upwards so that a real
    token can never share an ID with one of the markers.
    """

    # Reserved marker IDs. Kept as class attributes so they are available on
    # every instance and can still be read as ``instance.PADDING_VAL`` etc.
    PADDING_VAL = 1
    MISSING_VAL = 2
    START_VAL = 3
    END_VAL = 4

    def __init__(self, corpus, nlp=None, max_len=200, max_vocab_size=20000):
        """
        corpus: list of strings
            Documents used to initialize vocabulary.

        nlp: Spacy language model
            If none then will build one in __init__

        max_len: int
            Maximum length of a document sequence, including the start and end
            markers. Balance information with scale of data.

        max_vocab_size: int or None
            Exclusive upper bound on every ID this object emits (reserved
            markers included), so an embedding table with ``max_vocab_size``
            rows is always large enough. None keeps every word of the corpus.

        """
        self.max_vocab_size = max_vocab_size
        self.max_len = max_len
        self.nlp = nlp or spacy.load('en')
        self.vocab = {}
        self.vocab_counts = Counter()
        self.build_vocab(corpus)

    def pad(self, obj):
        """Wrap a list of IDs in start/end markers and pad or truncate it to max_len."""
        # Two slots are always taken by the start and end markers.
        we_can_take = max(self.max_len - 2, 0)
        n_pads = max(self.max_len - len(obj) - 2, 0)
        result = [self.START_VAL] + obj[:we_can_take] + [self.END_VAL] + [self.PADDING_VAL] * n_pads
        return result

    def get_current_vocab_size(self):
        """Return the number of words in the vocabulary (reserved markers excluded)."""
        return len(self.vocab)

    def update(self, words):
        """Add one occurrence of every item in ``words`` to the running counts."""
        for word in words:
            self.vocab_counts[word] += 1

    def build_vocab(self, corpus):
        """Rebuild the counts and the word -> ID table from ``corpus``."""
        self.vocab = {}
        self.vocab_counts = Counter()

        # Use the instance's own model rather than a module-level ``nlp``, so the
        # model passed to (or created by) __init__ is the one actually used.
        for doc in self.nlp.tokenizer.pipe(map(six.text_type, corpus)):
            self.update(map(self._process_token, doc))

        self._assign_ids()

    def _first_word_id(self):
        """Return the smallest ID that does not collide with a reserved marker."""
        return max(self.PADDING_VAL, self.MISSING_VAL, self.START_VAL, self.END_VAL) + 1

    def _assign_ids(self):
        """Number the most frequent words, starting after the reserved markers."""
        first_id = self._first_word_id()
        if self.max_vocab_size is None:
            n_words = None  # most_common(None) returns every word
        else:
            # Keep every emitted ID strictly below max_vocab_size.
            n_words = max(self.max_vocab_size - first_id, 0)
        ranked = self.vocab_counts.most_common(n_words)
        self.vocab = dict((word, first_id + rank)
                          for rank, (word, _count) in enumerate(ranked))

    def _process_token(self, token):
        """Normalize a token to a vocabulary key: a category label or its lowercase text."""
        if token.is_space:
            return "SPACE"
        elif token.is_punct:
            return "PUNCT"
        elif token.like_url:
            return "URL"
        elif token.like_email:
            return "EMAIL"
        elif token.like_num:
            return "NUM"
        else:
            return token.lower_

    def process_token(self, token):
        """Return the ID of a token, or MISSING_VAL when it is not in the vocabulary."""
        return self.vocab.get(self._process_token(token), self.MISSING_VAL)

    def process(self, texts):
        """Tokenize ``texts`` and return a numpy array of padded ID sequences."""
        docs = []
        for doc in self.nlp.tokenizer.pipe(list(texts)):
            docs.append(self.pad(list(map(self.process_token, doc))))
        return np.array(docs)

    def __call__(self, texts):
        """Alias for process(), so an instance can be used as a plain function."""
        return self.process(texts)
    

In [9]:
#convolutional model: https://arxiv.org/abs/1408.5882
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout,  Input, Dense, Activation, Flatten
from keras.models import Sequential, Model, Sequential
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed
from keras.layers.merge import Concatenate
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.preprocessing import label_binarize


def model_factory(seq_len, 
                  vocab_size, 
                  embedding_size, 
                  n_classes, 
                  model_type='sequential',
                  loss='categorical_crossentropy', 
                  metrics=['acc'], 
                  optimizer='rmsprop'):
    """Return a zero-argument function that builds and compiles a Keras text CNN.

    seq_len: int
        Length of every input sequence of word IDs.
    vocab_size: int
        Number of rows of the embedding table.
    embedding_size: int
        Width of each embedding vector.
    n_classes: int
        Number of units of the softmax output layer.
    model_type: 'sequential' or 'non-sequential'
        Which of the two architectures below to build. Any other value makes
        the returned function raise ValueError when it is called.
    loss, metrics, optimizer:
        Forwarded unchanged to ``model.compile``.
    """

    def create_sequential_model():
        # embedding -> one convolution -> pooling -> softmax, as a layer stack
        layers = [
            Embedding(vocab_size, embedding_size, input_length=seq_len),
            Conv1D(64, 3, strides=1, padding='valid'),
            MaxPooling1D(pool_size=2),
            GlobalMaxPooling1D(),
            Activation('relu'),
            Dense(n_classes, activation='softmax'),
        ]
        return Sequential(layers)

    def create_non_sequential_model():
        token_ids = Input(shape=(seq_len,), dtype='int32')
        hidden = Embedding(vocab_size, embedding_size, input_length=seq_len)(token_ids)

        # 64 filters, each sliding a window (kernel_size) of 3 positions over the
        # embedding vectors, i.e. 3 * embedding_size * 64 kernel weights.
        # padding controls how the sequence boundaries are handled.
        hidden = Conv1D(64, 3, strides=1, padding='valid')(hidden)

        # Halves the sequence length by taking the max over every 2 positions.
        hidden = MaxPooling1D(pool_size=2)(hidden)
        hidden = Conv1D(64, 3, padding='valid')(hidden)
        hidden = GlobalMaxPooling1D()(hidden)
        hidden = Activation('relu')(hidden)
        probabilities = Dense(n_classes, activation='softmax')(hidden)
        return Model(inputs=token_ids, outputs=probabilities)

    def create_model():
        # Reject unknown architectures before any Keras object is created.
        if model_type not in ('sequential', 'non-sequential'):
            raise ValueError("Unrecognized model type {}".format(model_type))

        if model_type == 'sequential':
            network = create_sequential_model()
        else:
            network = create_non_sequential_model()

        network.compile(loss=loss, optimizer=optimizer, metrics=metrics)
        return network

    return create_model
    
# Hyperparameters of the CNN pipeline.
seq_len = 350           # passed to TextProcesser as max_len and to the Embedding layer as input_length
vocab_size = 25000      # number of rows of the embedding table
embedding_size = 300    # width of each embedding vector
epochs = 8
batch_size = 100
n_classes = len(np.unique(y))

# model_factory returns a builder function; KerasClassifier calls it to create the network.
model_build = model_factory(seq_len, vocab_size, embedding_size, n_classes)
model2 = KerasClassifier(build_fn=model_build, epochs=epochs, batch_size=batch_size, verbose=1)
# NOTE(review): TextProcesser is left at its default max_vocab_size (20000), which is
# smaller than vocab_size above - confirm that the difference is intended.
processor = FunctionTransformer(TextProcesser(docs_train, nlp=nlp, max_len=seq_len), validate=False)
pipeline2 = make_pipeline(processor, model2)

# need to one hot encode y labels (the model is compiled with categorical_crossentropy by default)
y2_train = label_binarize(y_train, classes=range(len(np.unique(dataset.target_names))))
pipeline2.fit(docs_train, y2_train)

# make model silent after training: re-apply the current parameters with verbose=0.
# build_fn is left out of the parameters that are set again
# (presumably because set_params does not accept it - TODO confirm).
params = model2.get_params()
params = {key: value for key, value in params.items() if key != 'build_fn'}
params['verbose'] = 0
model2.set_params(**params)

# Model Performance on Holdout
print(
    classification_report(y_test, 
                          pipeline2.predict(docs_test), 
                          target_names=dataset.target_names)
)

Using TensorFlow backend.


Couldn't import dot_parser, loading of dot files will not be possible.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
                          precision    recall  f1-score   support

             alt.atheism       0.95      0.88      0.91       140
           comp.graphics       0.77      0.76      0.76       181
 comp.os.ms-windows.misc       0.74      0.83      0.78       163
comp.sys.ibm.pc.hardware       0.70      0.75      0.72       184
   comp.sys.mac.hardware       0.84      0.81      0.83       172
          comp.windows.x       0.81      0.85      0.83       186
            misc.forsale       0.83      0.80      0.82       182
               rec.autos       0.88      0.82      0.85       201
         rec.motorcycles       0.90      0.85      0.87       178
      rec.sport.baseball       0.96      0.93      0.94       171
        rec.sport.hockey       0.98      0.95      0.96       170
               sci.crypt       0.96      0.93      0.95 

### Model explanations
Here, we'll wrap each pipeline into a Skater model object. We'll use this model object to generate LIME explanations in HTML to help better understand how each model makes predictions. We'll wrap this functionality into an ipywidget to allow the user to (a) modify the text and (b) toggle between models.

In [18]:
## You may need to enable ipywidgets
# Installs ipywidgets (unpinned) and enables the widgets notebook extension, once
# for the current user (--user) and once for the active Python prefix (--sys-prefix).
# NOTE(review): the recorded output of this cell contains notebook password hashes
# from jupyter_notebook_config - clear the output before sharing the notebook.
! pip install ipywidgets
!jupyter nbextension enable --py --user widgetsnbextension
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

Collisions detected in /home/deploy/.jupyter/jupyter_notebook_config.py and /home/deploy/.jupyter/jupyter_notebook_config.json config files. /home/deploy/.jupyter/jupyter_notebook_config.json has higher priority: {
      "NotebookApp": {
        "password": "u'sha1:5dc72084d9fd:d561b0390f93dd2181c7b8cbcf5019eeb710939a' ignored, using u'sha1:f7db36e89abc:5026bfe0dc36d0cafb16e8b814f9d1248ec8bbd6'"
      }
    }
Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m
Collisions detected in /home/deploy/.jupyter/jupyter_notebook_config.py and /home/deploy/.jupyter/jupyter_notebook_config.json config files. /home/deploy/.jupyter/jupyter_notebook_config.json has higher priority: {
      "NotebookApp": {
        "password": "u'sha1:5dc72084d9fd:d561b0390f93dd2181c7b8cbcf5019eeb710939a' ignored, using u'sha1:f7db36e89abc:5026bfe0dc36d0cafb16e8b814f9d1248ec8bbd6'"
      }
    }
Enabling notebook extension jupyter-js-widgets/extension...
Traceback (most recent

In [7]:
# Install Skater, which provides the LimeTextExplainer imported by the explorer cell below.
# The version is not pinned; the recorded output shows skater-1.0.2 being downloaded.
!sudo pip install skater

Collecting skater
  Downloading skater-1.0.2.tar.gz
Collecting ds-lime>=0.1.1.21 (from skater)
  Downloading ds-lime-0.1.1.27.tar.gz (253kB)
[K    100% |################################| 256kB 3.0MB/s ta 0:00:01
Collecting pathos==0.2.0 (from skater)
  Downloading pathos-0.2.0.tgz (68kB)
[K    100% |################################| 71kB 4.4MB/s ta 0:00:011
Collecting ppft>=1.6.4.5 (from pathos==0.2.0->skater)
  Downloading ppft-1.6.4.7.1.zip (78kB)
[K    100% |################################| 81kB 4.7MB/s ta 0:00:011
[?25hCollecting pox>=0.2.2 (from pathos==0.2.0->skater)
  Downloading pox-0.2.3.zip (41kB)
[K    100% |################################| 51kB 8.7MB/s eta 0:00:01
[?25hCollecting multiprocess>=0.70.4 (from pathos==0.2.0->skater)
  Downloading multiprocess-0.70.5.zip (1.5MB)
[K    100% |################################| 1.5MB 872kB/s eta 0:00:01
Building wheels for collected packages: skater, ds-lime, pathos, ppft, pox, multiprocess
  Running setup.py bdist_wheel fo

In [20]:
#Create the explorer app.
from warnings import filterwarnings
filterwarnings('ignore')
from ipywidgets import Button, Textarea, Layout, Box, Label, Text, Output, RadioButtons, HBox
from IPython.display import display, HTML, clear_output
from skater.core.local_interpretation.lime.lime_text import LimeTextExplainer

class TextExplainer(object):
    """Small ipywidgets app that shows a LIME explanation for editable text.

    The app consists of a text area, a radio-button group for choosing the
    model, an 'Explain' button, a status label and an output area into which
    the HTML explanation is rendered.
    """

    def __init__(self, models, init_pattern="", class_names=None):
        """
        Display box for LIME results.

        models: dictionary of prediction functions.
            Keys correspond to user-defined model names, used for radio buttons.
            Values are callables that are handed to LIME to generate
            predictions for a list of texts (e.g. a pipeline's predict_proba).

        init_pattern: string
            Text initially shown in the text area.

        class_names: list of strings, optional
            Class names shown in the explanation. Defaults to the target names
            of the notebook's module-level ``dataset``.

        """
        if class_names is None:
            class_names = dataset.target_names

        self.status = "Ready"
        self.explainer = LimeTextExplainer(class_names=class_names)
        self.models = models
        self.model_names = list(self.models.keys())
        self.text_field = Textarea(init_pattern, layout=Layout(height='200px', width='500px'))
        self.text_box = Box([Label(value='Text Box'), self.text_field])

        self.status_field = Label(self.status, layout=Layout(height='50px', width='100px'))
        self.status_box = Box([Label(value='Status'), self.status_field])

        self.match_button = Button(description='Explain')
        self.match_button.on_click(self.match_pattern)

        self.model_selectors = RadioButtons(
            options=self.model_names,
            description="Use Model"
        )

        self.inputs_box = HBox([self.text_box, self.model_selectors])

        self.explanation_area = Output()
        display(self.inputs_box)
        display(self.match_button)
        display(self.status_box)
        display(self.explanation_area)

    @property
    def model(self):
        """Prediction function currently selected with the radio buttons."""
        return self.models[self.model_selectors.value]

    @property
    def text(self):
        """Current content of the text area."""
        return self.text_field.value

    def match_pattern(self, b):
        """Button callback: explain the current text and render the result.

        b: the clicked button (unused).
        """
        self.status_field.value = 'loading'
        try:
            with self.explanation_area:
                clear_output()
                display(HTML(self.get_explanation_as_html(self.text)))
        finally:
            # Reset the label even when the explanation fails; otherwise the
            # widget would stay at 'loading' forever.
            self.status_field.value = 'ready'

    def get_explanation_as_html(self, text):
        """Return the LIME explanation of ``text`` for the selected model as HTML."""
        # top_labels=1 confines the LIME results to the most likely class
        explanation = self.explainer.explain_instance(text,
                                                      self.model,
                                                      top_labels=1)

        return explanation.as_html()

In [21]:
# Radio-button label -> probability function of the corresponding fitted pipeline.
# NOTE(review): the key ' GBC-Pretrain' starts with a space, which will show up in
# the radio-button label - confirm whether that is intended.
models = {"CNN": pipeline2.predict_proba, ' GBC-Pretrain': pipeline.predict_proba}
# Launch the explorer with one holdout document pre-filled in the text area.
r = TextExplainer(models, docs_test[3])

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"
