In [1]:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout 
from tensorflow.keras.layers import MaxPooling1D, Flatten, Activation
from tensorflow.keras.utils import to_categorical
from alibi.explainers import IntegratedGradients
from captum.attr import LayerIntegratedGradients, TokenReferenceBase
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # True

TF version:  2.3.0
Eager execution enabled:  True


In [2]:
max_features = 10000
maxlen = 100

In [3]:
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
test_labels = y_test.copy()
train_labels = y_train.copy()
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

index = imdb.get_word_index()
reverse_index = {value: key for (key, value) in index.items()} 

Loading data...


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray


25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)


In [4]:
def decode_sentence(x, reverse_index):
    # the `-3` offset is due to the special tokens used by keras
    # see https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
    return " ".join([reverse_index.get(i - 3, 'UNK') for i in x])

In [5]:
print(decode_sentence(x_test[1], reverse_index)) 

a powerful study of loneliness sexual UNK and desperation be patient UNK up the atmosphere and pay attention to the wonderfully written script br br i praise robert altman this is one of his many films that deals with unconventional fascinating subject matter this film is disturbing but it's sincere and it's sure to UNK a strong emotional response from the viewer if you want to see an unusual film some might even say bizarre this is worth the time br br unfortunately it's very difficult to find in video stores you may have to buy it off the internet


# Models

In [6]:
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250

In [7]:
device = torch.device("cpu")

In [8]:
class Net(nn.Module):
    def __init__(self):

        super(Net, self).__init__()

        self.emb = nn.Embedding(max_features,
                               embedding_dims)
        self.linear1 = nn.Linear(5000, hidden_dims)
        self.linear2 = nn.Linear(hidden_dims, 2)

    def forward(self,x):
        x = self.emb(x)
        x = torch.flatten(x, 1)
        x = self.linear1(x)
        x = F.relu(x) # Adding relu layers makes the attributions different
        x = self.linear2(x)
        return x

In [9]:
    
inputs = Input(shape=(maxlen,), 
               dtype='int32', 
               name='inputs')
out = Embedding(max_features,
                embedding_dims, 
                name='emb')(inputs)
out = Flatten(name='Flat', 
              data_format='channels_last')(out)
out = Dense(hidden_dims, 
            name='linear1')(out)
out = Activation('relu')(out) 
# Adding relu layers makes the attributions different
out = Dense(2, 
            name='linear2')(out)
model = Model(inputs=inputs, outputs=out)

In [10]:
net = Net()
net.to(device)

Net(
  (emb): Embedding(10000, 50)
  (linear1): Linear(in_features=5000, out_features=250, bias=True)
  (linear2): Linear(in_features=250, out_features=2, bias=True)
)

# transfer weights

In [11]:
lnames = []
for name in net.state_dict().keys():
    lname = name.split('.')[0]
    if lname not in lnames:
        lnames.append(lname)

In [12]:
for name in lnames:

    if 'conv' in name:
        ws = net.state_dict()[name + '.weight'].cpu().numpy()
        ws = np.transpose(ws, (2, 1, 0))
        bs = net.state_dict()[name + '.bias'].cpu().numpy()
        l = model.get_layer(name)
        l.set_weights([ws, bs])
    elif 'linear' in name:
        ws = net.state_dict()[name + '.weight'].cpu().numpy()
        ws = ws.T
        bs = net.state_dict()[name + '.bias'].cpu().numpy()
        l = model.get_layer(name)
        l.set_weights([ws, bs])
    elif 'emb' in name:
        ws = net.state_dict()[name + '.weight'].cpu().numpy()
        l = model.get_layer(name)
        l.set_weights([ws])
    print(name, ws.shape) #  , bs.shape)

emb (10000, 50)
linear1 (5000, 250)
linear2 (250, 2)


In [13]:
weights_emb_pt = net.state_dict()['emb.weight']
weights_emb_tf = model.layers[1].get_weights()

In [14]:
np.allclose(weights_emb_pt, weights_emb_tf, rtol=1e-03)

True

In [31]:
nb_samples = 10
torch_X_test = torch.from_numpy(x_test)
torch_y_test = torch.from_numpy(y_test)
x_test_sample = torch_X_test[nb_samples:nb_samples + 10]

In [32]:
model(x_test_sample.numpy())

<tf.Tensor: shape=(10, 2), dtype=float32, numpy=
array([[ 0.03181124,  0.01722053],
       [ 0.051693  ,  0.13679706],
       [ 0.1720075 ,  0.2850133 ],
       [ 0.16410483,  0.47410205],
       [ 0.12408741,  0.5561405 ],
       [-0.14426398,  0.24505275],
       [-0.13632366,  0.3933563 ],
       [-0.04491811,  0.35326967],
       [-0.40871558,  0.09026024],
       [-0.00379803,  0.17383164]], dtype=float32)>

In [33]:
net(x_test_sample.to(device))

tensor([[ 0.0318,  0.0172],
        [ 0.0517,  0.1368],
        [ 0.1720,  0.2850],
        [ 0.1641,  0.4741],
        [ 0.1241,  0.5561],
        [-0.1443,  0.2451],
        [-0.1363,  0.3934],
        [-0.0449,  0.3533],
        [-0.4087,  0.0903],
        [-0.0038,  0.1738]], grad_fn=<AddmmBackward>)

# Intgrads comparison

In [34]:
n_steps = 50
method = "gausslegendre"
internal_batch_size = 100

In [35]:
def interpret_sentence(model, indexed, min_len = 100, label = 1):

    input_indices = indexed.to(device)
    seq_length = min_len

    # predict
    pred = net.forward(input_indices)

    # generate reference indices for each sample
    reference_indices = torch.tensor(np.zeros(input_indices.shape), dtype=int).to(device)

    # compute attributions and approximation delta using layer integrated gradients
    attributions_ig, delta = lig.attribute(input_indices, 
                                           reference_indices, 
                                           target=label,
                                           method=method,
                                           n_steps=50, 
                                           return_convergence_delta=True,
                                          attribute_to_layer_input=True)
    
    return attributions_ig, delta, reference_indices.numpy()

In [36]:
lig = LayerIntegratedGradients(net, net.linear1)

In [37]:
token_reference = TokenReferenceBase(reference_token_idx=0)
# For simplicity, we compute the attribution relative to label = 1 for all samples
attributions_pt, delta, reference_indices = interpret_sentence(net, 
                                                               x_test_sample, 
                                                               label=1)
attributions_pt = attributions_pt.numpy()
print('Attributions shape:', attributions_pt.shape)

Attributions shape: (10, 5000)


In [38]:
layer = model.layers[3]
layer

<tensorflow.python.keras.layers.core.Dense at 0x7fa2d9b98c50>

In [39]:
ig  = IntegratedGradients(model,
                          layer=layer,
                          n_steps=n_steps, 
                          method=method,
                          internal_batch_size=internal_batch_size)

In [40]:
x_test_sample = x_test_sample.numpy()
#predictions = model(x_test_sample).numpy().argmax(axis=1)
explanation = ig.explain(x_test_sample, 
                         baselines=reference_indices, 
                         target=1,
                        compute_layer_inputs_gradients=True)
# Get attributions values from the explanation object
attributions_tf = explanation.attributions[0]
print('Attributions shape:', attributions_tf.shape)

Attributions shape: (10, 5000)


### Compare

In [41]:
np.allclose(attributions_tf, attributions_pt, atol=1e-07)

True

In [42]:
attributions_pt

array([[-4.21469571e-03, -1.62378047e-03,  3.91519864e-03, ...,
         4.39232335e-05,  2.64565624e-03, -1.53950182e-03],
       [-1.35421988e-03,  1.93412980e-03,  4.48370843e-03, ...,
        -2.73249235e-04,  2.34585438e-03,  2.02211036e-03],
       [-4.58304672e-03, -3.43286137e-03,  1.30007402e-03, ...,
        -2.89742109e-03, -2.24289863e-04, -4.13060519e-03],
       ...,
       [ 7.79471960e-04, -6.50702091e-03, -3.36324445e-05, ...,
        -1.22888147e-03,  9.23896788e-04, -3.11811411e-04],
       [ 2.74067355e-03, -1.35195815e-03, -9.73411199e-05, ...,
        -3.67314182e-04,  6.62322877e-04, -2.21013898e-03],
       [-3.47013454e-04, -1.47317675e-03, -6.65119712e-05, ...,
         1.34957540e-03,  4.65485775e-03,  3.84821680e-03]])

In [43]:
attributions_tf

array([[-4.2146957e-03, -1.6237805e-03,  3.9151986e-03, ...,
         4.3923235e-05,  2.6456562e-03, -1.5395018e-03],
       [-1.3542199e-03,  1.9341299e-03,  4.4837082e-03, ...,
        -2.7324923e-04,  2.3458544e-03,  2.0221104e-03],
       [-4.5830468e-03, -3.4328613e-03,  1.3000739e-03, ...,
        -2.8974211e-03, -2.2428986e-04, -4.1306051e-03],
       ...,
       [ 7.7947194e-04, -6.5070209e-03, -3.3632445e-05, ...,
        -1.2288815e-03,  9.2389685e-04, -3.1181140e-04],
       [ 2.7406735e-03, -1.3519581e-03, -9.7341122e-05, ...,
        -3.6731418e-04,  6.6232285e-04, -2.2101388e-03],
       [-3.4701344e-04, -1.4731766e-03, -6.6511966e-05, ...,
         1.3495754e-03,  4.6548578e-03,  3.8482167e-03]], dtype=float32)

In [28]:
np.allclose(delta, explanation.deltas, atol=1e-6)

True

In [29]:
delta

tensor([-4.5890e-03, -4.4222e-04,  3.8949e-03,  2.5060e-03, -2.2662e-03,
         2.7603e-03, -1.1770e-03,  1.7273e-03, -2.3336e-06,  8.0885e-03],
       dtype=torch.float64)

In [30]:
explanation.deltas

array([-4.58897650e-03, -4.42281365e-04,  3.89492512e-03,  2.50585377e-03,
       -2.26621330e-03,  2.76022404e-03, -1.17677450e-03,  1.72734261e-03,
       -2.22399831e-06,  8.08829069e-03])

# Transformers

In [3]:
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import re
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Conv1D, GlobalMaxPooling1D, Dropout 
from tensorflow.keras.utils import to_categorical

from transformers import BertTokenizerFast, TFBertModel, BertConfig
from transformers import DistilBertTokenizerFast
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from alibi.explainers import IntegratedGradients
import matplotlib.pyplot as plt
print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # True

TF version:  2.3.0
Eager execution enabled:  True


In [4]:
def preprocess_reviews(reviews):
    
    REPLACE_NO_SPACE = re.compile("[.;:,!\'?\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")
    
    reviews = [REPLACE_NO_SPACE.sub("", line.lower()) for line in reviews]
    reviews = [REPLACE_WITH_SPACE.sub(" ", line) for line in reviews]
    
    return reviews

def process_sentences(sentence1, 
                      tokenizer, 
                      max_len, 
                      distill=False, 
                      add_special_tokens=True):

    if not distill:
        z = tokenizer(sentence1, 
                      add_special_tokens = add_special_tokens, 
                      padding = 'max_length', 
                      max_length = max_len, truncation = True,
                      return_token_type_ids=True, 
                      return_attention_mask = True,  
                      return_tensors = 'np')

        return [z['input_ids'], z['attention_mask']]
    elif distill:
        z = tokenizer(sentence1, 
                      truncation=True, 
                      padding=True)
        
        return z

def decode_sentence(x, reverse_index):
    # the `-3` offset is due to the special tokens used by keras
    # see https://stackoverflow.com/questions/42821330/restore-original-text-from-keras-s-imdb-dataset
    return " ".join([reverse_index.get(i - 3, 'UNK') for i in x])

In [5]:
max_features = 10000
max_len = 100
distill=False
add_special_tokens=False

In [6]:
print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
test_labels = y_test.copy()
train_labels = y_train.copy()
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
y_train, y_test = to_categorical(y_train), to_categorical(y_test)

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=max_len)
x_test = sequence.pad_sequences(x_test, maxlen=max_len)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

index = imdb.get_word_index()
reverse_index = {value: key for (key, value) in index.items()} 

Loading data...


Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray


25000 train sequences
25000 test sequences
Pad sequences (samples x time)
x_train shape: (25000, 100)
x_test shape: (25000, 100)


In [7]:
X_train, X_test = [], []
for i in range(len(x_train)):
    tr_sentence = decode_sentence(x_train[i], reverse_index)
    X_train.append(tr_sentence)
    te_sentence = decode_sentence(x_test[i], reverse_index)
    X_test.append(te_sentence)

In [8]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [9]:
X_train = preprocess_reviews(X_train)
X_train = process_sentences(X_train, 
                                   tokenizer, 
                                   max_len,                                    
                                   distill=distill, 
                                   add_special_tokens=add_special_tokens)
X_test = preprocess_reviews(X_test)
X_test = process_sentences(X_test, 
                                   tokenizer, 
                                   max_len,                                    
                                   distill=distill, 
                                   add_special_tokens=add_special_tokens)

In [10]:
config = BertConfig.from_pretrained("distilbert-base-uncased", 
                                    output_hidden_states=False)
modelBert = TFBertModel.from_pretrained("distilbert-base-uncased",
                                        config=config)

modelBert.trainable=False

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFBertModel: ['activation_13', 'vocab_projector', 'distilbert', 'vocab_layer_norm', 'vocab_transform']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFBertModel were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['bert']
You should probably TRAIN this model on a down-stre

In [11]:
inp =  [X_test[0][:2], X_test[1][:2]]
input_shape = [x.shape[1:] for x in inp]
input_type = [x.dtype for x in inp]

# Bert 
Without a wrapper, IG won't work for bert models because of the customized bert output object.

In [89]:
n_steps = 5
method = "gausslegendre"
internal_batch_size = 100

In [90]:
layer = modelBert.bert.encoder.layer[3]
layer

<transformers.models.bert.modeling_tf_bert.TFBertLayer at 0x7fdb42f43310>

In [80]:
ig  = IntegratedGradients(modelBert,
                          layer=layer,
                          n_steps=n_steps, 
                          method=method,
                          internal_batch_size=10)

In [81]:
explanation = ig.explain(inp, 
                         baselines=None, 
                         target=1)
# Get attributions values from the explanation object
attributions_tf = explanation.attributions[0]
print('Attributions shape:', attributions_tf.shape)

[<tf.Tensor: shape=(10, 100, 768), dtype=float32, numpy=
array([[[ 2.0742157 , -0.8950368 ,  1.8933085 , ...,  0.8202039 ,
         -1.897774  ,  2.402349  ],
        [ 1.524005  , -0.13682622,  1.255195  , ...,  0.81792676,
         -0.6837781 ,  1.349809  ],
        [ 1.5790133 , -0.33281478,  1.3452189 , ..., -0.7252396 ,
         -0.6827304 ,  0.12341752],
        ...,
        [ 1.8804123 , -0.98103607,  0.86389995, ...,  0.10786448,
         -1.0245299 ,  1.9350705 ],
        [ 2.159008  , -0.1955547 ,  2.0331736 , ...,  0.689219  ,
         -0.45227218,  0.3811051 ],
        [ 1.8842316 , -1.4659264 ,  2.5653646 , ..., -0.797865  ,
         -0.98540187,  1.081907  ]],

       [[ 2.071913  , -0.8583547 ,  1.8113096 , ...,  0.79668903,
         -1.8640118 ,  2.389125  ],
        [ 1.5532464 , -0.12048204,  1.2645011 , ...,  0.83791834,
         -0.6803481 ,  1.3865451 ],
        [ 1.5945048 , -0.27372402,  1.2716355 , ..., -0.7648175 ,
         -0.6778127 ,  0.08656603],
        ..

TypeError: '>' not supported between instances of 'tuple' and 'int'

# Bert Wrapper
Using a wrapper such that the output is a tensor, IG should work

In [12]:
class BertWrapper(tf.keras.Model):

    def __init__(self, 
                modelBert):
        super(BertWrapper, self).__init__()
        self.bert_layer = modelBert
        
    def call(self, x):
        return self.bert_layer(x)[0]

In [13]:
bert_wrapper = BertWrapper(modelBert)

In [14]:
n_steps = 5
method = "gausslegendre"
internal_batch_size = 100

In [15]:
layer = bert_wrapper.layers[0].bert.encoder.layer[3]
layer

<transformers.models.bert.modeling_tf_bert.TFBertLayer at 0x7f95a642ac90>

In [16]:
ig  = IntegratedGradients(bert_wrapper,
                          layer=layer,
                          n_steps=n_steps, 
                          method=method,
                          internal_batch_size=10)

In [17]:
explanation = ig.explain(inp, 
                         baselines=None, 
                         target=1,
                        compute_layer_inputs_gradients=True)
# Get attributions values from the explanation object
attributions_tf = explanation.attributions[0]
print('Attributions shape:', attributions_tf.shape)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Attributions shape: (2, 100, 768)


In [18]:
attributions_tf

array([[[-5.3308293e-04, -5.6819106e-04,  1.3113442e-04, ...,
         -6.6759887e-05,  7.3145190e-04,  1.0570344e-03],
        [-1.6599990e-01,  8.5477479e-02,  3.1553423e-01, ...,
          2.1606695e-02, -4.2951778e-03, -7.0996985e-02],
        [ 6.8114678e-05, -7.7692693e-04, -1.0643669e-04, ...,
         -1.3060689e-04,  1.8519655e-04,  9.0612390e-04],
        ...,
        [-1.6729332e-06,  8.8790970e-05,  2.9183851e-04, ...,
         -2.7614128e-04,  3.5111469e-04,  2.8110386e-05],
        [ 5.1933084e-06, -3.7488635e-04, -6.3839936e-05, ...,
          8.2263956e-04, -1.1921322e-04,  5.1259394e-05],
        [-1.4389029e-05, -1.1729125e-03, -6.6376990e-04, ...,
         -6.6033425e-04,  1.3904712e-05,  1.3005122e-03]],

       [[-9.0011854e-05,  7.5423100e-05,  5.5642970e-05, ...,
          3.6419783e-04, -2.0745641e-05,  1.1699090e-03],
        [-4.7081542e-01,  6.3265628e-01, -3.5147730e-01, ...,
          1.1277145e-02, -9.5583908e-02, -6.0068276e-02],
        [ 5.7949310e-06, 

# ModelOut

In [61]:
nb_filters=64
dropout_1=0.4
dropout_2=0.
hidden_dims=128
batch_size = 128
epochs = 20
skip_conv=True

In [62]:
class ModelOut(tf.keras.Model):

    def __init__(self, 
                 nb_filters=32,
                 dropout_1=0.2,
                 dropout_2=0.2, 
                 hidden_dims=32,
                skip_conv=True):
        super(ModelOut, self).__init__()
        
        self.nb_filters = nb_filters
        self.dropout_1 = dropout_1
        self.dropout_2 = dropout_2
        self.hidden_dims = hidden_dims
        self.skip_conv = skip_conv
        
        if not self.skip_conv:
            self.conv = tf.keras.layers.Conv1D(nb_filters, 
                                               kernel_size=3, 
                                               padding="valid", 
                                               strides=1)
            self.dropoutl_1 = tf.keras.layers.Dropout(dropout_1)
            self.maxpool = tf.keras.layers.GlobalMaxPool1D()
        else:
            self.flat = tf.keras.layers.Flatten()
            
        #self.dense_1 =  tf.keras.layers.Dense(hidden_dims, 
        #                                      activation='relu')
        #self.dropoutl_2 = tf.keras.layers.Dropout(dropout_2)
        self.dense_2 = tf.keras.layers.Dense(2, 
                                             activation='softmax')

    def call(self, inputs):
        if not self.skip_conv:
            x = self.conv(inputs)
            x = self.dropoutl_1(x)
            x = self.maxpool(x)
        else:
            x = self.flat(inputs)
            
        #x = self.dense_1(x)
        #x = self.dropoutl_2(x)
        x = self.dense_2(x)
        return x
    
    def get_config(self):
        return {"nb_filters": self.nb_filters,
                "dropout_1": self.dropout_1,
                "dropout_2": self.dropout_2, 
                "hidden_dims": self.hidden_dims,
               "skip_conv": self.skip_conv}

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [63]:
model_out = ModelOut(nb_filters=nb_filters,
                 dropout_1=dropout_1,
                 dropout_2=dropout_2, 
                 hidden_dims=hidden_dims,
                    skip_conv=skip_conv)

In [64]:
input_ids_in = tf.keras.layers.Input(shape=(max_len,), 
                                     name='input_ids', 
                                     dtype=tf.int32)
attention_masks_in = tf.keras.layers.Input(shape=(max_len,), 
                                           name='attention_mask', 
                                           dtype=tf.int32)
X = modelBert([input_ids_in, attention_masks_in])[0]
X = model_out(X)
frozenModelOut = tf.keras.Model(inputs=[input_ids_in, 
                                        attention_masks_in], 
                                outputs=X)



In [65]:
frozenModelOut(inp)

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.4026964 , 0.59730357],
       [0.26494804, 0.73505193]], dtype=float32)>

In [66]:
n_steps = 5
method = "gausslegendre"
internal_batch_size = 100

In [67]:
layer = frozenModelOut.layers[2].bert.encoder.layer[1]
layer

<transformers.models.bert.modeling_tf_bert.TFBertLayer at 0x7fa2d4229fd0>

In [68]:
ig  = IntegratedGradients(frozenModelOut,
                          layer=layer,
                          n_steps=n_steps, 
                          method=method,
                          internal_batch_size=10)

In [69]:
predictions = frozenModelOut(inp).numpy().argmax(axis=1)
predictions

array([1, 1])

In [72]:

explanation = ig.explain(inp, 
                         baselines=None, 
                         target=predictions,
                        compute_layer_inputs_gradients=True)
# Get attributions values from the explanation object
attributions_tf = explanation.attributions[0]
print('Attributions shape:', attributions_tf.shape)

Attributions shape: (2, 100, 768)


In [73]:
attributions_tf

array([[[-9.9543695e-07, -2.2059334e-05, -1.0523251e-03, ...,
         -1.2000428e-04, -7.1018934e-04,  1.5115821e-04],
        [-3.8008086e-04,  4.9328115e-05, -1.2376311e-05, ...,
         -1.3036310e-05,  3.9209600e-04,  4.6793887e-04],
        [-2.6385476e-06,  4.7545458e-05,  7.4247940e-04, ...,
          1.7666531e-04, -4.4124055e-04, -6.2729610e-04],
        ...,
        [ 9.0255175e-04,  3.5623176e-05, -6.8455399e-04, ...,
          3.5659878e-03, -3.7591232e-04, -3.7780768e-04],
        [-2.3484734e-04, -1.1101482e-05, -5.5904465e-04, ...,
         -1.1464040e-03, -1.8696446e-03,  3.2213450e-04],
        [ 8.5562188e-04,  4.8604206e-04, -1.7912698e-04, ...,
          1.1792754e-04,  5.0187798e-04,  2.1630540e-04]],

       [[ 7.9523292e-05,  4.6781803e-04, -1.0212285e-03, ...,
         -5.7124951e-05,  4.9771642e-04, -2.4902020e-05],
        [ 8.1456332e-05,  1.2103335e-05, -2.4234883e-05, ...,
         -4.2496040e-04, -3.9054572e-05,  6.4270105e-04],
        [ 8.9010333e-05, 

# It gets messy below. 

# Gradients

In [37]:
from typing import Union, Callable, List

In [38]:
x_test_sample = x_test_sample.numpy()

In [39]:
target = [1 for _ in range(len(x_test_sample))]

In [40]:
orig_dummy_input = np.zeros((1,) + x_test_sample.shape[1:], dtype=x_test_sample.dtype)

In [55]:
def _gradients_layer(model: Union[tf.keras.models.Model],
                     layer: Union[tf.keras.layers.Layer],
                     orig_call: Callable,
                     orig_dummy_input: Union[list, np.ndarray],
                     x: tf.Tensor,
                     target: Union[None, tf.Tensor],
                    gradients_layer_inputs = False) -> tf.Tensor:
    """
    Calculates the gradients of the target class output (or the output if the output dimension is equal to 1)
    with respect to each element of `layer`.

    Parameters
    ----------
    model
        Tensorflow or keras model.
    layer
        Layer of the model with respect to which the gradients are calculated.
    orig_call
        Original `call` method of the layer. This is necessary since the call method is modified by the function
        in order to make the layer output visible to the GradientTape.
    x
        Input data point.
    target
        Target for which the gradients are calculated if the output dimension is higher than 1.

    Returns
    -------
        Gradients for each element of layer.

    """

    def watch_layer(layer, tape):
        """
        Make an intermediate hidden `layer` watchable by the `tape`.
        After calling this function, you can obtain the gradient with
        respect to the output of the `layer` by calling:

            grads = tape.gradient(..., layer.result)

        """

        def decorator(func):
            def wrapper(*args, **kwargs):
                # Store the result and the input of `layer.call` internally.
                if gradients_layer_inputs:
                    layer.inp = x
                    layer.result = func(x, **kwargs)
                else:
                    layer.inp = args
                    layer.result = x
                # From this point onwards, watch this tensors.
                tape.watch(layer.inp)
                tape.watch(layer.result)
                # Return the result to continue with the forward pass.
                return layer.result

            return wrapper

        layer.call = decorator(layer.call)
        return layer
    print(orig_dummy_input)
    with tf.GradientTape() as tape:
        watch_layer(layer, tape)
        preds = _run_forward(model, orig_dummy_input, target)
    
    if gradients_layer_inputs:
        grads = tape.gradient(preds, layer.inp)
    else:
        grads = tape.gradient(preds, layer.result)
    
    delattr(layer, 'inp')
    delattr(layer, 'result')
    layer.call = orig_call

    return grads

def _run_forward(model: Union[tf.keras.models.Model],
                 x: Union[List[tf.Tensor], List[np.ndarray]],
                 target: Union[None, tf.Tensor, np.ndarray, list]) -> tf.Tensor:
    """
    Returns the output of the model. If the target is not `None`, only the output for the selected target is returned.

    Parameters
    ----------
    model
        Tensorflow or keras model.
    x
        Input data point.
    target
        Target for which the gradients are calculated for classification models.

    Returns
    -------
        Model output or model output after target selection for classification models.

    """
    print(x)
    preds = model(x)
    if len(model.output_shape) > 1 and model.output_shape[-1] > 1:
        preds = _select_target(preds, target)

    return preds

def _select_target(ps, ts):
    if ts is not None:
        if isinstance(ps, tf.Tensor):
            ps = tf.linalg.diag_part(tf.gather(ps, ts, axis=1))
        else:
            raise NotImplementedError
    else:
        raise ValueError("target cannot be `None` if `model` output dimensions > 1")
    return ps


In [56]:
def _run_forward_to_layer(model: Union[tf.keras.models.Model, 'keras.models.Model'],
                            layer: Union[tf.keras.layers.Layer, 'keras.layers.Layer'],
                            orig_call: Callable,
                            x: tf.Tensor,
                         run_forward_to_layer_input: bool = False) -> tf.Tensor:

    def take_layer(layer):
        """
        Make an intermediate hidden `layer` watchable by the `tape`.
        After calling this function, you can obtain the gradient with
        respect to the output of the `layer` by calling:

            grads = tape.gradient(..., layer.result)

        """

        def decorator(func):
            def wrapper(*args, **kwargs):
                # Store the result of `layer.call` internally.
                layer.inp = args
                layer.result = func(*args, **kwargs)
                # Return the result to continue with the forward pass.
                return layer.result

            return wrapper

        layer.call = decorator(layer.call)
        return layer

    #inp = tf.zeros((x.shape[0], ) + model.input_shape[1:])
    take_layer(layer)
    _ = model(x)
    layer_inp = layer.inp
    layer_out = layer.result
    
    delattr(layer, 'inp')    
    delattr(layer, 'result')
    layer.call = orig_call
    
    if run_forward_to_layer_input:
        return layer_inp
    else:
        return layer_out

In [57]:
layer = model.layers[3]
layer


<tensorflow.python.keras.layers.core.Dense at 0x7f22d4433190>

In [58]:
orig_call = layer.call

In [59]:
X_layer_out = _run_forward_to_layer(model, layer, orig_call, x_test_sample, run_forward_to_layer_input=False)

In [60]:
X_layer_out

<tf.Tensor: shape=(10, 250), dtype=float32, numpy=
array([[-1.8045537 , -0.4056986 ,  0.02945525, ...,  0.5100677 ,
         1.0586026 ,  0.2051692 ],
       [-0.50673413, -0.67081714, -0.13903496, ...,  0.66713613,
        -0.33622786, -0.4687015 ],
       [-0.5948938 , -0.31598407, -0.5050943 , ..., -0.22554867,
         0.9042289 , -0.46439877],
       ...,
       [-0.7473105 ,  0.08260478, -0.8872134 , ..., -0.2511767 ,
        -0.05167651,  0.4660214 ],
       [-0.5327522 ,  0.592322  , -0.5168347 , ...,  0.3552242 ,
        -1.0856992 , -0.42254546],
       [ 0.20351203, -0.8673625 ,  0.66974527, ...,  0.22718716,
        -0.7815447 ,  0.5024527 ]], dtype=float32)>

In [61]:
grads_out = _gradients_layer(model, 
                             layer, 
                             orig_call, 
                             orig_dummy_input, 
                             X_layer_out, 
                             target, 
                             gradients_layer_inputs=False)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [62]:
grads_out

<tf.Tensor: shape=(10, 250), dtype=float32, numpy=
array([[-0.        ,  0.        ,  0.04052283, ..., -0.02394654,
         0.01421037, -0.05370912],
       [-0.        ,  0.        ,  0.        , ..., -0.02394654,
         0.        , -0.        ],
       [-0.        ,  0.        ,  0.        , ..., -0.        ,
         0.01421037, -0.        ],
       ...,
       [-0.        ,  0.05601896,  0.        , ..., -0.        ,
         0.        , -0.05370912],
       [-0.        ,  0.05601896,  0.        , ..., -0.02394654,
         0.        , -0.        ],
       [-0.06189159,  0.        ,  0.04052283, ..., -0.02394654,
         0.        , -0.05370912]], dtype=float32)>

In [63]:
grads_out.shape

TensorShape([10, 250])

In [64]:
X_layer_inp = _run_forward_to_layer(model, layer, orig_call, x_test_sample, run_forward_to_layer_input=True)

In [65]:
X_layer_inp = list(X_layer_inp)

In [66]:
grads_inp = _gradients_layer(model, 
                             layer, 
                             orig_call, 
                             orig_dummy_input, 
                             X_layer_inp[0], 
                             target, 
                             gradients_layer_inputs=True)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [67]:
grads_inp.shape

TensorShape([10, 5000])

In [68]:
grads_inp

<tf.Tensor: shape=(10, 5000), dtype=float32, numpy=
array([[ 0.00215829,  0.00053226, -0.00132799, ...,  0.00082246,
        -0.00292184, -0.00209101],
       [ 0.00397081,  0.00129692,  0.00639185, ..., -0.00112882,
        -0.00440302,  0.0033445 ],
       [-0.00123563,  0.00191363,  0.00069667, ..., -0.00538727,
        -0.00228617, -0.00169568],
       ...,
       [ 0.00401346, -0.00294157, -0.0004713 , ...,  0.00027939,
        -0.0032652 ,  0.00195899],
       [ 0.00205418,  0.0014013 ,  0.00123545, ..., -0.00320662,
        -0.00349352,  0.00186839],
       [ 0.00040617,  0.00227149, -0.00106775, ..., -0.00035076,
        -0.00024771,  0.00031978]], dtype=float32)>

In [193]:
orig_call = layer.call
orig_dummy_input = [np.zeros((1,) + inpp.shape[1:]) for inpp in inp]
target = [1 for _ in range(nb_samples)]

In [195]:
grads = _gradients_layer(frozenModelOut, layer, orig_call, orig_dummy_input, X_layer, target )

[<tf.Tensor: shape=(10, 100, 768), dtype=float32, numpy=
array([[[ 0.03096277, -1.328463  , -0.58381706, ..., -1.542637  ,
         -1.6382638 ,  1.5585074 ],
        [ 1.2325355 , -0.08492582, -2.1955416 , ..., -1.0991684 ,
         -0.85656124,  0.4230998 ],
        [ 2.0808983 , -2.5597546 , -0.48032314, ..., -2.0058136 ,
         -0.80445063,  1.6266632 ],
        ...,
        [ 2.8055623 , -1.453405  ,  0.88466865, ..., -2.4409838 ,
         -1.542648  ,  1.0648844 ],
        [-1.363974  , -0.9465396 , -2.29329   , ..., -0.30235538,
         -1.2025822 ,  0.765783  ],
        [ 2.048068  , -1.3881792 ,  1.1389545 , ..., -0.78310555,
         -0.54225874, -0.32534844]],

       [[-0.17857285, -1.4808518 , -0.613645  , ..., -0.76305103,
         -2.339498  , -0.4367423 ],
        [ 0.12191792, -1.9209031 , -1.2052633 , ..., -1.2629689 ,
         -1.2369899 ,  0.527245  ],
        [ 1.8495976 , -3.388928  ,  0.9084114 , ..., -0.3409986 ,
         -1.3057836 , -0.5260624 ],
        ..

In [198]:
grads[0]

<tf.Tensor: shape=(10, 100, 768), dtype=float32, numpy=
array([[[ 4.92075109e-04,  6.02714681e-05, -2.32987673e-04, ...,
          3.59852042e-04, -3.63110921e-05,  4.38389921e-04],
        [ 2.30642516e-04, -5.50971075e-04,  2.32170787e-04, ...,
         -7.54989014e-05, -1.28455751e-04, -5.20038127e-04],
        [ 3.85588964e-06,  9.03854816e-05, -3.36457997e-05, ...,
          1.92570878e-04,  5.98768820e-04, -9.02307729e-05],
        ...,
        [-8.38256019e-05, -2.25406053e-04,  5.53932368e-05, ...,
          3.71104310e-04, -1.60682655e-04, -7.47951155e-04],
        [ 2.16036497e-04, -2.48291064e-04, -4.66909805e-05, ...,
          1.11414547e-05,  1.36700692e-04,  1.63533186e-04],
        [-2.63755064e-04,  5.33933417e-05, -3.49554408e-04, ...,
         -9.18781079e-05, -1.38716758e-04, -5.60689543e-04]],

       [[ 4.75050125e-04,  4.77776957e-05, -3.33185104e-04, ...,
          3.04283283e-04, -1.69587787e-04,  4.77981550e-04],
        [ 1.33143709e-04, -5.38074994e-04,  1.1

# Feed layer

In [1]:
import tensorflow as tf
import numpy as np

In [2]:
def _run_forward_from_layer(model,
                            layer,
                            orig_call,
                            orig_dummy_input,
                            x,
                            run_from_layer_inputs = False):

    def feed_layer(layer):
        """
        Overwrites the intermediate layer status with the precomputed values `x`.

        """
        def decorator(func):
            def wrapper(*args, **kwargs):
                # Store the result and the inputs of `layer.call` internally.
                if run_from_layer_inputs:
                    layer.inp = x
                    layer.result = func(*x, **kwargs)
                else:
                    layer.inp = args
                    layer.result = x
                # Return the result to continue with the forward pass.
                return layer.result

            return wrapper

        layer.call = decorator(layer.call)
        return layer

    feed_layer(layer)
    preds = model(orig_dummy_input)

    delattr(layer, 'inp')
    delattr(layer, 'result')
    layer.call = orig_call

    return preds

In [3]:
initializer = tf.keras.initializers.Ones()

In [3]:
inputs = tf.keras.Input(shape=(16,))
out = tf.keras.layers.Dense(8, 
                            kernel_initializer=tf.keras.initializers.Ones(),
                            name='linear1')(inputs)
out = tf.keras.layers.Dense(1, 
                            kernel_initializer=tf.keras.initializers.Ones(), 
                            name='linear3')(out)
model = tf.keras.Model(inputs=inputs, outputs=out)

In [4]:
layer = model.layers[1]
layer
orig_call = layer.call

In [6]:
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 16)]              0         
_________________________________________________________________
linear1 (Dense)              (None, 8)                 136       
_________________________________________________________________
linear3 (Dense)              (None, 1)                 9         
Total params: 145
Trainable params: 145
Non-trainable params: 0
_________________________________________________________________


In [7]:
dummy_input = np.zeros((1, 16))
x_input_layer_1 = np.ones((2, 16))
x_output_layer_1 = np.ones((3, 8))

In [8]:
model(dummy_input)

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.]], dtype=float32)>

In [11]:
preds_from_layer_inputs = _run_forward_from_layer(model, 
                                                  layer, 
                                                  orig_call,
                                                  dummy_input,
                                                  [tf.convert_to_tensor(x_input_layer_1)],
                                                  run_from_layer_inputs=True)

In [12]:
preds_from_layer_inputs

<tf.Tensor: shape=(2, 1), dtype=float32, numpy=
array([[128.],
       [128.]], dtype=float32)>

In [13]:
preds_from_layer_output = _run_forward_from_layer(model,
                                                  layer,
                                                  orig_call,
                                                  dummy_input,
                                                  x_output_layer_1,
                                                  run_from_layer_inputs=False)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



In [14]:
preds_from_layer_output

<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[8.],
       [8.],
       [8.]], dtype=float32)>