In [None]:
import tensorflow as tf
import bz2
import tqdm
import re
import numpy as np

from sklearn.utils import shuffle
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
# Paths to the bz2-compressed train/test review files.
data_dir = "./data/amazonreviews"
train_file = "{}/train.ft.txt.bz2".format(data_dir)
test_file = "{}/test.ft.txt.bz2".format(data_dir)

In [None]:
def split_reviews_labels(lines, max_chars=512):
    """Split raw labelled lines into parallel lists of review texts and labels.

    Args:
        lines: iterable of raw record strings; each is passed to
            `review_to_x` for the text and to `review_to_y` for the label.
        max_chars: maximum number of characters kept from each cleaned review.
            Defaults to 512 (the previously hard-coded limit); pass ``None``
            to keep the full text.

    Returns:
        A ``(reviews, labels)`` tuple of two lists with one entry per input
        line, in input order.
    """
    reviews = []
    labels = []
    for line in tqdm.tqdm(lines):
        # Slicing with `None` as the bound keeps the whole string.
        reviews.append(review_to_x(line)[:max_chars])
        labels.append(review_to_y(line))
    return reviews, labels
        
        
def review_to_x(review):
    """Extract and normalise the review text from one raw labelled line.

    The text is everything after the first space. It is stripped of a single
    trailing newline, lower-cased, has every digit replaced by ``0`` and, when
    it looks like it contains a link, has URL-like tokens replaced by
    ``<url>``.

    Args:
        review: raw record string of the form ``"<label> <text>"``.

    Returns:
        The normalised review text; an empty string when the line has no text
        after the label.
    """
    # Everything after the first space is the review body. `partition` yields
    # an empty body (instead of raising) when the line has no space at all.
    text = review.partition(" ")[2]
    # Only drop a real trailing newline; slicing off the last character
    # unconditionally would eat a letter on a final line without one.
    if text.endswith("\n"):
        text = text[:-1]
    text = text.lower()
    # Raw string: "\d" in a plain literal is an invalid escape sequence.
    text = re.sub(r"\d", "0", text)

    # "http" already covers "https:", so no separate check is needed.
    if "www." in text or "http" in text or ".com" in text:
        # Replace each space-free run that ends in ".xxx" (dot + 3 letters).
        text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)
    return text


def review_to_y(review):
    """One-hot encode the label token that prefixes a raw line.

    Returns ``[1, 0]`` when the text before the first space is exactly
    ``"__label__1"`` and ``[0, 1]`` for anything else.
    """
    label_token, _, _ = review.partition(" ")
    if label_token == "__label__1":
        return [1, 0]
    return [0, 1]

In [None]:
# Read each archive inside a context manager so the handle is always closed,
# and keep `train_file` / `test_file` bound to the paths so this cell can be
# re-run. Lines are decoded while streaming, which avoids holding a bytes copy
# and a str copy of the whole dataset at the same time.
with bz2.BZ2File(train_file) as train_handle:
    train_lines = [raw.decode("utf-8") for raw in train_handle]
with bz2.BZ2File(test_file) as test_handle:
    test_lines = [raw.decode("utf-8") for raw in test_handle]

reviews_train, y_train = split_reviews_labels(train_lines)
reviews_test, y_test = split_reviews_labels(test_lines)

In [None]:
# The training split was already preprocessed when the data was loaded, so the
# expensive pass is not repeated here; just verify the result is consistent.
assert len(reviews_train) == len(y_train) == len(train_lines)

In [None]:
# A fixed seed makes the shuffled order, and therefore the validation split
# that `model.fit` later takes from the training data, reproducible across
# kernel restarts.
reviews_train, y_train = shuffle(reviews_train, y_train, random_state=42)
reviews_test, y_test = shuffle(reviews_test, y_test, random_state=42)

In [None]:
# Convert both one-hot label lists into NumPy arrays.
y_train, y_test = (np.array(label_rows) for label_rows in (y_train, y_test))

In [None]:
max_features = 256 #8192  (vocabulary size kept by the tokenizer)
maxlen = 128              # number of tokens per model input
embed_size = 64           # embedding dimension

tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(reviews_train)

# `texts_to_sequences` (plural "texts") is the Tokenizer method name. The
# resulting ragged lists are padded/truncated to `maxlen` because the model's
# input layer has a fixed length.
x_train = pad_sequences(tokenizer.texts_to_sequences(reviews_train),
                        maxlen=maxlen, padding="post", truncating="post")
x_test = pad_sequences(tokenizer.texts_to_sequences(reviews_test),
                       maxlen=maxlen, padding="post", truncating="post")

In [None]:
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter per channel to compute the attention value for a single timestep.

    Expects a 3-dimensional input and learns one weight matrix of shape
    (input_shape[2], 1). When `return_attention` is True the layer returns
    `[result, att_weights]` instead of only `result`.
    """

    def __init__(self, return_attention=False, **kwargs):
        # Run the base initialiser first: attributes assigned before it may be
        # overwritten (or rejected) by the base Layer, depending on the Keras
        # version.
        super(AttentionWeightedAverage, self).__init__(**kwargs)
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention

    def get_config(self):
        """Return the base layer config extended with `return_attention`."""
        config = {
            'return_attention': self.return_attention,
        }
        base_config = super(AttentionWeightedAverage, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

    def build(self, input_shape):
        """Create the attention weight `W` for a 3-dimensional input shape."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # `add_weight` already registers W with the layer, so there is no need
        # to assign `trainable_weights` by hand (that assignment fails on Keras
        # versions where the property is read-only).
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1.

        Returns `[result, att_weights]` when `return_attention` is set,
        otherwise just `result`.
        """
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every weight is zero
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that delegates to `compute_output_shape`."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return (input_shape[0], input_shape[2]), plus the attention shape
        (input_shape[0], input_shape[1]) when `return_attention` is set."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Never propagate a mask: return None, or one None per input mask."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

In [None]:
# Model options; the values match the keyword defaults of `get_model`.
feature_output = return_attention = False
embed_dropout_rate = final_dropout_rate = 0
embed_l2 = 1e-6

In [None]:
def get_model(maxlen, feature_output = False, embed_dropout_rate=0,
             final_dropout_rate=0, embed_l2=1e-6, return_attention=False):
    """Build and compile the attention-based sentiment classifier.

    The vocabulary size and embedding dimension are read from the module-level
    `max_features` and `embed_size`.

    Args:
        maxlen: length of each (padded) input token sequence.
        feature_output: if True, output the attention-pooled feature vector
            instead of class probabilities.
        embed_dropout_rate: spatial dropout rate applied to the embeddings
            (0 disables it).
        final_dropout_rate: dropout rate applied before the softmax layer
            (0 disables it); only used when `feature_output` is False.
        embed_l2: L2 regularisation factor for the embedding weights
            (0 disables it).
        return_attention: if True, the attention weights are appended to the
            model outputs.

    Returns:
        A compiled Keras `Model`.
    """
    # Local import: the regulariser is not otherwise imported by the notebook.
    from keras.regularizers import l2

    model_input = Input(shape=(maxlen,))
    embed_reg = l2(embed_l2) if embed_l2 != 0 else None
    x = Embedding(max_features, embed_size,
                  embeddings_regularizer=embed_reg)(model_input)
    x = Activation("tanh")(x)
    if embed_dropout_rate != 0:
        x = SpatialDropout1D(embed_dropout_rate)(x)
    lstm_0_output = Bidirectional(LSTM(512, return_sequences=True), name="bi_lstm_0")(x)
    lstm_1_output = Bidirectional(LSTM(512, return_sequences=True), name="bi_lstm_1")(lstm_0_output)
    x = concatenate([lstm_1_output, lstm_0_output, x])

    # if return_attention is True in AttentionWeightedAverage, an additional tensor
    # representing the weight at each timestep is returned
    weights = None
    x = AttentionWeightedAverage(name='attlayer', return_attention=return_attention)(x)
    if return_attention:
        x, weights = x

    if not feature_output:
        # output class probabilities
        if final_dropout_rate != 0:
            x = Dropout(final_dropout_rate)(x)
        # Two units: the labels are 2-element one-hot vectors trained with
        # categorical cross-entropy.
        outputs = [Dense(2, activation="softmax", name="softmax")(x)]
    else:
        # output penultimate feature vector
        outputs = [x]

    if return_attention:
        # add the attention weights to the outputs if required
        outputs.append(weights)

    # The name avoids spaces, which are not valid in every backend's naming rules.
    model = Model(inputs=[model_input], outputs=outputs, name="sentiment_model")
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["acc"])
    return model


In [None]:
# Build the training model (no attention output) and print its layer summary.
model = get_model(maxlen=maxlen, return_attention=False)
model.summary()

In [None]:
# Train for 5 epochs, holding out 10% of the training data for validation.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=2048,
    epochs=5,
    validation_split=0.1,
)

In [None]:
# Persist the trained weights to disk.
weights_path = "sentiment_model.h5"
model.save_weights(weights_path)

In [None]:
# Second model instance that also returns the attention weights; it loads the
# weights saved from the trained model. The keyword is `return_attention`
# (the misspelled form raised a TypeError).
model2 = get_model(maxlen, return_attention=True)
model2.load_weights("sentiment_model.h5")

In [None]:
from IPython.display import HTML as html_print

def amplify_difference(x):
    """Sharpen differences between values and express them as integer percentages.

    The values are centred on their mean, scaled by 100, passed through a
    softmax along axis 0, multiplied by 100 and truncated to integers.

    Args:
        x: array-like of numbers.

    Returns:
        Integer NumPy array with the same shape as `x`; empty input gives an
        empty array.
    """
    values = np.asarray(x, dtype=float)
    if values.size == 0:
        return values.astype(int)
    scaled = (values - np.mean(values)) * 100
    # Shift by the maximum before exponentiating: the softmax result is
    # mathematically unchanged, but exp() can no longer overflow to inf and
    # produce NaN.
    scaled = scaled - np.max(scaled, axis=0)
    weights = np.exp(scaled)
    return (weights / np.sum(weights, axis=0) * 100).astype(int)


def color_print(s, strength, sentiment=None):
    """Wrap a token in an HTML element coloured by sentiment and strength.

    Args:
        s: token to display; it is HTML-escaped before insertion.
        strength: subtracted from 50 to give the HSL lightness percentage.
        sentiment: optional 2-element sequence; the HSL hue is
            ``min(100 - sentiment[0], sentiment[1])``. When omitted, the hue
            falls back to 0 instead of raising.

    Returns:
        An HTML snippet string.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    h = 0 if sentiment is None else min(100 - sentiment[0], sentiment[1])
    # Escape the token so characters such as "<" or "&" cannot break the markup.
    return "<text style='color:white; background-color:hsl({}, 100%, {}%)'>{} </text>".format(h, 50-strength, escape(str(s)))

In [None]:
sent = ["This page is awesome"]

inp = tokenizer.texts_to_sequences(sent)
# The model input has a fixed length, so pad/truncate to `maxlen`. Padding at
# the end keeps the real tokens in the leading positions; this should match
# how the training inputs were padded.
padded = pad_sequences(inp, maxlen=maxlen, padding="post", truncating="post")
out, attn = model2.predict(padded)

# Tokens actually seen by the model (words outside the tokenizer vocabulary
# are dropped by `texts_to_sequences`), limited to the padded length.
tmp_text = tokenizer.sequences_to_texts(inp)
tokens = tmp_text[0].split()[:maxlen]

# Only the leading len(tokens) attention weights correspond to real tokens.
strength = amplify_difference(attn[0].tolist()[:len(tokens)])

# NOTE(review): `color_print` computes its hue as `100 - sentiment[0]`, which
# looks like it expects percentages, so the class probabilities are scaled to
# 0-100 here - confirm this is the intended colour scale.
sentiment_pct = out[0] * 100

string = "".join(
    color_print(token, strength[i], sentiment_pct)
    for i, token in enumerate(tokens)
)

html_print(string)