In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from random import shuffle

%matplotlib inline

In [20]:
from nltk.tokenize import word_tokenize
import collections
from keras.preprocessing import sequence
from keras.preprocessing import text

In [21]:
#from data_utils import *
#from attention_rnn import *
import datetime
import time
import sys
import os

## Preprocessing

In [22]:
def clean_str(text):
    """
    Clean the text
    
    Args:
        original text
    Returns:
        cleaned text
    """
    text = re.sub(r"[^A-Za-z0-9(),!?\'\`\"]", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.strip().lower()

    return text

In [23]:
def buildWordDict(PATH):
    """
    Build the word dictionary
    
    Args:
        the path of the input
    Returns:
        word dictionary
    """
	if not os.path.exists("word_dict.pickle"):
		train_df = pd.read_pickle(PATH)
		contents = train_df['article']

		words = list()
		for content in contents:
			for word in word_tokenize(clean_str(content)):
				words.append(word)

		word_counter = collections.Counter(words).most_common()
		word_dict = dict()
		word_dict["<pad>"] = 0
		word_dict["<unk>"] = 1
		word_dict["<eos>"] = 2
		for word, _ in word_counter:
			word_dict[word] = len(word_dict)

		with open("word_dict.pickle", "wb") as f:
			pickle.dump(word_dict,f)

	else:
		with open("word_dict.pickle", "rb") as f:
			word_dict = pickle.load(f)

	return word_dict

In [24]:
def build_word_dataset(series, word_dict, max_len):
	series = series.sample(frac=1)
	x = list(map(lambda d: word_tokenize(clean_str(d)), series))
	x = list(map(lambda d: list(map(lambda w: word_dict.get(w, word_dict["<unk>"]), d)), x))
	x = list(map(lambda d: d + [word_dict["<eos>"]], x))
	x = list(map(lambda d: d[:document_max_len], x))
	x = list(map(lambda d: d + (document_max_len - len(d)) * [word_dict["<pad>"]], x))
	
	return x

In [25]:
def sequence_vectorize(train_texts, val_texts,test_texts, max_len, max_feature):
    """
    Vectorize the text
    
    Args:
        train_texts: training data
        val_texts: validation data
        test_texts: testing data
        max_len: maximum length of the input
        max_features: maximum length of the feature vector
    Returns:
        vectorized data and the tokenizer
    """

    tokenizer = text.Tokenizer(num_words = max_feature)
    tokenizer.fit_on_texts(train_texts)

    x_train = tokenizer.texts_to_sequences(train_texts)
    x_val = tokenizer.texts_to_sequences(val_texts)
    x_test = tokenizer.texts_to_sequences(test_texts)

    if len(max(x_train, key=len)) < max_len:
        max_len = len(max(x_train, key=len))

    x_train = sequence.pad_sequences(x_train, maxlen= max_len)
    x_val = sequence.pad_sequences(x_val, maxlen=max_len)
    x_test = sequence.pad_sequences(x_test, maxlen=max_len)

    return x_train, x_val, x_test, tokenizer.word_index

In [26]:
train_df = pd.read_pickle('../../data/new/train.pkl')
val_df = pd.read_pickle('../../data/new/val.pkl')
test_df = pd.read_pickle('../../data/new/test.pkl')

train_df['text'] = train_df.title +' '+ train_df.article
val_df['text'] = val_df.title + ' '+ val_df.article
test_df['text'] = test_df.title + ' ' + test_df.article

In [27]:
MAX_LEN = 131
MAX_FEATURE = 20000

In [28]:
x_train, x_val,x_test, word_dict = sequence_vectorize(train_df.text.values, 
                                               val_df.text.values,
                                               test_df.text.values,
                                               MAX_LEN,
                                               MAX_FEATURE)

In [29]:
y_train = pd.get_dummies(train_df.popularity).values
y_val = pd.get_dummies(val_df.popularity).values
y_test = pd.get_dummies(test_df.popularity).values

## Deep Model

In [30]:
VOCOB_SIZE = len(word_dict)
BATCH_SIZE = 128
#EVALUATE_EVERY = 100
#CHECKPOINT_EVERY = 100
EPOCHES = 100
#learning_rate = 1e-3
EMBED_SIZE = 256
NUM_HIDDEN = 256

In [31]:
from keras.models import Sequential
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Flatten
from keras.layers import Input, Dense, Embedding, concatenate, Dropout
from keras.callbacks import Callback

from keras import backend as K
from keras.layers import Layer
from keras import initializers, regularizers, constraints
from keras.callbacks import EarlyStopping, TensorBoard, Callback, ModelCheckpoint

In [32]:
from sklearn.metrics import roc_auc_score

class RocAucMetricCallback(Callback):
    """
    Define a new callback to compute the roc auc score during the training process
    """
    def __init__(self, predict_batch_size=1024, include_on_batch=False):
        super(RocAucMetricCallback, self).__init__()
        self.predict_batch_size=predict_batch_size
        self.include_on_batch=include_on_batch
 
    def on_batch_begin(self, batch, logs={}):
        pass
 
    def on_batch_end(self, batch, logs={}):
        if(self.include_on_batch):
            logs['roc_auc_val']=float('-inf')
            if(self.validation_data):
                logs['roc_auc_val']=roc_auc_score(self.validation_data[1], 
                                                  self.model.predict(self.validation_data[0],
                                                                     batch_size=self.predict_batch_size))
 
    def on_train_begin(self, logs={}):
        if not ('roc_auc_val' in self.params['metrics']):
            self.params['metrics'].append('roc_auc_val')
 
    def on_train_end(self, logs={}):
        pass
 
    def on_epoch_begin(self, epoch, logs={}):
        pass
 
    def on_epoch_end(self, epoch, logs={}):
        logs['roc_auc_val']=float('-inf')
        if(self.validation_data):
            score = roc_auc_score(self.validation_data[1], 
                                              self.model.predict(self.validation_data[0],
                                                                 batch_size=self.predict_batch_size))
            logs['roc_auc_val']=score
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [33]:
def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
 
class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """
 
    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
 
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
 
        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)
 
        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)
 
        self.bias = bias
        super(AttentionWithContext, self).__init__(**kwargs)
 
    def build(self, input_shape):
        assert len(input_shape) == 3
 
        self.W = self.add_weight((input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight((input_shape[-1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
 
        self.u = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)
 
        super(AttentionWithContext, self).build(input_shape)
 
    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None
 
    def call(self, x, mask=None):
        uit = dot_product(x, self.W)
 
        if self.bias:
            uit += self.b
 
        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)
 
        a = K.exp(ait)
 
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
 
        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        # a /= K.cast(K.sum(a, axis=1, keepdims=True), K.floatx())
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
 
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)
 
    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[-1]

In [34]:
tbCallBack = TensorBoard(log_dir='../../output/bilstm', histogram_freq=0, write_graph=True, write_images=True)
cb = [
    RocAucMetricCallback(), # include it before EarlyStopping!
    EarlyStopping(monitor='roc_auc_val',patience=5, verbose=2,mode='max'),
    tbCallBack,
    ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='roc_auc_val', verbose=1)    
]

In [35]:
# Keras Model
def get_model(attention=True):
    """
    Construct the computational graph
    
    Args:
        attention: if use the attention mechanism or not
    Returns:
        Keras implemented model
    """
    
    model = Sequential()
    model.add(Embedding(MAX_FEATURE, EMBED_SIZE, embeddings_initializer='uniform'))
    if attention:
        model.add(Bidirectional(LSTM(units=NUM_HIDDEN, dropout=0.5, return_sequences=True)))
        model.add(Bidirectional(LSTM(units=NUM_HIDDEN, dropout=0.5)))
        model.add(Dropout(0.5))
    else:
        model.add(Bidirectional(LSTM(units=NUM_HIDDEN, dropout=0.5, return_sequences=True)))
        model.add(Bidirectional(LSTM(units=NUM_HIDDEN, dropout=0.5, rerurn_sequences=True)))
        model.add(AttentionWithContext())
    #model.add(Flatten())
    
    model.add(Dense(3, activation='softmax'))

    
    model.summary()
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
    
    return model

In [36]:
model = get_model()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 256)         5120000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 512)         1050624   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 1539      
Total params: 7,747,075
Trainable params: 7,747,075
Non-trainable params: 0
_________________________________________________________________


In [37]:
start = time.clock()
model.fit(x_train, y_train,
          batch_size = BATCH_SIZE,
          epochs = EPOCHES,
          validation_data = (x_val, y_val),
          callbacks=cb,
          verbose=1)

print("Training time:" , time.clock() - start)

Train on 74996 samples, validate on 8333 samples
Epoch 1/100

 ROC-AUC - epoch: 1 - score: 0.778054 


Epoch 00001: saving model to weights.01-0.45.hdf5
Epoch 2/100

 ROC-AUC - epoch: 2 - score: 0.786083 


Epoch 00002: saving model to weights.02-0.44.hdf5
Epoch 3/100

 ROC-AUC - epoch: 3 - score: 0.780753 


Epoch 00003: saving model to weights.03-0.45.hdf5
Epoch 4/100

 ROC-AUC - epoch: 4 - score: 0.768836 


Epoch 00004: saving model to weights.04-0.49.hdf5
Epoch 5/100

 ROC-AUC - epoch: 5 - score: 0.758676 


Epoch 00005: saving model to weights.05-0.56.hdf5
Epoch 6/100

 ROC-AUC - epoch: 6 - score: 0.751912 


Epoch 00006: saving model to weights.06-0.65.hdf5
Epoch 7/100

 ROC-AUC - epoch: 7 - score: 0.736891 


Epoch 00007: saving model to weights.07-0.76.hdf5
Epoch 00007: early stopping
Training time: 2493.1315


In [38]:
model.save('best_model.h5')

In [39]:
pred_test = model.predict(x_test, 1024)
pred_train = model.predict(x_train, 1024)

In [40]:
np.save('pred_test.npy', pred_test)
np.save('pred_train.npy',pred_train)