In [1]:
#from data_utils import data
import pandas as pd
import numpy as np
import re
import time

## Preprocessing

In [2]:
# Character vocabulary used to vectorize text. Index 0 is never assigned here,
# so it stays free for padding / out-of-vocabulary characters.
# Note: '-' occurs twice in the alphabet; the later occurrence wins in the
# mapping, so '-' maps to 60 and index 37 is never produced.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = dict(zip(alphabet, range(1, len(alphabet) + 1)))

In [3]:
def strToIndex(s, char_dict, input_size):
    """
    Encode a string as a fixed-length vector of character indices.

    The string is lower-cased, cut to at most ``input_size`` characters and
    each character is looked up in ``char_dict``. Characters missing from the
    dictionary, and positions past the end of the string, are left as 0.

    Args:
        s: string to encode
        char_dict: mapping from character to integer index
        input_size: the length of the returned vector
    Returns:
        int32 numpy array of length ``input_size``
    """
    encoded = np.zeros(input_size, dtype='int32')
    lowered = s.lower()
    for position, character in enumerate(lowered[:input_size]):
        # Unknown characters keep the 0 the vector was initialised with.
        encoded[position] = char_dict.get(character, 0)
    return encoded

In [4]:
def _clean_text(raw):
    """Trim surrounding whitespace, then turn backslash-n escapes into real newlines."""
    # The previous implementation used re.sub("^\s*(.-)\s*$", "%1", ...), which is
    # Lua pattern syntax: in Python it never trims, and on a match it replaces the
    # whole text with the literal "%1". A plain strip() is the intended behaviour.
    return raw.strip().replace("\\n", "\n")


def load_data(path, 
              char_dict = char_dict,
              input_size=1014):
    """
    Load a pickled DataFrame and vectorize its text.

    Each row is encoded as ``" " + title + article`` (both trimmed) through
    ``strToIndex``. The frame must provide the columns ``article``, ``title``
    and ``popularity``.

    Args:
        path: path to the pickled DataFrame
        char_dict: the character dictionary used for the encoding; it is now
            honoured instead of being rebuilt from the global alphabet
        input_size: the length of each feature vector
    Returns:
        (features, labels): an int32 array of shape (n_rows, input_size) and
        the one-hot encoding of ``popularity`` from ``pd.get_dummies``
    """
    # SECURITY: unpickling executes arbitrary code; only load trusted files.
    df = pd.read_pickle(path)

    # NOTE(review): the title and the article are joined without a separator,
    # exactly as before -- confirm that fusing them is intended.
    vectors = [
        strToIndex(" " + _clean_text(title) + _clean_text(article),
                   char_dict=char_dict,
                   input_size=input_size)
        for title, article in zip(df.title.values, df.article.values)
    ]

    if vectors:
        features = np.array(vectors)
    else:
        # Keep the result 2-D even when the frame has no rows.
        features = np.zeros((0, input_size), dtype='int32')

    # NOTE(review): get_dummies derives the columns from the labels present in
    # this file, so a split missing a class yields fewer columns -- verify.
    return features, pd.get_dummies(df.popularity).values

In [5]:
# Vectorize the train / validation / test splits, in that order.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = [
    load_data(split_path)
    for split_path in ('../../data/new/train.pkl',
                       '../../data/new/val.pkl',
                       '../../data/new/test.pkl')
]

## Deep Learning Model

In [8]:
from keras.models import Model
from keras.layers import Input, Dense, Flatten, GlobalMaxPool1D
from keras.layers import Convolution1D, MaxPool1D, Embedding
from keras.layers import ThresholdedReLU, Dropout, Concatenate
from keras.layers import AlphaDropout
from keras.callbacks import Callback, TensorBoard, EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [9]:
# Model configuration
INPUT_SIZE = 1014              # number of character positions per sample
ALPHABET_SIZE = len(alphabet)  # vocabulary size, excluding the reserved index 0
EMBEEDING_SIZE = 128           # embedding width (spelling kept: get_model uses this name)
BATCH_SIZE = 128               # mini-batch size used for training

# One [number of filters, kernel width] pair per parallel convolution branch.
CONV_LAYER = [
    [256, 10],
    [256, 7],
    [256, 5],
    [256, 3],
]

In [10]:
def get_model(CONV_LAYER):
    """
    Build and compile the character-level CNN classifier.

    The integer input of length INPUT_SIZE is embedded, passed through one
    convolution + global-max-pool branch per entry of ``CONV_LAYER``, and the
    concatenated branches feed two SELU dense layers and a 3-way softmax.
    The layer summary is printed before returning.

    Args:
        CONV_LAYER: iterable of (number of filters, kernel width) pairs,
            one per parallel convolution branch
    Returns:
        The compiled Keras model
    """
    def conv_branch(embedded_chars, num_filters, filter_width):
        # One tanh convolution, reduced to a single vector by max-over-time.
        feature_map = Convolution1D(filters=num_filters,
                                    kernel_size=filter_width,
                                    activation='tanh')(embedded_chars)
        return GlobalMaxPool1D()(feature_map)

    char_ids = Input(shape=(INPUT_SIZE, ), dtype='int64')

    # +1 because index 0 is reserved and real characters start at 1.
    embedded = Embedding(ALPHABET_SIZE+1, EMBEEDING_SIZE)(char_ids)

    branches = [conv_branch(embedded, num_filters, filter_width)
                for num_filters, filter_width in CONV_LAYER]
    hidden = Concatenate()(branches)

    # Two identical fully connected SELU layers.
    for _ in range(2):
        hidden = Dense(1024, activation='selu', kernel_initializer='lecun_normal')(hidden)

    class_probs = Dense(3, activation='softmax')(hidden)

    model = Model(inputs=char_ids, outputs=class_probs)
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

In [11]:
# Build and compile the char-CNN from the CONV_LAYER branch configuration;
# get_model also prints the layer summary shown below.
model = get_model(CONV_LAYER)

Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 1014)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1014, 128)    8960        input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1005, 256)    327936      embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 1008, 256)    229632      embedding_1[0][0]                
__________________________

In [12]:
from sklearn.metrics import roc_auc_score

class RocAucMetricCallback(Callback):
    """
    Keras callback that computes the validation ROC AUC during training.

    The score is written to ``logs['roc_auc_val']`` at the end of every epoch
    (and of every batch when ``include_on_batch`` is True), so callbacks listed
    after this one can monitor it. When no validation data is attached to the
    callback the logged value is ``-inf``.

    Args:
        predict_batch_size: batch size used for the validation prediction
        include_on_batch: also compute the score after every batch (expensive)
    """
    def __init__(self, predict_batch_size=1024, include_on_batch=False):
        super(RocAucMetricCallback, self).__init__()
        self.predict_batch_size=predict_batch_size
        self.include_on_batch=include_on_batch

    def _validation_roc_auc(self):
        """Return the ROC AUC on the validation data, or None if there is none."""
        # getattr: tolerate a base class that does not define the attribute.
        validation_data = getattr(self, 'validation_data', None)
        if not validation_data:
            return None
        predictions = self.model.predict(validation_data[0],
                                         batch_size=self.predict_batch_size)
        return roc_auc_score(validation_data[1], predictions)

    def _log_score(self, logs):
        """Store the current score (or -inf) in ``logs`` and return the raw score."""
        score = self._validation_roc_auc()
        logs['roc_auc_val'] = float('-inf') if score is None else score
        return score

    # The hooks take logs=None rather than logs={}: the old shared default
    # dict was mutated by the hooks and leaked state between calls. An empty
    # dict passed by the caller must still be written to, hence the explicit
    # ``is None`` checks instead of ``logs or {}``.

    def on_batch_begin(self, batch, logs=None):
        pass

    def on_batch_end(self, batch, logs=None):
        if not self.include_on_batch:
            return
        if logs is None:
            logs = {}
        self._log_score(logs)

    def on_train_begin(self, logs=None):
        # Register the metric name where the training loop exposes a metric
        # list; presumably not every Keras version does, so a missing entry
        # is skipped instead of raising KeyError -- TODO confirm.
        metrics = (self.params or {}).get('metrics')
        if metrics is not None and 'roc_auc_val' not in metrics:
            metrics.append('roc_auc_val')

    def on_train_end(self, logs=None):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        pass

    def on_epoch_end(self, epoch, logs=None):
        if logs is None:
            logs = {}
        score = self._log_score(logs)
        if score is not None:
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [13]:
# TensorBoard logging of the graph and weight images (no histograms).
tbCallBack = TensorBoard(log_dir='../../output/char_cnn', histogram_freq=0, write_graph=True, write_images=True)

# Must run first: it writes 'roc_auc_val' into the epoch logs that the
# callbacks below read.
roc_auc_callback = RocAucMetricCallback()

# Stop once the validation ROC AUC has not improved for 5 epochs.
early_stopping = EarlyStopping(monitor='roc_auc_val',patience=5, verbose=2,mode='max')

# NOTE(review): save_best_only is not set, so a checkpoint is written after
# every epoch and `monitor` has no influence on which files are kept -- confirm
# this is intended.
checkpoint = ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='roc_auc_val', verbose=1)

cb = [roc_auc_callback, early_stopping, tbCallBack, checkpoint]

Instructions for updating:
Use the retry module or similar alternatives.


In [14]:
# Wall-clock timer. time.clock() was deprecated in Python 3.3 and removed in
# 3.8, so it raises AttributeError on current interpreters.
start = time.time()

# Train with early stopping on the validation ROC AUC (see the callback list).
hist = model.fit(x_train, y_train,
                 batch_size = BATCH_SIZE,
                 epochs = 100,
                 validation_data = (x_val, y_val),
                 callbacks=cb,
                 verbose=1)
print('Trianing time:', time.time()-start)

# NOTE(review): this saves the weights of the last epoch that ran, which with
# an early-stopping patience of 5 is presumably several epochs past the best
# validation score -- load the best per-epoch checkpoint if the best model is
# what is wanted.
model.save('best_model.h5')

Train on 74996 samples, validate on 8333 samples
Epoch 1/100

 ROC-AUC - epoch: 1 - score: 0.770658 


Epoch 00001: saving model to weights.01-0.73.hdf5
Epoch 2/100

 ROC-AUC - epoch: 2 - score: 0.778597 


Epoch 00002: saving model to weights.02-0.78.hdf5
Epoch 3/100

 ROC-AUC - epoch: 3 - score: 0.791468 


Epoch 00003: saving model to weights.03-0.74.hdf5
Epoch 4/100

 ROC-AUC - epoch: 4 - score: 0.795115 


Epoch 00004: saving model to weights.04-0.71.hdf5
Epoch 5/100

 ROC-AUC - epoch: 5 - score: 0.796412 


Epoch 00005: saving model to weights.05-0.72.hdf5
Epoch 6/100

 ROC-AUC - epoch: 6 - score: 0.787895 


Epoch 00006: saving model to weights.06-0.83.hdf5
Epoch 7/100

 ROC-AUC - epoch: 7 - score: 0.784886 


Epoch 00007: saving model to weights.07-0.76.hdf5
Epoch 8/100

 ROC-AUC - epoch: 8 - score: 0.768410 


Epoch 00008: saving model to weights.08-0.89.hdf5
Epoch 9/100

 ROC-AUC - epoch: 9 - score: 0.771850 


Epoch 00009: saving model to weights.09-0.88.hdf5
Epoch 10/100

 

In [15]:
# Class probabilities for the training inputs, predicted in batches of 1024.
pred_train = model.predict(x_train, batch_size=1024)

In [17]:
# Class probabilities for the test inputs, predicted in batches of 1024.
pred_test = model.predict(x_test, batch_size=1024)

In [18]:
# Sanity check: one row per test sample, one column per output class.
pred_test.shape

(4188, 3)

In [19]:
# Persist both prediction arrays as .npy files in the current working directory.
np.save('pred_train.npy',pred_train)
np.save('pred_test.npy', pred_test)