In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import time

%matplotlib inline

In [2]:
from collections import defaultdict

import keras
import keras.backend as K
from keras.layers import Dense, GlobalAveragePooling1D, Embedding
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.callbacks import Callback, TensorBoard, EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

Using TensorFlow backend.


## Preprocessing

In [3]:
train = pd.read_pickle('../../data/new/train.pkl')
val = pd.read_pickle('../../data/new/val.pkl')
test = pd.read_pickle('../../data/new/test.pkl')

In [4]:
train.article = train.title + ' ' + train.article
val.article = val.title + ' ' + val.article
test.article = test.title + ' ' + test.article

In [5]:
def preprocess(text):
    """
    Clean the text
    
    Args:
        text: the original text
    Returns:
        text: the cleaned text
    """
    text = text.replace("' ", " ' ")
    signs = set(',.:;"?!')
    prods = set(text) & signs
    if not prods:
        return text

    for sign in prods:
        text = text.replace(sign, ' {} '.format(sign) )
    return text

In [6]:
def create_docs(df, n_gram_max=2):
    """
    Convert the origina text to the combination of unigrams and bigrams
    
    Args:
        df: Dataframe 
        n_gram_max: the maximum of the n-grams kept in the data
    Returns:
        Combination of unigram and bigram
    """
    def add_ngram(q, n_gram_max):
            ngrams = []
            for n in range(2, n_gram_max+1):
                for w_index in range(len(q)-n+1):
                    ngrams.append('--'.join(q[w_index:w_index+n]))
            return q + ngrams
        
    docs = []
    for doc in df:
        doc = preprocess(doc).split()
        docs.append(' '.join(add_ngram(doc, n_gram_max)))
    
    return docs

In [7]:
train_docs = create_docs(train.article.values)
val_docs = create_docs(val.article.values)
test_docs = create_docs(test.article.values)

In [8]:
min_count = 2
tokenizer = Tokenizer(lower=True, filters='')
tokenizer.fit_on_texts(train_docs)
num_words = sum([1 for _, v in tokenizer.word_counts.items() if v >= min_count])

tokenizer = Tokenizer(num_words=num_words, lower=True, filters='')
tokenizer.fit_on_texts(train_docs)
train_docs = tokenizer.texts_to_sequences(train_docs)
val_docs = tokenizer.texts_to_sequences(val_docs)
test_docs = tokenizer.texts_to_sequences(test_docs)

In [9]:
# print(np.mean([len(doc) for doc in train_docs]))
print(np.mean([len(doc) for doc in train_docs]))

371.88590058136435


In [10]:
maxlen = 371
x_train = pad_sequences(sequences=train_docs, maxlen=maxlen)
x_val = pad_sequences(sequences=val_docs, maxlen=maxlen)
x_test = pad_sequences(sequences=test_docs, maxlen=maxlen)

In [11]:
y_train = pd.get_dummies(train.popularity).values
y_val = pd.get_dummies(val.popularity).values
y_test = pd.get_dummies(test.popularity).values

In [12]:
from sklearn.metrics import roc_auc_score
class RocAucMetricCallback(Callback):
    """
    Define a new callback to compute the roc auc score during the training process
    """
    def __init__(self, predict_batch_size=1024, include_on_batch=False):
        super(RocAucMetricCallback, self).__init__()
        self.predict_batch_size=predict_batch_size
        self.include_on_batch=include_on_batch
 
    def on_batch_begin(self, batch, logs={}):
        pass
 
    def on_batch_end(self, batch, logs={}):
        if(self.include_on_batch):
            logs['roc_auc_val']=float('-inf')
            if(self.validation_data):
                logs['roc_auc_val']=roc_auc_score(self.validation_data[1], 
                                                  self.model.predict(self.validation_data[0],
                                                                     batch_size=self.predict_batch_size))
 
    def on_train_begin(self, logs={}):
        if not ('roc_auc_val' in self.params['metrics']):
            self.params['metrics'].append('roc_auc_val')
 
    def on_train_end(self, logs={}):
        pass
 
    def on_epoch_begin(self, epoch, logs={}):
        pass
 
    def on_epoch_end(self, epoch, logs={}):
        logs['roc_auc_val']=float('-inf')
        if(self.validation_data):
            score = roc_auc_score(self.validation_data[1], 
                                              self.model.predict(self.validation_data[0],
                                                                 batch_size=self.predict_batch_size))
            logs['roc_auc_val']=score
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [13]:
# Training configuration
tbCallBack = TensorBoard(log_dir='../../output/fasttext', histogram_freq=0, write_graph=True, write_images=True)
cb = [
    RocAucMetricCallback(), # include it before EarlyStopping!
    EarlyStopping(monitor='roc_auc_val',patience=5, verbose=2,mode='max'),
    tbCallBack,
    ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='roc_auc_val', verbose=1)    
]

Instructions for updating:
Use the retry module or similar alternatives.


In [14]:
input_dim = np.max(x_train) + 1
embedding_dims = 20

In [15]:
def get_model(input_dim, embedding_dims=20, optimizer='adam'):
    """
    Construct the computational graph of the fasttext
    
    Args:
        input_dim: the dimension of the input vector
        embedding_dims: the dimension of the embedding layer
        optimizer: the optimizer used to optimize the loss function
    Returns:
        The Keras implemented model
    """
    model = Sequential()
    model.add(Embedding(input_dim = input_dim, output_dim=embedding_dims))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(3,activation='softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer = optimizer, 
                  metrics = ['accuracy'])
    return model

In [16]:
model = get_model(input_dim=input_dim)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 20)          21204700  
_________________________________________________________________
global_average_pooling1d_1 ( (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 63        
Total params: 21,204,763
Trainable params: 21,204,763
Non-trainable params: 0
_________________________________________________________________


In [17]:
start = time.clock()
hist = model.fit(x_train, 
                 y_train,
                 batch_size = 64,
                 validation_data = (x_val,y_val),
                 epochs = 100,
                 callbacks = cb,
                 verbose=1)
print('Training time:', time.clock() - start)

Train on 74996 samples, validate on 8333 samples
Epoch 1/100

 ROC-AUC - epoch: 1 - score: 0.745814 


Epoch 00001: saving model to weights.01-0.77.hdf5
Epoch 2/100

 ROC-AUC - epoch: 2 - score: 0.797263 


Epoch 00002: saving model to weights.02-0.71.hdf5
Epoch 3/100

 ROC-AUC - epoch: 3 - score: 0.822195 


Epoch 00003: saving model to weights.03-0.67.hdf5
Epoch 4/100

 ROC-AUC - epoch: 4 - score: 0.829257 


Epoch 00004: saving model to weights.04-0.65.hdf5
Epoch 5/100

 ROC-AUC - epoch: 5 - score: 0.830785 


Epoch 00005: saving model to weights.05-0.66.hdf5
Epoch 6/100

 ROC-AUC - epoch: 6 - score: 0.829388 


Epoch 00006: saving model to weights.06-0.67.hdf5
Epoch 7/100

 ROC-AUC - epoch: 7 - score: 0.825295 


Epoch 00007: saving model to weights.07-0.71.hdf5
Epoch 8/100

 ROC-AUC - epoch: 8 - score: 0.821719 


Epoch 00008: saving model to weights.08-0.74.hdf5
Epoch 9/100

 ROC-AUC - epoch: 9 - score: 0.818743 


Epoch 00009: saving model to weights.09-0.78.hdf5
Epoch 10/100

 

In [18]:
model.save('best_model.h5')

In [18]:
pred_train = model.predict(x_train, batch_size=1024)
pred_test = model.predict(x_test, batch_size=1024)

In [24]:
np.save('pred_train.npy',pred_train)
np.save('pred_test.npy', pred_test)