In [None]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint, TensorBoard
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from collections import defaultdict
from sklearn.metrics import roc_auc_score

In [None]:
def read_data(data_dir='../../data/new'):
    """Load the train/validation/test splits and print their sizes.

    Args:
        data_dir: directory containing ``train.pkl``, ``val.pkl`` and
            ``test.pkl``. The default is the location this notebook has
            always read from, so ``read_data()`` behaves as before.

    Returns:
        tuple ``(df_train, df_val, df_test)`` with the unpickled objects.
    """
    # NOTE(review): unpickling can execute arbitrary code; only point
    # data_dir at files you trust.
    splits = (('Train', 'train.pkl'), ('Val', 'val.pkl'), ('Test', 'test.pkl'))
    frames = [pd.read_pickle(os.path.join(data_dir, filename))
              for _, filename in splits]

    for (label, _), frame in zip(splits, frames):
        print('{}: {} samples'.format(label, frame.shape[0]))

    return tuple(frames)

## Preprocessing: fastText-inspired model for title and keywords

In [None]:
def concat_keyword_title(df):
    """Build one "keywords + title" string per row.

    Args:
        df: DataFrame with a ``keywords`` column (each cell an iterable of
            strings) and a ``title`` column (strings).

    Returns:
        numpy array of the combined strings. As a side effect the same
        strings are stored in ``df['concat_keywords_title']``.
    """
    # Join into a new Series instead of overwriting df.keywords: the old
    # in-place overwrite made the function non-idempotent (a second call
    # would join the *characters* of the already-joined string).
    joined_keywords = df.keywords.apply(' ,'.join)
    # Separate keywords from the title with a space so the last keyword and
    # the first title word do not fuse into a single token.
    df['concat_keywords_title'] = joined_keywords + ' ' + df.title

    return df.concat_keywords_title.values
    

In [None]:
def preprocess(text):
    """Surround punctuation with spaces so a whitespace split isolates it.

    An apostrophe followed by a space gets a leading space, and every
    occurrence of one of ``, . : ; " ? !`` is wrapped in single spaces.

    Args:
        text: the string to process.

    Returns:
        the processed string.
    """
    spaced = text.replace("' ", " ' ")
    for mark in ',.:;"?!':
        if mark in spaced:
            spaced = spaced.replace(mark, ' {} '.format(mark))
    return spaced

In [None]:
def create_docs(df, n_gram_max=2):
    """Tokenise each text and append its word n-grams.

    Args:
        df: iterable of strings (despite the name, it is iterated directly
            and every element is passed to ``preprocess``).
        n_gram_max: largest n-gram size to add; sizes 2..n_gram_max are used.

    Returns:
        list of strings, one per input text: the whitespace tokens followed
        by the n-grams, whose words are glued together with ``--``.
    """
    def add_ngram(q, n_gram_max):
        # All bigrams first, then trigrams, ... each in reading order.
        extra = ['--'.join(q[start:start + size])
                 for size in range(2, n_gram_max + 1)
                 for start in range(len(q) - size + 1)]
        return q + extra

    return [' '.join(add_ngram(preprocess(raw).split(), n_gram_max))
            for raw in df]

In [None]:
def tokenizer_title(docs,min_count=2, tokenizer=None):
    '''
    Convert title/keyword documents into sequences of word indices.

    Args:
        docs: list of text collections, the first element must be the
            training data (the vocabulary is fitted on it only)
        min_count: minimum number of occurrences in the training data for a
            word to be kept in the vocabulary
        tokenizer: an already fitted tokenizer to reuse; when None a new one
            is fitted on ``docs[0]``

    Returns:
        (sequences, tokenizer): one list of index sequences per element of
        ``docs``, and the tokenizer that produced them
    '''
    if tokenizer is None:
        # First pass: count words to find how many reach min_count.
        counter = Tokenizer(lower=True, filters='')
        counter.fit_on_texts(docs[0])
        n_frequent = sum(1 for count in counter.word_counts.values() if count >= min_count)

        # Keras keeps only the `num_words - 1` most common words (index 0 is
        # reserved), so add one; otherwise the least frequent word that
        # still satisfies min_count is silently dropped.
        tokenizer = Tokenizer(num_words = n_frequent + 1, lower=True, filters='')
        tokenizer.fit_on_texts(docs[0])

    return [tokenizer.texts_to_sequences(doc) for doc in docs], tokenizer

In [None]:
def padding(docs, maxlen):
    """Apply ``pad_sequences`` with the given ``maxlen`` to every element of ``docs``.

    Args:
        docs: iterable of sequence collections.
        maxlen: forwarded to ``pad_sequences`` as ``maxlen``.

    Returns:
        list with one ``pad_sequences`` result per element of ``docs``.
    """
    padded = []
    for doc in docs:
        padded.append(pad_sequences(sequences=doc, maxlen=maxlen))
    return padded

## Preprocessing: LSTM for text inspired by BLSTM

In [None]:
# Path to the pre-trained word-vector file read by get_embedding_matrix.
# The default is a machine-specific absolute path; set the EMBEDDING_FILE
# environment variable to run the notebook elsewhere without editing it.
EMBEDDING_FILE = os.environ.get(
    'EMBEDDING_FILE',
    '/Users/thesuguser/Desktop/kaggle/toxic_comment/embeddings/crawl-300d-2M.vec',
)

In [None]:
def get_embedding_matrix(embedding_file, max_features, embeded_size, tokenizer):
    """Build the embedding weight matrix for the tokenizer's vocabulary.

    Args:
        embedding_file: path to a text file with one entry per line,
            ``word v1 v2 ... vN`` separated by single spaces.
        max_features: upper bound on the number of rows; words whose index
            is >= this value are ignored.
        embeded_size: length of every embedding vector (number of columns).
        tokenizer: fitted tokenizer exposing ``word_index`` (word -> index).

    Returns:
        numpy array of shape ``(nb_words, embeded_size)`` where row ``i``
        is the vector of the word with index ``i``. Rows for words missing
        from the file (and row 0) stay all-zero.
    """
    word_index = tokenizer.word_index
    # Keras word indices start at 1, so a vocabulary of N words needs N + 1
    # rows. Without the +1 the word with the highest index overflowed the
    # matrix whenever the vocabulary was smaller than max_features.
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.zeros((nb_words, embeded_size))

    # Only vectors for words that get a row are parsed, so the whole
    # embedding file never has to be held in memory.
    wanted = {word: i for word, i in word_index.items() if i < nb_words}

    with open(embedding_file, encoding='utf-8') as handle:
        for line in handle:
            parts = line.rstrip().split(' ')
            row = wanted.get(parts[0])
            if row is None:
                continue
            # Skip the "<count> <dim>" header of .vec files and malformed
            # lines; a wrong-length vector would otherwise be broadcast
            # across the row or raise.
            if len(parts) - 1 != embeded_size:
                continue
            embedding_matrix[row] = np.asarray(parts[1:], dtype='float32')

    return embedding_matrix

In [None]:
def tokenizer_text(docs, max_features, maxlen, tokenizer=None,):
    '''
    Convert body texts into padded sequences of word indices.

    Args:
        docs: list of text collections, the first element must be the
            training data (the vocabulary is fitted on it only)
        max_features: ``num_words`` given to a newly created tokenizer
        maxlen: ``maxlen`` given to ``pad_sequences``
        tokenizer: an already fitted tokenizer to reuse; when None a new one
            is fitted on ``docs[0]``

    Returns:
        (padded, tokenizer): one ``pad_sequences`` result per element of
        ``docs``, and the tokenizer that produced the sequences
    '''
    # Identity check: a supplied tokenizer must never be mistaken for "missing".
    if tokenizer is None:
        tokenizer = Tokenizer(num_words=max_features)
        tokenizer.fit_on_texts(docs[0])

    padded = [pad_sequences(tokenizer.texts_to_sequences(doc), maxlen = maxlen)
              for doc in docs]

    return padded, tokenizer

## Preprocessing

### Parameters for title and keywords

In [None]:
# Minimum word count, passed to tokenizer_title as min_count.
min_count = 2
# Sequence length for the title/keyword input, passed to padding as maxlen.
maxlen_1 = 51

### Parameters for texts

In [None]:
# Vocabulary cap for the body text (passed to tokenizer_text and get_embedding_matrix).
max_features_2 = 20000
# Length of each pre-trained embedding vector (passed to get_embedding_matrix).
embed_size  = 300
# Sequence length for the body-text input, passed to tokenizer_text as maxlen.
maxlen_2 = 155

In [None]:
# Load the three splits (read_data also prints the number of samples in each).
df_train, df_val, df_test = read_data()

In [None]:
# 1) One "keywords + title" string per row, for each split.
title_train = concat_keyword_title(df_train)
title_val = concat_keyword_title(df_val)
title_test = concat_keyword_title(df_test)

# 2) Tokenise and append word n-grams.
title_train = create_docs(title_train)
title_val = create_docs(title_val)
title_test = create_docs(title_test)

# 3) Map words to indices; the vocabulary is fitted on the training split only
#    (first list element).
[title_train, title_val, title_test], tokenizer_1 = tokenizer_title([title_train, title_val, title_test], min_count=min_count)
# 4) Pad every split to maxlen_1. The third element must be title_test: it was
#    previously title_val, which replaced the test inputs with validation data.
title_train, title_val, title_test = padding([title_train, title_val, title_test], maxlen=maxlen_1)

In [None]:
# Make sure each padded title split is a NumPy array.
title_train, title_val, title_test = (
    np.array(split) for split in (title_train, title_val, title_test)
)

In [None]:
# Vocabulary size for the title/keyword embedding: the largest index present
# in the padded training titles, plus one.
max_features_1 = np.max(title_train) + 1

In [None]:
# Tokenise and pad the article bodies; the tokenizer is fitted on the training
# articles only (first list element) and kept as tokenizer_2.
[text_train, text_val, text_test], tokenizer_2 = tokenizer_text([df_train.article.values,
                                                  df_val.article.values,
                                                  df_test.article.values],
                                                max_features=max_features_2,
                                                maxlen=maxlen_2)

In [None]:
# Pre-trained embedding weights for the body-text vocabulary of tokenizer_2.
# get_model reads this variable as a global.
embedding_matrix = get_embedding_matrix(EMBEDDING_FILE, 
                                        max_features=max_features_2, 
                                        embeded_size=embed_size,
                                        tokenizer=tokenizer_2)

In [None]:
# One-hot encode the `popularity` column of each split.
# NOTE(review): each split is encoded independently, so the column count and
# order only agree if every split contains the same set of popularity values
# -- confirm against the data.
y_train = pd.get_dummies(df_train.popularity).values
y_val = pd.get_dummies(df_val.popularity).values
y_test = pd.get_dummies(df_test.popularity).values

## Model

In [None]:
class RocAucMetricCallback(Callback):
    """Keras callback that stores the validation ROC-AUC in ``logs['roc_auc_val']``.

    The score is computed at the end of every epoch and, optionally, at the
    end of every batch. When no validation data is attached to the callback
    the value ``-inf`` is logged instead.

    The callback reads ``self.validation_data`` as
    ``[text inputs, title inputs, targets, ...]`` and feeds the model through
    its ``text_input`` / ``title_input`` named inputs.
    NOTE(review): this relies on Keras populating ``validation_data`` and
    ``params['metrics']`` on callbacks -- confirm for the installed version.

    Args:
        predict_batch_size: batch size used for ``model.predict``.
        include_on_batch: also compute the score after every batch.
    """

    def __init__(self, predict_batch_size=1024, include_on_batch=False):
        super(RocAucMetricCallback, self).__init__()
        self.predict_batch_size = predict_batch_size
        self.include_on_batch = include_on_batch

    def _validation_roc_auc(self):
        """Return the ROC-AUC on the validation data, or None when there is none."""
        if not self.validation_data:
            return None
        predictions = self.model.predict({'text_input': self.validation_data[0],
                                          'title_input': self.validation_data[1]},
                                         batch_size=self.predict_batch_size)
        return roc_auc_score(self.validation_data[2], predictions)

    def on_batch_begin(self, batch, logs=None):
        pass

    def on_batch_end(self, batch, logs=None):
        if not self.include_on_batch:
            return
        # A fresh dict per call; a shared `{}` default would be mutated below.
        logs = logs if logs is not None else {}
        score = self._validation_roc_auc()
        logs['roc_auc_val'] = float('-inf') if score is None else score

    def on_train_begin(self, logs=None):
        # Register the metric name in the callback params.
        if 'roc_auc_val' not in self.params['metrics']:
            self.params['metrics'].append('roc_auc_val')

    def on_train_end(self, logs=None):
        pass

    def on_epoch_begin(self, epoch, logs=None):
        pass

    def on_epoch_end(self, epoch, logs=None):
        # A fresh dict per call; a shared `{}` default would be mutated below.
        logs = logs if logs is not None else {}
        score = self._validation_roc_auc()
        if score is None:
            logs['roc_auc_val'] = float('-inf')
            return
        logs['roc_auc_val'] = score
        print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))

In [None]:
from keras import backend as K
import tensorflow as tf

def focal_loss(gamma=2., alpha=.25):
    """
    Build a focal loss function for use with ``model.compile``.

    Args:
        gamma: the tunable focusing parameter
        alpha: the balance parameter
    Return:
        a function ``focal_loss_fixed(y_true, y_pred)`` computing the loss
    """
    def focal_loss_fixed(y_true, y_pred):
        """
        Compute the focal loss, summed over all elements.

        Args:
            y_true: the ground truth (entries equal to 1 are positives,
                entries equal to 0 are negatives)
            y_pred: the probabilities of the prediction
        """
        # Keep probabilities strictly inside (0, 1): a saturated prediction
        # would otherwise yield log(0) and an infinite / NaN loss.
        eps = K.epsilon()
        y_pred = K.clip(y_pred, eps, 1. - eps)

        # Probabilities at positive positions; 1 elsewhere so the term vanishes.
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        # Probabilities at negative positions; 0 elsewhere so the term vanishes.
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        positive_term = K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1))
        negative_term = K.sum((1-alpha) * K.pow( pt_0, gamma) * K.log(1. - pt_0))
        return -positive_term - negative_term
    return focal_loss_fixed

In [None]:
def get_model(maxlen_1, maxlen_2, max_features_1, max_features_2, embedding_weights=None):
    """
    Construct and compile the two-input Keras model.

    Args:
        maxlen_1: the maximum length of the title&keywords inputs
        maxlen_2: the maximum length of the body text inputs
        max_features_1: the vocabulary size (embedding input_dim) of the title&keyword inputs
        max_features_2: the vocabulary size (embedding input_dim) of the body text inputs
        embedding_weights: pre-trained weight matrix for the body-text
            embedding. When None, the notebook-level ``embedding_matrix``
            variable is used, as before.
    Returns:
        the compiled keras model (its summary is printed as a side effect)
    """
    if embedding_weights is None:
        # Backward-compatible fallback to the global built earlier in the notebook.
        embedding_weights = embedding_matrix

    text_input = Input(shape=(maxlen_2,), dtype='int32', name='text_input')
    # Embedding layer for body text; the vector size follows the supplied
    # weights instead of a hard-coded 300.
    emb_1 = Embedding(max_features_2,
                      output_dim=embedding_weights.shape[1],
                      weights=[embedding_weights])(text_input)
    x1 = SpatialDropout1D(0.2)(emb_1)
    # Feature extractor
    x1 = Bidirectional(GRU(80, return_sequences=True))(x1)
    avg_pool = GlobalAveragePooling1D()(x1)
    max_pool = GlobalMaxPooling1D()(x1)

    title_input = Input(shape=(maxlen_1,), dtype='int32', name='title_input')
    # Embedding layer for title and keywords
    emb_2 = Embedding(max_features_1, output_dim=20)(title_input)
    # Feature extractor
    x2 = GlobalAveragePooling1D()(emb_2)

    conc = concatenate([x2, avg_pool, max_pool])

    # Classifier
    fc = Dense(64, activation='relu')(conc)
    output = Dense(3, activation='softmax')(fc)

    model = Model(inputs=[text_input, title_input], outputs=output)
    model.summary()

    model.compile(loss=[focal_loss(gamma=2., alpha=0.25)],
                  optimizer = 'adam',
                  metrics = ['accuracy'])

    return model

In [None]:
# get_model has four required arguments; calling it without them raises TypeError.
model = get_model(maxlen_1, maxlen_2, max_features_1, max_features_2)

In [None]:
# TensorBoard logs are written under 'output/'.
tbCallBack = TensorBoard(log_dir='output/', histogram_freq=0, write_graph=True, write_images=True)
# Order matters: RocAucMetricCallback writes logs['roc_auc_val'] at the end of
# each epoch, and the callbacks after it monitor that key.
cb = [
    RocAucMetricCallback(include_on_batch=False), # include it before EarlyStopping!
    EarlyStopping(monitor='roc_auc_val',patience=5, verbose=2,mode='max'),
    tbCallBack,
    # The file name template uses the epoch number and val_loss.
    ModelCheckpoint('weights.{epoch:02d}-{val_loss:.2f}.hdf5', monitor='roc_auc_val', verbose=1)    
]

In [None]:
# Train the model. Training inputs are passed by input name; the validation
# inputs are passed as a list in the order [text, title].
# NOTE(review): the list order is presumably what RocAucMetricCallback relies
# on when it indexes validation_data[0] / [1] -- confirm.
hist = model.fit({'text_input':text_train, 'title_input':title_train}, 
                 y_train,
                 batch_size = 128,
                 validation_data = ([text_val, title_val],y_val),
                 epochs = 100,
                 callbacks = cb,
                 verbose=1)