In [None]:
import torch
import torch.nn as nn

class MLPAttention(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(MLPAttention, self).__init__()

        self.query_projection = nn.Linear(input_dim, hidden_dim)
        self.key_projection = nn.Linear(input_dim, hidden_dim)
        self.value_projection = nn.Linear(input_dim, hidden_dim)
        self.output_projection = nn.Linear(hidden_dim, output_dim)

    def forward(self, query, keys, values):
        query_projection = self.query_projection(query)
        key_projection = self.key_projection(keys)
        value_projection = self.value_projection(values)

        attention_weights = torch.matmul(query_projection, key_projection.transpose(-1, -2))
        attention_weights = torch.softmax(attention_weights, dim=-1)

        attention_output = torch.matmul(attention_weights, value_projection)
        attention_output = self.output_projection(attention_output)

        return attention_output

In [2]:
from keras import backend as K
from keras import initializers, regularizers, constraints
from keras.layers import Layer


class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """
        Keras Layer that implements an Attention mechanism for temporal data.
        Supports Masking.
        Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756]
        # Input shape
            3D tensor with shape: `(samples, steps, features)`.
        # Output shape
            2D tensor with shape: `(samples, features)`.
        :param kwargs:
        Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
        The dimensions are inferred based on the output shape of the RNN.
        Example:
            # 1
            model.add(LSTM(64, return_sequences=True))
            model.add(Attention())
            # next add a Dense layer (for classification/regression) or whatever...
            # 2
            hidden = LSTM(64, return_sequences=True)(words)
            sentence = Attention()(hidden)
            # next add a Dense layer (for classification/regression) or whatever...
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')
        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0

        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight(name='{}_W'.format(self.name),
                                 shape=(input_shape[-1],),
                                 initializer=self.init,
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(name='{}_b'.format(self.name),
                                     shape=(input_shape[1],),
                                     initializer='zero',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None
        
        self.built = True

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        e = K.reshape(K.dot(K.reshape(x, (-1, features_dim)), K.reshape(self.W, (features_dim, 1))), (-1, step_dim))  # e = K.dot(x, self.W)
        if self.bias:
            e += self.b
        e = K.tanh(e)

        a = K.exp(e)
        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())
        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())
        a = K.expand_dims(a)

        c = K.sum(a * x, axis=1)
        return c
    
    def compute_output_shape(self, input_shape):
        return input_shape[0], self.features_dim

In [4]:
from keras import Input, Model
from keras.layers import Embedding, Dense, Bidirectional, TimeDistributed
# from keras.layers import CuDNNGRU
from keras.layers import GRU
import keras

class HCAN(Model):
    def __init__(self,
                 maxlen_sentence,
                 maxlen_word,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super(HCAN, self).__init__()
        self.maxlen_sentence = maxlen_sentence
        self.maxlen_word = maxlen_word
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        # Word part
        input_word = Input(shape=(self.maxlen_word,))
        x_word = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen_word)(input_word)
        x_word = keras.layers.Convolution1D(100, 10, activation="relu", padding = 'same')(x_word)
        # x_word = layers.GlobalMaxPool1D()(x_word)

        x_word = Bidirectional(GRU(128, return_sequences=True))(x_word)  # LSTM or GRU
        x_word = Attention(self.maxlen_word)(x_word)
        model_word = Model(input_word, x_word)
        # Sentence part
        self.word_encoder_att = TimeDistributed(model_word)
        self.sentence_encoder = Bidirectional(GRU(128, return_sequences=True))  # LSTM or GRU
        self.sentence_att = Attention(self.maxlen_sentence)
        # Output part
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
            if len(inputs.get_shape()) != 3:
                raise ValueError('The rank of inputs of HAN must be 3, but now is %d' % len(inputs.get_shape()))
            if inputs.get_shape()[1] != self.maxlen_sentence:
                raise ValueError('The maxlen_sentence of inputs of HAN must be %d, but now is %d' % (self.maxlen_sentence, inputs.get_shape()[1]))
            if inputs.get_shape()[2] != self.maxlen_word:
                raise ValueError('The maxlen_word of inputs of HAN must be %d, but now is %d' % (self.maxlen_word, inputs.get_shape()[2]))
            x_sentence = self.word_encoder_att(inputs)
            x_sentence = self.sentence_encoder(x_sentence)
            x_sentence = self.sentence_att(x_sentence)
            output = self.classifier(x_sentence)
            return output

In [6]:
import pandas as pd
df = pd.read_csv("./data.csv")

df.dropna(subset=['Body'], inplace=True)
df.reset_index(drop=True, inplace=True)

df['Headline'] = df['Headline'].str.lower()
df['Body'] = df['Body'].str.lower()
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,four ways bob corker skewered donald trump,image copyright getty images\non sunday mornin...,1
1,https://www.reuters.com/article/us-filmfestiva...,linklater's war veteran comedy speaks to moder...,"london (reuters) - “last flag flying”, a comed...",1
2,https://www.nytimes.com/2017/10/09/us/politics...,trump’s fight with corker jeopardizes his legi...,the feud broke into public view last week when...,1
3,https://www.reuters.com/article/us-mexico-oil-...,egypt's cheiron wins tie-up with pemex for mex...,mexico city (reuters) - egypt’s cheiron holdin...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean opens 'snl' with vegas tribute,"country singer jason aldean, who was performin...",1


In [7]:
## Text Preprocessing
import string
def remove_punctuations(text):
    punctuations = set(string.punctuation)
    text = str(text)
    # return text.translate(str.maketrans('', '', punctuations))
    return " ".join([word for word in text.split() if word not in punctuations])

df['Headline'] = df['Headline'].apply(lambda x: remove_punctuations(x))
df['Body'] = df['Body'].apply(lambda x: remove_punctuations(x))

from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in STOPWORDS])

df['Headline'] = df['Headline'].apply(lambda x: remove_stopwords(x))
df['Body'] = df['Body'].apply(lambda x: remove_stopwords(x))

import re
def remove_spl_chars(text):
    text = re.sub('[^a-zA-Z0-9]', ' ', text)
    text = re.sub('\s+', ' ', text)
    return text

df['Headline'] = df['Headline'].apply(lambda x: remove_spl_chars(x))
df['Body'] = df['Body'].apply(lambda x: remove_spl_chars(x))


from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer(language='english')
def stem(text):
    stemmed_sentence = " ".join(stemmer.stem(word) for word in text.split())
    return stemmed_sentence

df['Headline'] = df['Headline'].apply(lambda x: stem(x))
df['Body'] = df['Body'].apply(lambda x: stem(x))


def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '', text)

df['Headline'] = df['Headline'].apply(lambda x: remove_url(x))
df['Body'] = df['Body'].apply(lambda x: remove_url(x))

In [9]:
labels = []
for i in range(3988):
    labels.append(df['Label'][i])
len(labels)

3988

In [10]:
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from nltk import tokenize
paras = []
texts = []
sent_lens = []
sent_nums = []
for idx in range(df.Body.shape[0]):
    text = df.Body[idx]
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    sent_nums.append(len(sentences))
    for sent in sentences:
        sent_lens.append(len(text_to_word_sequence(sent)))
    paras.append(sentences)

In [15]:
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from keras.datasets import imdb
from keras.preprocessing import sequence
import pandas as pd
import numpy
from keras.preprocessing import text, sequence
from sklearn import metrics

In [11]:
max_features = 200000
maxlen_sentence = 100
maxlen_word = 25
batch_size = 32
embedding_dims = 100

In [None]:
max_words = 10000
max_seq_length = 1000
# the percentage of train test split to be applied
validation_split = 0.2
# the dimension of vectors to be used
embedding_dim = 100
# filter sizes of the different conv layers 
filter_sizes = [3,4,5]
num_filters = 512
embedding_dim = 100
# dropout probability
drop = 0.5
batch_size = 30
epochs = 2
max_features = 200000
max_senten_len=100
max_senten_num=6

In [13]:
embeddings_index = {}
for i, line in enumerate(open('./glove.6B.100d.txt', encoding='utf8')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [16]:
token = text.Tokenizer()
token.fit_on_texts(df['Body'])
word_index = token.word_index

In [19]:
len(word_index)

32035

In [20]:
df.head()

Unnamed: 0,URLs,Headline,Body,Label
0,http://www.bbc.com/news/world-us-canada-414191...,four way bob corker skewer donald trump,imag copyright getti imag sunday morn donald t...,1
1,https://www.reuters.com/article/us-filmfestiva...,linklat s war veteran comedi speak modern amer...,london reuter last flag fli comedi drama vietn...,1
2,https://www.nytimes.com/2017/10/09/us/politics...,trump s fight corker jeopard legisl agenda,feud broke public view last week mr corker sai...,1
3,https://www.reuters.com/article/us-mexico-oil-...,egypt s cheiron win tie up pemex mexican onsho...,mexico citi reuter egypt s cheiron hold limit ...,1
4,http://www.cnn.com/videos/cnnmoney/2017/10/08/...,jason aldean open snl vega tribut,countri singer jason aldean perform las vega s...,1


In [21]:
from sklearn.model_selection import train_test_split
y_df = df['Label'].values
x_df = df.drop(columns=['Label', 'Headline', 'URLs'], axis=1)
x_df.head()

Unnamed: 0,Body
0,imag copyright getti imag sunday morn donald t...
1,london reuter last flag fli comedi drama vietn...
2,feud broke public view last week mr corker sai...
3,mexico citi reuter egypt s cheiron hold limit ...
4,countri singer jason aldean perform las vega s...


In [79]:
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=1)
x_train.shape

(3190, 1)

In [74]:
print(len(x_train))
len(x_test)

3190


798

In [80]:
x_train.Body.shape[0]

3190

In [102]:
X_train = []
for text in x_train.Body:
    X_train.append(text)

X_test = []
for text in x_test.Body:
    X_test.append(text)

In [103]:

X_train = sequence.pad_sequences(token.texts_to_sequences(X_train),  maxlen=2500)

X_test = sequence.pad_sequences(token.texts_to_sequences(X_test), maxlen=2500)

In [104]:
len(X_train)

3190

In [105]:
len(X_train[0])

2500

In [99]:
embedding_matrix = numpy.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [106]:
X_train = X_train.reshape((len(X_train), maxlen_sentence, maxlen_word))
X_test = X_test.reshape((len(X_test), maxlen_sentence, maxlen_word))
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)

x_train shape: (3190, 100, 25)
x_test shape: (798, 100, 25)


In [107]:
from keras.callbacks import ModelCheckpoint
epochs = 1
print('Build model...')
model = HCAN(maxlen_sentence, maxlen_word, max_features, embedding_dims)
model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(X_test, y_test))

print('Test...')
result = model.predict(X_test)
     

Build model...
Train...
Test...


In [None]:
result

In [109]:
result = (result.reshape(1,-1)[0]>0.5).astype(int)

In [110]:
from sklearn import metrics
print("ACC", metrics.accuracy_score(result, y_test))
print("Prec", metrics.precision_score(result, y_test))
print("REC", metrics.recall_score(result, y_test))
print("F1", metrics.f1_score(result, y_test))

ACC 0.9273182957393483
Prec 0.8753246753246753
REC 0.9711815561959655
F1 0.9207650273224044
