In [None]:
import datetime, pickle, os, codecs, re, string
import json
import random
import numpy as np
import keras
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from keras.callbacks import *
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import backend as K
from keras.utils import CustomObjectScope
from keras.engine.topology import Layer
from keras.engine import InputSpec

from keras import initializers as initializers, regularizers, constraints

import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from transformers import AutoModel, AutoTokenizer
import string
from spacy.lang.en import English
import gensim, logging

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import en_core_web_sm

from IPython.display import HTML, display

import tensorflow as tf

from numpy.random import seed
from nltk.tokenize import sent_tokenize,word_tokenize

In [None]:
# Notebook-wide configuration read by the cells below.

# Compared against 'from_scratch' (train/load the local FastText model) and
# 'pre_trained' (freeze the embedding layer) further down in this notebook.
word_embedding_type = "from_scratch" 
# NOTE(review): not referenced anywhere else in the visible notebook.
word_vector_model = "fasttext" 
# 'GRU' selects the GRU encoders in HAHNetwork.build_model; any other value
# takes the LSTM branches.
rnn_type = "GRU" 
# Passed to the Adam optimizer in HAHNetwork.build_model.
learning_rate = 0.001
# NOTE(review): `epochs` and `batch_size` are not read by the visible code;
# the training cell passes its own literal values to model.train(...).
epochs = 8
batch_size = 64

**Text Preprocessing**

In [None]:
def clean_str(string):
    """Normalise a piece of raw text for tokenisation.

    Steps, in order:
      1. detach English clitics ('s, 've, n't, 're, 'd, 'll) with a leading space;
      2. surround ``,``, ``!``, ``(``, ``)`` and ``?`` with spaces;
      3. collapse whitespace runs, strip ``<...>`` tags and underscores;
      4. collapse whitespace again (tag/underscore removal can leave gaps),
         strip the ends and lower-case.

    Args:
        string: the text to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # (pattern, replacement) pairs. The replacements are plain text: the
    # previous templates " \( ", " \) " and " \? " inserted a literal
    # backslash before the punctuation mark.
    substitutions = (
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
        (r"<.*?>", ""),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)

    string = string.replace('_', '')
    # Removing tags/underscores may have produced new double spaces.
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

**Use pre-trained word embeddings**

In [None]:
def load_subword_embedding_300d(
        word_index,
        embedding_path="../input/fasttext-english-embeddings/wiki-news-300d-1M-subword.vec",
        embed_dim=300):
    """Build an embedding matrix for `word_index` from a text ``.vec`` file.

    Each line of the file is split on single spaces: the first field is the
    word, the remaining fields are its vector. Lines that do not carry exactly
    `embed_dim` numbers (for example a "<count> <dim>" header line) are
    skipped, so they can never be broadcast into a matrix row.

    Args:
        word_index: mapping word -> row index (row 0 is left as zeros).
        embedding_path: path of the vector file; defaults to the file this
            notebook originally hard-coded.
        embed_dim: number of components per vector (300 by default).

    Returns:
        ``np.ndarray`` of shape ``(len(word_index) + 1, embed_dim)``; rows of
        words missing from the file stay all-zero.
    """
    print('load_subword_embedding...')
    embeddings_index = {}
    # Context manager so the handle is closed even if parsing fails.
    with codecs.open(embedding_path, encoding='utf-8') as f:
        for line in tqdm(f):
            values = line.rstrip().rsplit(' ')
            if len(values) != embed_dim + 1:
                # Header or malformed line: not a usable word vector.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('found %s word vectors' % len(embeddings_index))

    #embedding matrix
    print('preparing embedding matrix...')
    embedding_matrix = np.zeros((len(word_index) + 1, embed_dim))

    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

**Normalize texts**

In [None]:
def normalize(text):
    """Turn one document into a list of cleaned sentences.

    The text is lower-cased and stripped, split with ``sent_tokenize``, and
    every resulting sentence is passed through ``clean_str``.

    Args:
        text: the raw document string.

    Returns:
        List of cleaned sentence strings, in document order.
    """
    lowered = text.lower().strip()
    return [clean_str(sentence) for sentence in sent_tokenize(lowered)]

**Training word embeddings**

In [None]:
def create_fasttext(embed_dim, data):
    """Train and save a gensim FastText model unless one is already on disk.

    Nothing happens when ``./fasttext_model.txt`` exists. Otherwise every
    entry of ``data['excerpt']`` is cleaned with ``clean_str``, split into
    sentences and word-tokenised (keeping alphanumeric tokens only), and a
    FastText model with `embed_dim`-sized vectors is trained on those
    sentences and saved to that path.

    Args:
        embed_dim: vector size passed to FastText.
        data: object indexable by ``'excerpt'`` yielding document strings.
    """
    filename = './fasttext_model.txt'

    # Reuse a previously saved model.
    if os.path.isfile(filename):
        return

    print('create_fasttext...')
    sent_lst = [
        [token for token in word_tokenize(sent) if token.isalnum()]
        for doc in data['excerpt']
        for sent in sent_tokenize(clean_str(doc))
    ]

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    fasttext_model = gensim.models.FastText(
        word_ngrams=1,
        sentences=sent_lst,
        vector_size=embed_dim,
        workers=os.cpu_count(),
        window=1)
    fasttext_model.save(filename)

In [None]:
def load_data_commonlit(data_dir):
    """Load ``train.csv`` / ``test.csv`` from `data_dir` and sentence-split them.

    Every ``excerpt`` is run through ``normalize``; the results are stored in
    the new columns ``train_tokens`` (train frame) and ``test_tokens`` (test
    frame). When the notebook-level ``word_embedding_type`` equals
    ``'from_scratch'`` a FastText model is created from the training frame
    via ``create_fasttext``.

    Args:
        data_dir: directory holding the two CSV files (with or without a
            trailing path separator).

    Returns:
        Tuple ``(X, Y, df_test)`` where ``X`` is the array of per-excerpt
        ``normalize`` outputs, ``Y`` is the ``target`` column reshaped to
        ``(n_samples, 1)`` and ``df_test`` is the test frame including its
        ``test_tokens`` column.
    """
    vector_dim = 200  # vector size handed to create_fasttext

    # os.path.join works whether or not data_dir ends with a separator.
    df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    df_train['train_tokens'] = [normalize(row) for row in tqdm(df_train['excerpt'])]
    df_test['test_tokens'] = [normalize(row) for row in tqdm(df_test['excerpt'])]

    # Value comparison: `is` only tests object identity and is not a reliable
    # way to compare strings.
    if word_embedding_type == 'from_scratch':
        create_fasttext(vector_dim, df_train)

    X = df_train['train_tokens'].values
    # One target per row, as a column vector.
    Y = df_train['target'].values.reshape(-1, 1)
    return (X, Y, df_test)

In [None]:
# X: per-excerpt sentence lists, Y: targets, test: test frame with 'test_tokens'.
X, Y, test = load_data_commonlit("../input/commonlitreadabilityprize/")

**Attention Layer**

In [None]:

def dot_product(x, kernel):
    """
    Backend-aware dot product, usable with both Theano and Tensorflow.
    Args:
        x (): input
        kernel (): weights
    Returns:
        ``K.dot(x, kernel)`` on non-tensorflow backends. On tensorflow the
        kernel first gets an extra trailing axis (``K.expand_dims``) and that
        axis is squeezed out of the product again.
    """
    if K.backend() != 'tensorflow':
        return K.dot(x, kernel)

    projected = K.dot(x, K.expand_dims(kernel))
    return K.squeeze(projected, axis=-1)

class AttentionWithContext(Layer):
    """
    Attention operation, with a context/query vector, for temporal data.
    Supports Masking.
    Follows the work of Yang et al. [https://www.cs.cmu.edu/~diyiy/docs/naacl16.pdf]
    "Hierarchical Attention Networks for Document Classification"
    by using a context vector to assist the attention
    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.
    How to use:
    Just put it on top of an RNN Layer (GRU/LSTM/SimpleRNN) with return_sequences=True.
    The dimensions are inferred based on the output shape of the RNN.
    Note: The layer has been tested with Keras 2.0.6
    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(AttentionWithContext())
        # next add a Dense layer (for classification/regression) or whatever...
    """
    def __init__(self,
                 W_regularizer=None, u_regularizer=None, b_regularizer=None,
                 W_constraint=None, u_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the regularizers/constraints for W, u and b and the bias flag.

        Remaining keyword arguments are forwarded to ``Layer.__init__``.
        """
        super(AttentionWithContext, self).__init__(**kwargs)
        # Set after the base initialiser so it cannot be reset by it.
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.u_regularizer = regularizers.get(u_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.u_constraint = constraints.get(u_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias

    def build(self, input_shape):
        """Create W (features x features), optional b and the context vector u."""
        assert len(input_shape) == 3

        self.W = self.add_weight(shape=(input_shape[-1], input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            self.b = self.add_weight(shape=(input_shape[-1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)

        self.u = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_u'.format(self.name),
                                 regularizer=self.u_regularizer,
                                 constraint=self.u_constraint)

        super(AttentionWithContext, self).build(input_shape)

    def compute_mask(self, input, input_mask=None):
        # do not pass the mask to the next layers
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        uit = dot_product(x, self.W)

        if self.bias:
            uit += self.b

        uit = K.tanh(uit)
        ait = dot_product(uit, self.u)

        a = K.exp(ait)

        # apply mask after the exp. will be re-normalized next
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in theano
            a *= K.cast(mask, K.floatx())

        # in some cases especially in the early stages of training the sum may be almost zero
        # and this results in NaN's. A workaround is to add a very small positive number ε to the sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the first and last input dimensions."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created
        when a model containing it is saved and loaded."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'u_regularizer': regularizers.serialize(self.u_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'u_constraint': constraints.serialize(self.u_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(AttentionWithContext, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

**Model architecture**

In [None]:
class HAHNetwork():
    """Hierarchical attention regressor over documents given as sentence lists.

    A word-level model (embedding -> Conv1D/max-pool branches -> bidirectional
    RNN -> attention) encodes each sentence; a sentence-level bidirectional
    RNN with attention combines the sentence encodings into one prediction.
    The notebook-level settings ``word_embedding_type``, ``rnn_type`` and
    ``learning_rate`` are read when the model is built.
    """

    def __init__(self):
        self.model = None
        self.MAX_SENTENCE_LENGTH = 0
        self.MAX_SENTENCE_COUNT = 0
        self.VOCABULARY_SIZE = 0
        self.word_embedding = None
        self.word_attention_model = None
        self.tokenizer = None
        # Filled by create_reverse_word_index(): index -> word.
        self.reverse_word_index = {}
        self.class_count = 1

    def build_model(self, n_classes=1, embedding_dim=200, embeddings_path=False):
        """Build and compile the hierarchical model.

        Requires ``fit_on_texts`` to have been called (uses the tokenizer and
        the MAX_* sizes).

        Args:
            n_classes: number of output units of the final linear layer.
            embedding_dim: embedding size; replaced by 300 when the 300-d
                pre-trained vectors are loaded.
            embeddings_path: truthy -> initialise the embedding layer from
                FastText vectors (the locally trained model when
                ``word_embedding_type == 'from_scratch'``, otherwise the
                300-d file); falsy -> random normal initialisation.

        Returns:
            The compiled Keras model (MSE loss, Adam optimizer).
        """
        l2_reg = regularizers.l2(0.001)

        vocab_rows = len(self.tokenizer.word_index) + 1
        embedding_weights = np.random.normal(0, 1, (vocab_rows, embedding_dim))

        # Truthiness test: the previous `is not None` check was also true for
        # the default value False, so the flag could never disable loading.
        if embeddings_path:
            if word_embedding_type == 'from_scratch':
                # FastText model written by create_fasttext
                fasttext_model = gensim.models.FastText.load('./fasttext_model.txt')
                embeddings_index = fasttext_model.wv
                embedding_matrix = np.zeros((vocab_rows, embedding_dim))
                for word, i in self.tokenizer.word_index.items():
                    try:
                        embedding_matrix[i] = embeddings_index[word]
                    except KeyError:
                        # No vector available for this word: keep the zero row.
                        continue
            else:
                embedding_dim = 300
                embedding_matrix = load_subword_embedding_300d(self.tokenizer.word_index)
            embedding_weights = embedding_matrix

        sentence_in = Input(shape=(self.MAX_SENTENCE_LENGTH,), dtype='float32', name="input_1")

        # Freeze the embeddings only when pre-trained vectors were really loaded.
        embedding_trainable = not (embeddings_path and word_embedding_type == 'pre_trained')

        embedded_word_seq = Embedding(
            self.VOCABULARY_SIZE,
            embedding_dim,
            weights=[embedding_weights],
            input_length=self.MAX_SENTENCE_LENGTH,
            trainable=embedding_trainable,
            mask_zero=False,
            name='word_embeddings',)(sentence_in)

        embedded_dropout = Dropout(0.2)(embedded_word_seq)
        filter_sizes = [3, 4, 5]
        convs = []
        for filter_size in filter_sizes:
            conv = Conv1D(filters=64, kernel_size=filter_size, padding='same', activation='relu')(embedded_dropout)
            pool = MaxPool1D(filter_size)(conv)
            convs.append(pool)

        # The pooled branches are joined along axis 1.
        concatenate = Concatenate(axis=1)(convs)

        if rnn_type == 'GRU':
            word_dropout = Dropout(0.1)(concatenate)
            word_encoder = Bidirectional(GRU(100, return_sequences=True))(word_dropout)
        else:
            # NOTE(review): this branch reads the raw embeddings and bypasses
            # the convolution stack used by the GRU branch - confirm intent.
            word_encoder = Bidirectional(LSTM(100, return_sequences=True, dropout=0.2))(embedded_word_seq)
        dense_transform_word = Dense(
            100,
            activation='relu',
            name='dense_transform_word',
            kernel_regularizer=l2_reg)(word_encoder)

        # word attention
        attention_weighted_sentence = Model(
            sentence_in,
            AttentionWithContext(name="word_attention")(dense_transform_word))
        self.word_attention_model = attention_weighted_sentence
        attention_weighted_sentence.summary()

        # sentence-attention-weighted document scores
        texts_in = Input(shape=(self.MAX_SENTENCE_COUNT, self.MAX_SENTENCE_LENGTH), dtype='float32', name="input_2")
        attention_weighted_sentences = TimeDistributed(attention_weighted_sentence)(texts_in)

        if rnn_type == 'GRU':
            sentence_dropout = Dropout(0.1)(attention_weighted_sentences)
            sentence_encoder = Bidirectional(GRU(100, return_sequences=True))(sentence_dropout)
        else:
            sentence_encoder = Bidirectional(LSTM(100, return_sequences=True, dropout=0.1, recurrent_dropout=0.2))(attention_weighted_sentences)

        dense_transform_sentence = Dense(
            100,
            activation='relu',
            name='dense_transform_sentence',
            kernel_regularizer=l2_reg)(sentence_encoder)

        # sentence attention
        attention_weighted_text = AttentionWithContext(name="sentence_attention")(dense_transform_sentence)

        prediction = Dense(n_classes, activation='linear')(attention_weighted_text)

        document_model = Model(texts_in, prediction)
        document_model.summary()
        optimizer = Adam(lr=learning_rate, decay=0.0001)

        # Regression with a linear output: report mean absolute error instead
        # of classification accuracy.
        document_model.compile(
            optimizer=optimizer,
            loss='mse',
            metrics=['mae'])

        return document_model

    def get_tokenizer_filename(self, saved_model_filename):
        """Return the tokenizer-state file name that belongs to a model file name."""
        return saved_model_filename + '.tokenizer'

    def fit_on_texts(self, texts):
        """Fit the tokenizer and derive the input sizes from `texts`.

        Args:
            texts: iterable of documents, each a sequence of sentence strings.

        Sets ``tokenizer``, ``VOCABULARY_SIZE``, ``reverse_word_index`` and
        ``MAX_SENTENCE_COUNT`` / ``MAX_SENTENCE_LENGTH`` (capped at 30
        sentences and 75 tokens).
        """
        self.tokenizer = Tokenizer(filters='"()*,-/;[\\]^_`{|}~', oov_token='UNK')
        all_sentences = []
        max_sentence_count = 0
        for text in texts:
            max_sentence_count = max(max_sentence_count, len(text))
            all_sentences.extend(text)

        self.tokenizer.fit_on_texts(all_sentences)

        # Sentence length is measured in tokens, the unit pad_sequences
        # works in, not in characters.
        max_sentence_length = max(
            (len(seq) for seq in self.tokenizer.texts_to_sequences(all_sentences)),
            default=0)

        self.MAX_SENTENCE_COUNT = min(max_sentence_count, 30)
        self.MAX_SENTENCE_LENGTH = min(max_sentence_length, 75)

        self.VOCABULARY_SIZE = len(self.tokenizer.word_index) + 1
        self.create_reverse_word_index()

    def create_reverse_word_index(self):
        """Build ``reverse_word_index`` (index -> word) from the tokenizer."""
        self.reverse_word_index = {value: key for key, value in self.tokenizer.word_index.items()}

    def encode_texts(self, texts):
        """Encode documents into a zero-padded array.

        Args:
            texts: sequence of documents, each a sequence of sentence strings.

        Returns:
            Array of shape ``(len(texts), MAX_SENTENCE_COUNT,
            MAX_SENTENCE_LENGTH)``. Each document keeps at most its first
            MAX_SENTENCE_COUNT sentences, written into the last rows of its
            slot; a document without sentences stays all-zero.
        """
        encoded_texts = np.zeros((len(texts), self.MAX_SENTENCE_COUNT, self.MAX_SENTENCE_LENGTH))
        for i, text in enumerate(texts):
            encoded_text = np.array(pad_sequences(
                self.tokenizer.texts_to_sequences(text),
                maxlen=self.MAX_SENTENCE_LENGTH))[:self.MAX_SENTENCE_COUNT]
            if len(encoded_text) == 0:
                # `[-0:]` would select the whole slot and fail to broadcast.
                continue
            encoded_texts[i][-len(encoded_text):] = encoded_text
        return encoded_texts

    def save_tokenizer_on_epoch_end(self, path, epoch):
        """Pickle the tokenizer and the size settings to `path` after epoch 0."""
        if epoch == 0:
            tokenizer_state = {
                'tokenizer': self.tokenizer,
                'maxSentenceCount': self.MAX_SENTENCE_COUNT,
                'maxSentenceLength': self.MAX_SENTENCE_LENGTH,
                'vocabularySize': self.VOCABULARY_SIZE
            }
            with open(path, "wb") as handle:
                pickle.dump(tokenizer_state, handle)

    def train(self, train_x, train_y,
              batch_size=16,
              epochs=1,
              embedding_dim=200,
              embeddings_path=False,
              saved_model_dir='saved_models',
              saved_model_filename=None,):
        """Fit the tokenizer, build the model and train it.

        Args:
            train_x: documents as sequences of sentence strings.
            train_y: regression targets.
            batch_size, epochs: forwarded to ``Model.fit``.
            embedding_dim: forwarded to ``build_model``.
            embeddings_path: forwarded to ``build_model`` (truthy -> load
                FastText vectors).
            saved_model_dir: directory for checkpoints; created if missing.
            saved_model_filename: when given, the tokenizer state and the
                best model (lowest ``val_loss``) are written to
                `saved_model_dir`; when None nothing is saved.

        Returns:
            The Keras ``History`` object returned by ``Model.fit``.
        """
        self.fit_on_texts(train_x)
        self.model = self.build_model(
            n_classes=1,
            embedding_dim=embedding_dim,
            embeddings_path=embeddings_path)
        encoded_train_x = self.encode_texts(train_x)

        callbacks = [ReduceLROnPlateau()]

        # Saving callbacks only make sense (and only work) with a file name.
        if saved_model_filename:
            target_dir = saved_model_dir or '.'
            os.makedirs(target_dir, exist_ok=True)
            tokenizer_path = os.path.join(
                target_dir, self.get_tokenizer_filename(saved_model_filename))
            callbacks.append(
                LambdaCallback(
                    on_epoch_end=lambda epoch, logs: self.save_tokenizer_on_epoch_end(
                        tokenizer_path, epoch)))
            callbacks.append(
                ModelCheckpoint(
                    filepath=os.path.join(target_dir, saved_model_filename),
                    monitor='val_loss',
                    save_best_only=True,
                    save_weights_only=False,
                )
            )

        # The callbacks were previously built but never handed to fit().
        history = self.model.fit(
            x=encoded_train_x,
            y=train_y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_split=0.1,
            shuffle=True,
            callbacks=callbacks)
        return history

    def predict(self, x):
        """Encode already sentence-split documents and return model predictions."""
        encoded_x = self.encode_texts(x)
        return self.model.predict(encoded_x)

    def encode_input(self, x, log=False):
        """Normalize raw text(s) and encode them for the model.

        Args:
            x: a single string or a sequence of strings.
            log: unused; kept for interface compatibility.

        Returns:
            The array produced by ``encode_texts``.
        """
        x = np.array(x)
        if not x.shape:
            x = np.expand_dims(x, 0)
        # Plain list: documents have different sentence counts, which cannot
        # be stored in a regular ndarray.
        texts = [normalize(text) for text in x]
        return self.encode_texts(texts)

In [None]:
# Reset the Keras backend session before building a new model.
K.clear_session()

model = HAHNetwork()
# NOTE(review): batch size and epoch count are given literally here; the
# `batch_size` / `epochs` values from the configuration cell are not used.
model.train(
    X,
    Y,
    batch_size=128,
    epochs=20,
    embeddings_path=True,
    saved_model_dir=None,
    saved_model_filename=None,
)

In [None]:
# Predict from the sentence-split test excerpts.
preds = model.predict(x=test['test_tokens'].values)

In [None]:
# Attach the predictions to the test frame, then write the id/target pairs.
test['prediction'] = preds
submission = pd.DataFrame({
    'id': test['id'].copy(),
    'target': test['prediction'].copy(),
})
submission.to_csv('submission.csv', index=False)
submission.head()