In [41]:
import pandas as pd 
import numpy as np 
import time 
import gc 
import random

In [38]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

from keras.models import Model
from keras.layers import Input, Dropout, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D, BatchNormalization, Conv1D, MaxPooling1D, Flatten
from keras.layers import Reshape
from keras.layers import CuDNNGRU
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
from keras import backend as K
from keras.models import Model

from keras import optimizers

from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

import warnings
warnings.filterwarnings('ignore')

import os
os.environ['OMP_NUM_THREADS'] = '4'

import threading
import multiprocessing
from multiprocessing import Pool, cpu_count
from contextlib import closing
cores = 4

from keras import backend as K
from keras.optimizers import RMSprop, Adam, Nadam
from keras.callbacks import ModelCheckpoint, EarlyStopping


In [63]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [26]:
from __future__ import absolute_import

import tensorflow as tf
from keras import backend as K
from keras import initializers
from keras import constraints
from keras import regularizers
from keras.engine import InputSpec, Layer

class Attention(Layer):
    """Keras layer implementing an attention mechanism for temporal data.

    Follows the work of Raffel et al. [https://arxiv.org/abs/1512.08756].
    Each timestep gets a scalar score ``tanh(x_t . W + b_t)``; the scores are
    exponentiated, normalised over the time axis and used to compute a
    weighted sum of the timesteps.

    # Input shape
        3D tensor with shape: `(samples, steps, features)`.
    # Output shape
        2D tensor with shape: `(samples, features)`.

    Just put it on top of an RNN layer (GRU/LSTM/SimpleRNN) with
    ``return_sequences=True``. The dimensions are inferred from the output
    shape of the RNN.

    Note: the original author reports the layer as tested with Keras 2.0.6.

    Example:
        model.add(LSTM(64, return_sequences=True))
        model.add(Attention())
        # next add a Dense layer (for classification/regression) or whatever...
    """

    def __init__(self,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Configure the layer.

        :param W_regularizer: regularizer identifier for the score weights.
        :param b_regularizer: regularizer identifier for the bias.
        :param W_constraint: constraint identifier for the score weights.
        :param b_constraint: constraint identifier for the bias.
        :param bias: whether to add a per-timestep bias to the scores.
        :param kwargs: forwarded to ``Layer.__init__``.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the score vector ``W`` and, optionally, the bias ``b``."""
        assert len(input_shape) == 3

        # Pass the shape by keyword (as AttentionWeightedAverage does) so the
        # call does not depend on Keras' legacy positional-argument handling.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        if self.bias:
            # One bias value per timestep: the number of steps (input_shape[1])
            # therefore has to be known when the layer is built.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return ``None``: the mask is not passed on to the next layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the time axis."""
        # Scalar score per timestep: (samples, steps, features) . (features, 1)
        eij = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1)

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Apply the mask after the exp; the weights are re-normalised next.
        if mask is not None:
            # Cast the mask to floatX to avoid float64 upcasting in Theano.
            a *= K.cast(mask, K.floatx())

        # In some cases, especially early in training, the sum may be almost
        # zero, which results in NaNs. Adding a very small positive number
        # (epsilon) to the sum works around that.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Map `(samples, steps, features)` to `(samples, features)`."""
        return input_shape[0], input_shape[-1]

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        }
        base_config = super(Attention, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))
    
class AttentionWeightedAverage(Layer):
    """
    Computes a weighted average of the different channels across timesteps.
    Uses 1 parameter pr. channel to compute the attention value for a single timestep.

    # Input shape
        3D tensor; the code indexes it as `(samples, steps, channels)`.
    # Output shape
        `(samples, channels)`, or, when ``return_attention=True``, a list
        ``[(samples, channels), (samples, steps)]`` whose second element holds
        the attention weights.
    """

    def __init__(self, return_attention=False, **kwargs):
        """Configure the layer.

        :param return_attention: if true, ``call`` also returns the attention
            weights alongside the weighted average.
        :param kwargs: forwarded to ``Layer.__init__``.
        """
        self.init = initializers.get('uniform')
        self.supports_masking = True
        self.return_attention = return_attention
        super(AttentionWeightedAverage, self).__init__(** kwargs)

    def build(self, input_shape):
        """Create the `(channels, 1)` projection used to score each timestep."""
        self.input_spec = [InputSpec(ndim=3)]
        assert len(input_shape) == 3

        # add_weight() already registers W as a trainable weight, so it is not
        # assigned to `self.trainable_weights` by hand (that attribute is a
        # read-only property on some Keras versions and the assignment fails).
        self.W = self.add_weight(shape=(input_shape[2], 1),
                                 name='{}_W'.format(self.name),
                                 initializer=self.init)
        super(AttentionWeightedAverage, self).build(input_shape)

    def call(self, x, mask=None):
        """Return the attention-weighted average of ``x`` over the timesteps."""
        # computes a probability distribution over the timesteps
        # uses 'max trick' for numerical stability
        # reshape is done to avoid issue with Tensorflow
        # and 1-dimensional weights
        logits = K.dot(x, self.W)
        x_shape = K.shape(x)
        logits = K.reshape(logits, (x_shape[0], x_shape[1]))
        ai = K.exp(logits - K.max(logits, axis=-1, keepdims=True))

        # masked timesteps have zero weight
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            ai = ai * mask
        # epsilon guards against division by zero when every weight is ~0
        att_weights = ai / (K.sum(ai, axis=1, keepdims=True) + K.epsilon())
        weighted_input = x * K.expand_dims(att_weights)
        result = K.sum(weighted_input, axis=1)
        if self.return_attention:
            return [result, att_weights]
        return result

    def get_output_shape_for(self, input_shape):
        """Alias that delegates to ``compute_output_shape``."""
        return self.compute_output_shape(input_shape)

    def compute_output_shape(self, input_shape):
        """Return the output shape(s) matching what ``call`` returns."""
        output_len = input_shape[2]
        if self.return_attention:
            return [(input_shape[0], output_len), (input_shape[0], input_shape[1])]
        return (input_shape[0], output_len)

    def compute_mask(self, input, input_mask=None):
        """Drop the mask: return ``None`` (one ``None`` per entry for a list of masks)."""
        if isinstance(input_mask, list):
            return [None] * len(input_mask)
        else:
            return None

    def get_config(self):
        """Return the constructor arguments so a saved model can rebuild the layer."""
        config = {'return_attention': self.return_attention}
        base_config = super(AttentionWeightedAverage, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [40]:
def generate_data(X, y, batch_size=64):
    """Yield shuffled mini-batches covering ``X`` and ``y`` exactly once.

    The sample indices are shuffled, then cut into consecutive chunks of
    ``batch_size``. When the number of samples is not a multiple of
    ``batch_size`` the last batch is smaller; an empty ``X`` yields nothing.

    :param X: array of samples, indexable by an integer index array along
        axis 0 (e.g. a ``numpy.ndarray``).
    :param y: targets aligned with ``X`` along axis 0.
    :param batch_size: maximum number of samples per batch.
    :return: generator of ``{'x': ..., 'y': ...}`` dicts.
    :raises ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Take the indices and shuffle them. `idx` is one-dimensional, so the
    # swap-based random.shuffle permutes it correctly in place.
    idx = np.arange(X.shape[0])
    random.shuffle(idx)

    # Sample by batch size; the final slice is simply shorter when the data
    # does not divide evenly.
    for start in range(0, idx.shape[0], batch_size):
        batch_idx = idx[start:start + batch_size]
        yield {
            'x': X[batch_idx],
            'y': y[batch_idx],
        }

In [54]:
# Length of each embedding vector.
EMBEDDING_DIM = 127
# Number of features per sample; also the number of rows in EMBEDDING_MATRIX.
FEATURE_COUNT = 40
# Initial embedding weights, one row per feature, drawn uniformly from [0, 1).
EMBEDDING_MATRIX = np.random.uniform(
    low=0.0,
    high=1.0,
    size=(FEATURE_COUNT, EMBEDDING_DIM),
)

In [59]:
def create_model():
    """Build the two-input GRU model and return it uncompiled.

    Inputs are ``series_id`` and ``series_values``, each of length
    ``FEATURE_COUNT``. The ids are embedded, the values are appended as one
    extra channel per position, and the resulting sequence is fed to a GRU.
    Four poolings of the GRU output (mean, max and the two attention layers)
    are concatenated and passed through a dense head with a single linear
    output. The caller is responsible for calling ``compile``.
    """
    id_input = Input(shape=[FEATURE_COUNT], name="series_id")
    value_input = Input(shape=[FEATURE_COUNT], name="series_values")

    # Give the raw values a trailing channel axis so they can be concatenated
    # with the embedded ids.
    values_3d = Reshape([FEATURE_COUNT, 1])(value_input)

    # Random initial weights for now; some kind of decomposition could be
    # used instead.
    id_embedding = Embedding(
        FEATURE_COUNT,
        EMBEDDING_DIM,
        weights=[EMBEDDING_MATRIX],
        trainable=True,
    )(id_input)
    id_embedding = SpatialDropout1D(0.1)(id_embedding)

    sequence = concatenate([id_embedding, values_3d])

    gru_states = CuDNNGRU(64, return_sequences=True)(sequence)

    # Convolutions could be considered here beforehand.
    pooled_weighted_attention = AttentionWeightedAverage()(gru_states)
    pooled_mean = GlobalAveragePooling1D()(gru_states)
    pooled_max = GlobalMaxPooling1D()(gru_states)
    pooled_attention = Attention()(gru_states)

    pooled = [
        pooled_mean,
        pooled_max,
        pooled_attention,
        pooled_weighted_attention,
    ]
    head = concatenate(pooled)

    head = BatchNormalization()(head)
    head = Dropout(0.5)(Dense(128, activation='relu')(head))
    head = BatchNormalization()(head)
    prediction = Dense(1, activation=None)(head)

    return Model([id_input, value_input], prediction)

In [60]:
# Build the network with create_model() and keep the instance in `m`.
m = create_model()

In [62]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
m.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
series_id (InputLayer)          (None, 40)           0                                            
__________________________________________________________________________________________________
embedding_7 (Embedding)         (None, 40, 127)      5080        series_id[0][0]                  
__________________________________________________________________________________________________
series_values (InputLayer)      (None, 40)           0                                            
__________________________________________________________________________________________________
spatial_dropout1d_3 (SpatialDro (None, 40, 127)      0           embedding_7[0][0]                
__________________________________________________________________________________________________
reshape_6 

In [None]:
# Per-fold training / prediction fragment: sets up callbacks, compiles and fits
# `model` on `trn_seq`, reloads the checkpoint written to `model_file`,
# predicts on `val_seq`, records MSE/RMSE and accumulates test predictions.
# NOTE(review): this cell relies on many names that are not defined in the
# visible part of the file (model, model_file, trn_seq, val_seq, y, trn_inx,
# val_inx, pred, test_pred, epochs, cpu_cores, ifold, n_folds, fold_mse,
# fold_rmse, INF_BATCH_SIZE, split_inputs, the test_* arrays, CyclicLR, and
# the modules math, keras and metrics) -- confirm they are provided elsewhere.

# Keep only the best full model (not just weights) according to 'val_rmse'.
model_checkpoint = ModelCheckpoint(model_file, monitor='val_rmse', verbose=1, mode='min',
                                   save_best_only=True, save_weights_only=False, period=1)

# NOTE(review): y[trn_inx].shape[0] is presumably an integer sample count, in
# which case math.ceil is a no-op and step_size is twice the number of
# training samples rather than a number of iterations -- verify.
clr = CyclicLR(base_lr=0.001, max_lr=0.01, step_size=2*math.ceil(y[trn_inx].shape[0]), mode='triangular2')
# NOTE(review): early_stop is created here but is not in the callbacks list
# passed to fit_generator below.
early_stop = EarlyStopping(monitor='val_rmse', min_delta=0, patience=6, verbose=1, mode='min')
# NOTE(review): the line creating mse_eval is commented out, yet mse_eval is
# used in the callbacks list and in the `del` statement below.
# mse_eval = MSEEvaluationSeq(val_seq, y[val_inx], 'val')   


# Training
opt=optimizers.Nadam()
# NOTE(review): only a loss is passed (no metrics), while the callbacks above
# monitor 'val_rmse' -- presumably that value is logged by mse_eval; confirm.
model.compile(optimizer=opt, loss='binary_crossentropy') # 

model.fit_generator(
    generator=trn_seq, 
    steps_per_epoch=len(trn_seq), #?
    initial_epoch=0,
    epochs=epochs, shuffle=False, verbose=2,
    callbacks=[mse_eval, model_checkpoint, clr],
    use_multiprocessing=False, workers=1, max_queue_size=4*cpu_cores)

# Predicting
print("\nPredicting fold {}".format(ifold))
# Drop the in-memory training objects before loading the saved model back.
del model, trn_seq, mse_eval, clr, early_stop, model_checkpoint
# The custom layers must be passed by name so the saved model can be rebuilt.
model = keras.models.load_model(
    model_file, 
    compile=True, 
    custom_objects={
        'Attention':Attention, 
        'AttentionWeightedAverage':AttentionWeightedAverage}
)
# Validation predictions are flattened and clipped to [0, 1].
pred[val_inx] = np.clip(model.predict_generator(val_seq, steps=len(val_seq), 
                                                use_multiprocessing=False, workers=1, 
                                                max_queue_size=4*cpu_cores).ravel(), 0.0, 1.0)
del val_seq
gc.collect()

# Fold metrics computed on the clipped validation predictions.
mse = metrics.mean_squared_error(y[val_inx], pred[val_inx])
rmse = np.sqrt(mse)
fold_mse.append(mse)
fold_rmse.append(rmse)
print("fold: {}, mse: {}, rmse: {}".format(ifold, mse, rmse))
print()

# Add this fold's clipped test predictions, divided by n_folds, to test_pred.
# NOTE(review): the input list here does not match the two inputs of
# create_model() in this file -- presumably it targets a different model.
test_pred += np.clip(model.predict(split_inputs(test_cat_features)+\
                                   [test_num_features, 
                                    test_te,
                                    test_user_features, 
                                    test_image_features, 
                                    test_title_sequences[0],
                                    test_descs_sequences[0], 
                                    test_fasttext_features,
                                    test_char_title,
                                    test_char_descs
                                   ], 
                                   batch_size=INF_BATCH_SIZE, verbose=0).ravel(), 0.0, 1.0)/n_folds
    ifold += 1
    gc.collect()