In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import time
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from gensim.models import FastText, Word2Vec
import re
from keras.layers import *
from keras.models import *
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence
from keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import keras.backend as K
from keras.optimizers import *
from keras.utils import to_categorical

# Root directory of the data files. The trailing slash matters: the cells
# below build file paths by plain string concatenation (dataPath + name).
dataPath = "../dataSet/"

Using TensorFlow backend.


In [2]:
# Load the three raw tables: labelled users, unlabelled users, and each
# user's activated-app string. Column labels are supplied through `names=`
# (presumably the files ship without a header row -- TODO confirm).
train, test, app_package = (
    pd.read_csv(dataPath + file_name, names=column_names)
    for file_name, column_names in (
        ('age_train.csv', ['uid', 'age']),
        ('age_test.csv', ['uid']),
        ('user_app_actived.csv', ['uid', 'appid']),
    )
)

In [3]:
# Number of activated apps per user. `appid` holds the user's app ids joined
# by '#' (the same separator the tokenizer cell splits on).
# The vectorised `.str` accessor replaces `.apply(lambda x: x.spilt('#'))`,
# whose misspelt method name raised AttributeError on the first row; it also
# propagates NaN instead of failing on a missing `appid`.
app_package['app_len'] = app_package['appid'].str.split('#').str.len()

# Attach the app string and its length to both user tables. how='left' keeps
# every user; a user with no row in `app_package` gets NaN in the new columns.
# NOTE: this rebinds `train`/`test`, so re-running the cell without reloading
# the CSVs merges a second time and yields suffixed duplicate columns.
train = pd.merge(train, app_package, on='uid', how='left')
test = pd.merge(test, app_package, on='uid', how='left')

In [None]:
# Word2Vec embedding of app ids: train (or load) the model, then build
# (or load) a table with one embedding row per app in the vocabulary.
do_embedding = False    # True: rebuild embedding_fast.csv from the model
do_fast_model = False   # True: retrain Word2Vec instead of loading it from disk
embedding_size = 128

if do_fast_model:
    # One "sentence" per user: the list of that user's app ids. The original
    # read app_package['app_list'], a column that no cell creates, so the
    # lists are derived here from the '#'-separated `appid` strings.
    app_sentences = app_package['appid'].dropna().str.split('#').tolist()
    fast_model = Word2Vec(app_sentences, size=embedding_size, window=4, min_count=3, negative=2,
                          sg=1, sample=0.002, hs=1, workers=8)
    fast_model.save(dataPath + 'nn/fastmodel.model')
else:
    fast_model = Word2Vec.load(dataPath + 'nn/fastmodel.model')

if do_embedding:
    vocab_words = list(fast_model.wv.vocab)
    # Look vectors up on the KeyedVectors (`.wv`), consistent with the
    # `.wv.vocab` access above; indexing the model object itself is deprecated.
    embedding_fast = pd.DataFrame([fast_model.wv[word] for word in vocab_words])
    embedding_fast['app'] = vocab_words
    embedding_fast.columns = ['fast_dim_%s' % str(i) for i in range(embedding_size)] + ['app']
    embedding_fast.to_csv(dataPath + 'embedding_fast.csv')
else:
    # to_csv above also writes the row index as the first CSV column; reading
    # it back as the index keeps both branches producing the same columns
    # instead of a stray "Unnamed: 0" feature column.
    embedding_fast = pd.read_csv(dataPath + 'embedding_fast.csv', index_col=0)

In [None]:
# Convert each user's '#'-separated app string into a fixed-length sequence
# of integer app indices, and build the matching pretrained embedding matrix.
tokenizer = Tokenizer(lower=False, char_level=False, split='#')
tokenizer.fit_on_texts(list(app_package['appid']))

# Users without a match in the left merge carry NaN instead of a string,
# which the tokenizer cannot process; map them to '' so they become an
# empty sequence (all padding after pad_sequences).
X_seq = tokenizer.texts_to_sequences(train['appid'].fillna(''))
X_test_seq = tokenizer.texts_to_sequences(test['appid'].fillna(''))

# Sequence length fed to the network. Defined once here so the padding and
# the model cell (which reads `maxlen`) cannot drift apart.
maxlen = 50
X = pad_sequences(X_seq, maxlen=maxlen, value=0)
X_test = pad_sequences(X_test_seq, maxlen=maxlen, value=0)

# Tokenizer indices start at 1 and index 0 is the padding value, so the
# matrix needs vocabulary-size + 1 rows. A fixed 10000 raised IndexError
# below (and broke the Embedding lookup) once the vocabulary grew larger.
max_features = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((max_features, embedding_size))
for word, index in tokenizer.word_index.items():
    # Apps missing from the Word2Vec vocabulary keep an all-zero row.
    if word in fast_model.wv.vocab:
        embedding_matrix[index] = fast_model.wv[word]

In [None]:
# Custom optimizer used when compiling the neural network defined below
class AdamW(Optimizer):
    """Adam optimizer with a decoupled weight-decay term.

    Every step applies the bias-corrected Adam update and, separately from
    the gradient, subtracts ``lr * weight_decay * p`` from each parameter
    ``p``.

    Arguments:
        lr: learning rate.
        beta_1: decay rate of the first-moment (mean) estimate.
        beta_2: decay rate of the second-moment (squared-gradient) estimate.
        weight_decay: coefficient of the decoupled decay term.
        epsilon: constant added to the denominator of the Adam step.
        decay: per-iteration learning-rate decay; only applied when > 0.
        **kwargs: forwarded to the base ``Optimizer`` constructor.
    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999, weight_decay=1e-4,
                 epsilon=1e-8, decay=0., **kwargs):
        super(AdamW, self).__init__(**kwargs)
        # Hyper-parameters are backend variables created under a name scope
        # named after the class.
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
            self.wd = K.variable(weight_decay, name='weight_decay')
        self.epsilon = epsilon
        self.initial_decay = decay

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        """Return the list of update ops for one optimisation step."""
        gradients = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        # Optional time-based learning-rate decay.
        step_size = self.lr
        if self.initial_decay > 0:
            decay_factor = 1. / (1. + self.decay * K.cast(self.iterations,
                                                          K.dtype(self.decay)))
            step_size *= decay_factor

        # Bias correction of the moment estimates, folded into the step size.
        t = K.cast(self.iterations, K.floatx()) + 1
        bias_correction = (K.sqrt(1. - K.pow(self.beta_2, t)) /
                           (1. - K.pow(self.beta_1, t)))
        step_size_t = step_size * bias_correction

        # One zero-initialised slot per parameter for each moment estimate.
        first_moments = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        second_moments = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + first_moments + second_moments

        for param, grad, m, v in zip(params, gradients, first_moments, second_moments):
            m_new = (self.beta_1 * m) + (1. - self.beta_1) * grad
            v_new = (self.beta_2 * v) + (1. - self.beta_2) * K.square(grad)

            adam_step = step_size_t * m_new / (K.sqrt(v_new) + self.epsilon)
            # Decoupled weight decay: scaled by the (decayed) learning rate,
            # not by the bias-corrected Adam step size.
            decay_step = step_size * self.wd * param
            updated = param - adam_step - decay_step

            self.updates.append(K.update(m, m_new))
            self.updates.append(K.update(v, v_new))

            # Project onto the parameter's constraint, if it declares one.
            if getattr(param, 'constraint', None) is not None:
                updated = param.constraint(updated)

            self.updates.append(K.update(param, updated))
        return self.updates

    def get_config(self):
        """Return the optimizer configuration as a plain dict."""
        merged = dict(super(AdamW, self).get_config())
        merged.update({
            'lr': float(K.get_value(self.lr)),
            'beta_1': float(K.get_value(self.beta_1)),
            'beta_2': float(K.get_value(self.beta_2)),
            'decay': float(K.get_value(self.decay)),
            'weight_decay': float(K.get_value(self.wd)),
            'epsilon': self.epsilon,
        })
        return merged

In [None]:
def model_age_conv(embedding_matrix, maxlen=50, num_classes=11):
    """Build and compile the age classifier over padded app-index sequences.

    The frozen pretrained embedding feeds three parallel branches whose
    outputs are concatenated and passed to a two-hidden-layer MLP:

    * a bidirectional GRU returning its final state,
    * a per-position Dense(128) + Dropout, summed over the time axis,
    * a bidirectional GRU returning the full sequence, followed by a
      Conv1D (kernel size 2) with global average and global max pooling.

    Changes from the previous version:

    * ``maxlen`` is a parameter; it used to be read from a global that no
      cell defined. The default of 50 matches the ``pad_sequences`` call.
    * The softmax width is the ``num_classes`` parameter (default 11, the
      value that was hard-coded).
    * Six Conv1D branches applied directly to the embedding were built but
      never connected to the output, so they did not affect the model; they
      also contained a copy-paste slip (``conv4a = conv3(emb)``). They have
      been removed.

    Arguments:
        embedding_matrix: 2-D array of shape (vocabulary rows, embedding dim)
            used as the fixed weights of the embedding layer.
        maxlen: length of the padded input sequences.
        num_classes: number of units in the softmax output layer.

    Returns:
        A compiled Keras ``Model`` (categorical cross-entropy, AdamW).
    """
    # Drop the graph of any previously built model (this is called per fold).
    K.clear_session()

    # Frozen embedding layer initialised with the pretrained app vectors.
    emb_layer = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=False)

    seq = Input(shape=(maxlen,))
    emb = emb_layer(seq)

    # Branch 1: bidirectional GRU, final state only.
    gru_state = Bidirectional(
        GRU(128, recurrent_dropout=0.15, dropout=0.15))(emb)

    # Branch 2: position-wise projection, summed over the time axis.
    projected = TimeDistributed(Dense(128, activation='relu'))(emb)
    projected = TimeDistributed(Dropout(0.15))(projected)
    projected_sum = Lambda(lambda x: K.sum(x, axis=1),
                           output_shape=(128,))(projected)

    # Branch 3: bidirectional GRU sequence -> Conv1D -> average + max pooling.
    gru_sequence = Bidirectional(
        GRU(128, recurrent_dropout=0.15, dropout=0.15, return_sequences=True))(emb)
    conv_over_gru = Conv1D(filters=128, kernel_size=2,
                           padding='same', activation='relu')(gru_sequence)
    conv_avg = GlobalAveragePooling1D()(conv_over_gru)
    conv_max = GlobalMaxPool1D()(conv_over_gru)

    merged = concatenate([conv_avg, conv_max, gru_state, projected_sum])

    # MLP head that produces the class probabilities.
    x = Dropout(0.38)(merged)
    x = BatchNormalization()(x)
    x = Dense(200, activation='relu')(x)
    x = Dropout(0.22)(x)
    x = BatchNormalization()(x)
    x = Dense(200, activation='relu')(x)
    x = Dropout(0.22)(x)
    x = BatchNormalization()(x)
    pred = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=seq, outputs=pred)
    model.compile(loss='categorical_crossentropy',
                  optimizer=AdamW(weight_decay=0.1))

    return model

In [None]:
# Stratified 5-fold training: each fold trains a fresh model, restores its
# best checkpoint, fills that fold's out-of-fold predictions and adds its
# share of the averaged test prediction.
Y_age = to_categorical(train['age'])

# Width of every prediction buffer. The buffers were hard-coded to 7 columns
# while the network's softmax layer has 11 units, so the out-of-fold
# assignment and `sub += ...` below could not broadcast.
n_classes = Y_age.shape[1]

# Fixed seed so the shuffled fold assignment is the same on every run.
KFOLD_SEED = 42
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=KFOLD_SEED)
sub = np.zeros((X_test.shape[0], n_classes))
oof_pref = np.zeros((X.shape[0], n_classes))
score = []

for fold, (train_index, valid_index) in enumerate(kfold.split(X, train['age'])):

    print("FOLD | ", fold + 1)

    # One checkpoint file per fold; only the epoch with the lowest
    # validation loss is kept.
    filepath = "age_weights_best_%d.h5" % fold
    checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=2, save_best_only=True, mode='min')
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss', factor=0.8, patience=2, min_lr=0.0001, verbose=1)
    earlystopping = EarlyStopping(
        monitor='val_loss', min_delta=0.0001, patience=8, verbose=1, mode='auto')
    callbacks = [checkpoint, reduce_lr, earlystopping]

    X_tr, X_vl = X[train_index], X[valid_index]
    y_tr, y_vl = Y_age[train_index], Y_age[valid_index]

    model_age = model_age_conv(embedding_matrix)
    # Fail early with a readable message instead of a shape error deep
    # inside fit() when the label encoding and the network disagree.
    if model_age.output_shape[-1] != n_classes:
        raise ValueError(
            "model outputs %d classes but the one-hot labels have %d columns"
            % (model_age.output_shape[-1], n_classes))

    hist = model_age.fit(X_tr, y_tr, batch_size=512, epochs=20, validation_data=(X_vl, y_vl),
                         callbacks=callbacks, verbose=2, shuffle=True)

    # Predict with the best checkpoint, not the weights of the last epoch.
    model_age.load_weights(filepath)
    oof_pref[valid_index] = model_age.predict(X_vl)
    sub += model_age.predict(X_test) / kfold.n_splits
    score.append(np.min(hist.history['val_loss']))

print('log loss:', np.mean(score))