In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

import cPickle as pkl
from sklearn.metrics import roc_auc_score
import csv
import numpy as np
np.set_printoptions(precision=2, linewidth=130)
import os

os.environ["THEANO_FLAGS"] = ("device=cpu,floatX=float32")

from keras.callbacks import Callback, EarlyStopping, LearningRateScheduler
from keras.optimizers import SGD, Adam
from keras.models import Model, Sequential
from keras.layers import Input, merge
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.recurrent import LSTM
from keras.layers.merge import concatenate
from keras.regularizers import l2
import keras.backend as K

import variables

from custom_layers import Attention

Using Theano backend.


In [2]:
def read_embeddings(embeddingfn, dictfn):
    """Load the embedding matrix and the ids dictionary from disk.

    Parameters
    ----------
    embeddingfn : str
        Path of the file holding the embedding weights (read with ``np.load``).
    dictfn : str
        Path of the file holding the ids dictionary (read with ``np.load``).

    Returns
    -------
    tuple
        ``(embedding_weights, ids_dict)``, in that order.
    """
    embedding_weights = np.load(embeddingfn)
    # The notebook passes a ``.pkl`` path here, i.e. pickled data rather than a
    # plain numeric array. NumPy >= 1.16.3 defaults to allow_pickle=False and
    # refuses to load such files, so opt in explicitly.
    # NOTE(security): unpickling can execute arbitrary code -- only load cache
    # files that were produced by a trusted source.
    ids_dict = np.load(dictfn, allow_pickle=True)
    return embedding_weights, ids_dict

# Paths of the cached embedding matrix and ids dictionary, then load both.
embeddingfn, dictfn = 'cache/embedding_weights.npy', 'cache/ids_dictionary.pkl'
embedding_weights, ids_dict = read_embeddings(embeddingfn=embeddingfn,
                                              dictfn=dictfn)

In [3]:
def read_data(xtrainfn, ytrainfn, ztrainfn, qtrainfn, val_split=None, seed=821):
    """Load the four training arrays and split them into train/validation sets.

    All four arrays are split with the same random row permutation, so row
    ``i`` of every returned training (resp. validation) array refers to the
    same sample.

    Parameters
    ----------
    xtrainfn, ytrainfn, ztrainfn, qtrainfn : str
        Paths of the ``.npy`` files holding, respectively, the data, the
        labels, the titles and the ratings. Every array must be at least 2-D
        (rows are selected with ``arr[idx, :]``) and have the same row count.
    val_split : float, optional
        Fraction of rows held out for validation. Defaults to
        ``variables.VALIDATION_SPLIT``.
    seed : int, optional
        Seed passed to ``np.random.seed`` before drawing the split. Note that
        this reseeds NumPy's global random state.

    Returns
    -------
    tuple
        ``(X_train, y_train, title_train, rating_train,
        X_val, y_val, title_val, rating_val)``.

    Raises
    ------
    ValueError
        If the arrays do not all have the same number of rows.
    """
    if val_split is None:
        val_split = variables.VALIDATION_SPLIT

    X_all = np.load(xtrainfn)
    y_all = np.load(ytrainfn)
    # Use the function's own arguments: the previous body read `ttrainfn` and
    # `rtrainfn`, which only resolved through notebook globals defined in a
    # later cell.
    title_all = np.load(ztrainfn)
    rating_all = np.load(qtrainfn)

    nb_samples = X_all.shape[0]
    for label, arr in (('labels', y_all), ('title', title_all), ('rating', rating_all)):
        if arr.shape[0] != nb_samples:
            raise ValueError('{} has {} rows, expected {}'.format(
                label, arr.shape[0], nb_samples))

    print('Seed {}'.format(seed))

    nb_training_samples = int((1. - val_split) * nb_samples)
    np.random.seed(seed)
    all_rows = np.arange(nb_samples)
    train_set = np.random.choice(all_rows, size=nb_training_samples, replace=False)
    # Validation rows are whatever was not drawn for training.
    val_set = np.delete(all_rows, train_set)

    X_train = X_all[train_set, :]
    y_train = y_all[train_set, :]
    title_train = title_all[train_set, :]
    rating_train = rating_all[train_set, :]

    X_val = X_all[val_set, :]
    y_val = y_all[val_set, :]
    title_val = title_all[val_set, :]
    rating_val = rating_all[val_set, :]

    return X_train, y_train, title_train, rating_train, X_val, y_val, title_val, rating_val

In [4]:
# Locations of the cached training arrays.
xtrainfn, ytrainfn = 'cache/train/data.npy', 'cache/train/labels.npy'
ttrainfn, rtrainfn = 'cache/train/title.npy', 'cache/train/rating.npy'

# Load the training arrays and split them into train / validation parts.
(X_train, y_train, title_train, rating_train,
 X_val, y_val, title_val, rating_val) = read_data(xtrainfn, ytrainfn, ttrainfn, rtrainfn)

train_shapes = (X_train.shape, y_train.shape, title_train.shape, rating_train.shape)
val_shapes = (X_val.shape, y_val.shape, title_val.shape, rating_val.shape)
print("Train set:{},{},{},{}\nVal set:{},{},{},{}".format(*(train_shapes + val_shapes)))

# The test set ships without labels: only data, title and rating are loaded.
X_test = np.load('cache/test/data.npy')
title_test = np.load('cache/test/title.npy')
rating_test = np.load('cache/test/rating.npy')
test_shapes = (X_test.shape, title_test.shape, rating_test.shape)
print("Test set:{},{},{}".format(*test_shapes))

Seed 821
Train set:(64000, 90),(64000, 1),(64000, 90),(64000, 1)
Val set:(16000, 90),(16000, 1),(16000, 90),(16000, 1)
Test set:(36395, 90),(36395, 90),(36395, 1)


In [5]:
def simple_lstm(embedding_weights, verbose=True):
    """Build the LSTM classifier over the `data`, `title` and `rating` inputs.

    One frozen ``Embedding`` layer and one ``LSTM`` layer are applied to both
    the `data` and the `title` sequences (the same layer instances are reused,
    so both branches share weights). The two LSTM outputs are concatenated
    with the `rating` input and passed through dropout, a single-unit dense
    layer and a sigmoid.

    Parameters
    ----------
    embedding_weights : 2-D array
        Embedding matrix; its shape gives the vocabulary size and the
        embedding dimension.
    verbose : bool, optional
        When true, print ``model.summary()``.

    Returns
    -------
    tuple
        ``(model, schedule)`` where ``schedule(epoch)`` returns the learning
        rate to use for that epoch.
    """
    # Hyper-parameters
    hidden_units = 64
    input_drop = 0.3          # passed to LSTM(dropout=...)
    recurrent_drop = 0.3      # passed to LSTM(recurrent_dropout=...)
    head_drop = 0.6           # dropout applied right before the dense layer
    seq_len = variables.MAX_LEN
    vocab_size, emb_dim = embedding_weights.shape

    # Inputs
    data = Input(shape=(seq_len,), name='data')
    title = Input(shape=(seq_len,), name='title')
    rating = Input(shape=(1,), name='rating')

    # Layers shared by the data and title branches
    embed = Embedding(input_dim=vocab_size,
                      output_dim=emb_dim,
                      input_length=seq_len,
                      weights=[embedding_weights],
                      mask_zero=True,
                      trainable=False,
                      name='embedding')
    encoder = LSTM(hidden_units,
                   dropout=input_drop,
                   recurrent_dropout=recurrent_drop,
                   return_sequences=False,
                   name='lstm')

    # Embed both sequences, then encode both with the shared LSTM
    data_vectors = embed(data)
    title_vectors = embed(title)
    data_code = encoder(data_vectors)
    title_code = encoder(title_vectors)

    # Classification head
    features = concatenate([data_code, title_code, rating])
    features = Dropout(head_drop)(features)
    logit = Dense(1, name='dense')(features)
    prob = Activation('sigmoid')(logit)

    model = Model(inputs=[data, title, rating], outputs=[prob])
    if verbose:
        model.summary()

    def schedule(epoch):
        # Both phases currently use the same value, so the rate is flat.
        return 0.005 if epoch >= 5 else 0.005

    return model, schedule

In [6]:
def simple_bilstm(embedding_weights, verbose=True):
    """Build the bidirectional-LSTM classifier over `data`, `title`, `rating`.

    One frozen ``Embedding`` layer and one ``Bidirectional(LSTM)`` layer are
    applied to both the `data` and the `title` sequences (the same layer
    instances are reused, so both branches share weights). The two encoder
    outputs are concatenated with the `rating` input and passed through
    dropout, a single-unit dense layer and a sigmoid.

    Parameters
    ----------
    embedding_weights : 2-D array
        Embedding matrix; its shape gives the vocabulary size and the
        embedding dimension.
    verbose : bool, optional
        When true, print ``model.summary()``.

    Returns
    -------
    tuple
        ``(model, schedule)`` where ``schedule(epoch)`` returns the learning
        rate to use for that epoch.
    """
    # Hyper-parameters
    hidden_units = 32         # units of the wrapped LSTM
    input_drop = 0.3          # passed to LSTM(dropout=...)
    recurrent_drop = 0.3      # passed to LSTM(recurrent_dropout=...)
    head_drop = 0.6           # dropout applied right before the dense layer
    seq_len = variables.MAX_LEN
    vocab_size, emb_dim = embedding_weights.shape

    # Inputs
    data = Input(shape=(seq_len,), name='data')
    title = Input(shape=(seq_len,), name='title')
    rating = Input(shape=(1,), name='rating')

    # Layers shared by the data and title branches
    embed = Embedding(input_dim=vocab_size,
                      output_dim=emb_dim,
                      input_length=seq_len,
                      weights=[embedding_weights],
                      mask_zero=True,
                      trainable=False,
                      name='embedding')
    encoder = Bidirectional(LSTM(hidden_units,
                                 dropout=input_drop,
                                 recurrent_dropout=recurrent_drop,
                                 return_sequences=False,
                                 name='lstm'))

    # Embed both sequences, then encode both with the shared encoder
    data_vectors = embed(data)
    title_vectors = embed(title)
    data_code = encoder(data_vectors)
    title_code = encoder(title_vectors)

    # Classification head
    features = concatenate([data_code, title_code, rating])
    features = Dropout(head_drop)(features)
    logit = Dense(1, name='dense')(features)
    prob = Activation('sigmoid')(logit)

    model = Model(inputs=[data, title, rating], outputs=[prob])
    if verbose:
        model.summary()

    def schedule(epoch):
        # Both phases currently use the same value, so the rate is flat.
        return 0.005 if epoch >= 5 else 0.005

    return model, schedule

In [7]:
def attention_lstm(lstm_output, embedding_weights, verbose=True):
    """Build the LSTM + attention classifier over `data`, `title`, `rating`.

    One frozen ``Embedding`` layer, one sequence-returning ``LSTM`` layer and
    one ``Attention`` layer (from ``custom_layers``) are applied to both the
    `data` and the `title` sequences; the same layer instances are reused, so
    both branches share weights. The two attention outputs are concatenated
    with the `rating` input and passed through dropout, a single-unit dense
    layer and a sigmoid.

    Parameters
    ----------
    lstm_output : int
        Number of LSTM units.
    embedding_weights : 2-D array
        Embedding matrix; its shape gives the vocabulary size and the
        embedding dimension.
    verbose : bool, optional
        When true, print ``model.summary()``.

    Returns
    -------
    tuple
        ``(model, schedule)`` where ``schedule(epoch)`` returns the learning
        rate to use for that epoch.
    """
    # Hyper-parameters
    input_drop = 0.3          # passed to LSTM(dropout=...)
    recurrent_drop = 0.3      # passed to LSTM(recurrent_dropout=...)
    head_drop = 0.6           # dropout applied right before the dense layer
    seq_len = variables.MAX_LEN
    vocab_size, emb_dim = embedding_weights.shape

    # Layers shared by the data and title branches
    embed = Embedding(input_dim=vocab_size,
                      output_dim=emb_dim,
                      input_length=seq_len,
                      weights=[embedding_weights],
                      mask_zero=True,
                      trainable=False,
                      name='embedding')
    encoder = LSTM(lstm_output,
                   dropout=input_drop,
                   recurrent_dropout=recurrent_drop,
                   return_sequences=True,   # attention consumes the full sequence
                   name='lstm')
    # presumably pools the per-timestep LSTM outputs into one vector per
    # sample -- see custom_layers.Attention to confirm
    attend = Attention()

    # Inputs
    data = Input(shape=(seq_len,), name='data')
    title = Input(shape=(seq_len,), name='title')
    rating = Input(shape=(1,), name='rating')

    # Embed both sequences
    data_vectors = embed(data)
    title_vectors = embed(title)

    # Encode and attend: data branch first, then title branch
    data_code = attend(encoder(data_vectors))
    title_code = attend(encoder(title_vectors))

    # Classification head
    features = concatenate([data_code, title_code, rating])
    features = Dropout(head_drop)(features)
    logit = Dense(1, name='dense')(features)
    prob = Activation('sigmoid')(logit)

    model = Model(inputs=[data, title, rating], outputs=[prob])
    if verbose:
        model.summary()

    def schedule(epoch):
        # Both phases currently use the same value, so the rate is flat.
        return 0.005 if epoch >= 5 else 0.005

    return model, schedule

In [8]:
class AUC(Callback):
    """Keras callback that prints the validation ROC AUC after every epoch.

    Parameters
    ----------
    validation_data : sequence, optional
        ``(X_val, title_val, rating_val, y_val)``. When left empty (the
        default) the notebook-level globals of the same names are used, which
        is what this callback always did previously.
    interval : int, optional
        Stored as ``self.interval`` but not applied: the score is computed
        after every epoch. The argument is kept so existing call sites stay
        valid.
    """

    def __init__(self, validation_data=(), interval=10):
        # super(Callback, self) would skip Callback.__init__ entirely; name
        # this class so the base-class initialiser actually runs.
        super(AUC, self).__init__()
        self.interval = interval
        if validation_data is not None and len(validation_data) > 0:
            (self.X_val, self.title_val,
             self.rating_val, self.y_val) = validation_data
        else:
            # Backward-compatible fallback on the notebook globals.
            self.X_val, self.title_val, self.rating_val, self.y_val = (
                X_val, title_val, rating_val, y_val)

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored validation inputs and print the ROC AUC."""
        y_pred = self.model.predict([self.X_val, self.title_val, self.rating_val], verbose=0)
        score = roc_auc_score(self.y_val, y_pred)
        print(' - val_auc - {:.4f}'.format(score))

In [9]:
def train(X_train,y_train,z_train,q_train,X_val,y_val,z_val,q_val,embedding_weights):
    """Build, compile and fit the model, returning the fitted model.

    Parameters
    ----------
    X_train, y_train, z_train, q_train : arrays
        Training data, labels, titles and ratings (the notebook passes
        ``title_train`` as ``z_train`` and ``rating_train`` as ``q_train``).
    X_val, y_val, z_val, q_val : arrays
        The same four arrays for the validation set. If ``X_val`` has no
        rows, training runs without validation data, AUC reporting or early
        stopping.
    embedding_weights : 2-D array
        Embedding matrix forwarded to the model builder.

    Returns
    -------
    keras.models.Model
        The model after ``fit`` (or after a user interrupt).
    """
    nb_epochs = 100
    batch_size = 128
    patience = 3
    is_earlyStopping = True

    with_val = X_val.shape[0] > 0

    # build neural network
    model, schedule = simple_lstm(embedding_weights)
    #model, schedule = simple_bilstm(embedding_weights)
    #model, schedule = attention_lstm(64, embedding_weights)

    # Compile model.
    # Note: LearningRateScheduler assigns schedule(epoch) to the optimizer's
    # learning rate at the start of every epoch, so the `lr` given here only
    # matters if the scheduler is removed from the callbacks.
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    # learning rate scheduler
    callbacks = [LearningRateScheduler(schedule)]

    validation_data = None
    if with_val:
        # Use the arrays handed to this function; the previous body read the
        # notebook globals title_*/rating_* and ignored z_*/q_*.
        validation_data = ([X_val, z_val, q_val], y_val)
        # AUC callback
        callbacks.append(AUC(validation_data=(X_val, z_val, q_val, y_val)))
        if is_earlyStopping:
            # stop training if the validation loss stops decreasing
            callbacks.append(EarlyStopping(monitor='val_loss', mode='min',
                                           verbose=1, patience=patience))

    # Train model; Ctrl-C stops training but still returns the model so far.
    try:
        model.fit([X_train, z_train, q_train], y_train,
                  batch_size=batch_size,
                  epochs=nb_epochs,
                  validation_data=validation_data,
                  callbacks=callbacks,
                  verbose=True)
    except KeyboardInterrupt:
        print('\nTraining interrupted by user.')

    return model

In [10]:
# Fit the model chosen inside train() on the train/validation split; the
# fitted model is kept in `model` for the prediction cells below.
model = train(X_train,y_train,title_train,rating_train,X_val,y_val,title_val,rating_val,embedding_weights)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
data (InputLayer)                (None, 90)            0                                            
____________________________________________________________________________________________________
title (InputLayer)               (None, 90)            0                                            
____________________________________________________________________________________________________
embedding (Embedding)            (None, 90, 200)       31112800                                     
____________________________________________________________________________________________________
lstm (LSTM)                      (None, 64)            67840                                        
___________________________________________________________________________________________

In [11]:
# Optional: instead of the model trained above, rebuild an architecture and
# load previously saved weights. Uncomment ONE builder line plus a
# load_weights line whose file matches that architecture.
# (attention_lstm takes the LSTM output size as its first argument.)

#model,_ = simple_lstm(embedding_weights)
#model,_ = attention_lstm(64, embedding_weights)
#model.load_weights('models/simple_lstm64.h5')

# Validation-set predictions of the current `model`.
pred = model.predict([X_val,title_val,rating_val], batch_size=128, verbose=True)



In [12]:
# ROC AUC of the current model's predictions on the validation set.
val_auc = roc_auc_score(y_true=y_val, y_score=pred)
print("\nValidation AUC: {}\n".format(val_auc))


Validation AUC: 0.705733232823



# Ensemble of already trained models

In [14]:
# Validation-set predictions of every saved ensemble member.
# Each entry: (builder, positional builder arguments, saved weights file).
val_inputs = [X_val, title_val, rating_val]
val_members = [
    (simple_lstm, (embedding_weights,), 'models/simple_lstm64.h5'),
    (simple_bilstm, (embedding_weights,), 'models/simple_bilstm32.h5'),
    (attention_lstm, (32, embedding_weights), 'models/attention_lstm32.h5'),
    (attention_lstm, (64, embedding_weights), 'models/attention_lstm64.h5'),
]

val_preds = []
for build_model, build_args, weights_path in val_members:
    # Rebuild the architecture, restore its weights, then predict.
    model, _ = build_model(*build_args, verbose=False)
    model.load_weights(weights_path)
    val_preds.append(model.predict(val_inputs, batch_size=128, verbose=True))

pred1, pred2, pred3, pred4 = val_preds



In [25]:
# Blend the four members. All weights are 1 and the sum is divided by 4,
# i.e. a plain average of the member predictions.
member_preds = [pred1, pred2, pred3, pred4]
member_weights = [1, 1, 1, 1]
ensemble = sum(weight * member
               for weight, member in zip(member_weights, member_preds)) / 4

# Validation ROC AUC of each member and of the blend.
val_auc_1, val_auc_2, val_auc_3, val_auc_4 = [
    roc_auc_score(y_val, member) for member in member_preds
]
val_auc_ensemble = roc_auc_score(y_val, ensemble)

auc_report = "\nValidation AUC: {:.4f} - {:.4f} - {:.4f} - {:.4f} - {:.4f}\n"
print(auc_report.format(val_auc_1, val_auc_2, val_auc_3, val_auc_4, val_auc_ensemble))


Validation AUC: 0.7057 - 0.7095 - 0.7097 - 0.7123 - 0.7217



In [27]:
# Test-set predictions of every saved ensemble member.
# Each entry: (builder, positional builder arguments, saved weights file).
test_inputs = [X_test, title_test, rating_test]
test_members = [
    (simple_lstm, (embedding_weights,), 'models/simple_lstm64.h5'),
    (simple_bilstm, (embedding_weights,), 'models/simple_bilstm32.h5'),
    (attention_lstm, (32, embedding_weights), 'models/attention_lstm32.h5'),
    (attention_lstm, (64, embedding_weights), 'models/attention_lstm64.h5'),
]

test_preds = []
for build_model, build_args, weights_path in test_members:
    # Rebuild the architecture, restore its weights, then predict.
    model, _ = build_model(*build_args, verbose=False)
    model.load_weights(weights_path)
    test_preds.append(model.predict(test_inputs, batch_size=128, verbose=True))

test_pred1, test_pred2, test_pred3, test_pred4 = test_preds

# Plain average of the four members.
test_ensemble = (test_pred1 + test_pred2 + test_pred3 + test_pred4)/4

# Written IDs start at 80000 -- presumably the number of rows in the training
# file (64000 train + 16000 validation per the shapes printed earlier);
# confirm against the expected submission format.
first_test_id = 80000

# 'wb' is the file mode the Python 2 csv module expects.
with open('cache/ensemble_output.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=';')
    csvwriter.writerow(['ID', 'Target'])
    csvwriter.writerows([first_test_id + row, test_ensemble[row, 0]]
                        for row in range(test_ensemble.size))

