In [None]:
import csv
import variables
from sklearn.metrics import roc_auc_score
import numpy as np
np.set_printoptions(precision=2, linewidth=130)

import os
# Configure the Theano backend through its environment variable. This is set
# before the `keras` imports further down in this cell.
# NOTE(review): presumably the flags are only read when the backend is first
# imported, so this assignment has to stay above those imports - confirm.
os.environ["THEANO_FLAGS"] = ("device=cpu,floatX=float32,gpuarray.preallocate=1")

from keras.callbacks import Callback, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau
from keras.optimizers import Adam

from models import cbow, lstm, bilstm, attention_lstm
from models import save_model_without_embedding

In [None]:
def read_embeddings(embeddingfn):
    """Return the array stored in the ``.npy`` file at `embeddingfn`.

    Thin wrapper around `np.load`; the path is passed through unchanged.
    """
    weights = np.load(embeddingfn)
    return weights

# Load the cached embedding matrix once; `embedding_weights` is handed to every
# model builder (cbow / lstm / bilstm / attention_lstm) later in the notebook.
embeddingfn = 'cache/embedding_weights.npy'
embedding_weights = read_embeddings(embeddingfn)

In [None]:
# Container for the three splits; each split maps a field name to an array.
data = {'train':{}, 'val':{}, 'test':{}}

# Loading data from the training set
X_text_vec = np.load('cache/train/text_vec.npy')
X_text_ids = np.load('cache/train/text_ids.npy')
X_titl_vec = np.load('cache/train/titl_vec.npy')
X_titl_ids = np.load('cache/train/titl_ids.npy')
X_ratg = np.load('cache/train/ratg.npy')
y = np.load('cache/train/labels.npy')

# Creating train and val splits
val_split = variables.VALIDATION_SPLIT
size = X_text_vec.shape[0]

# Fixed seed so that the train/val partition is identical on every run.
seed = 821  # np.random.randint(1234)
print('Seed {}'.format(seed))

nb_training_samples = int((1.-val_split)*size)
np.random.seed(seed)
# Sample training rows without replacement; every remaining row is validation.
train_set = np.random.choice(np.arange(size),size=nb_training_samples, replace=False)
val_set = np.delete(np.arange(size),train_set)

# Adding splits to the dictionary: the same row selection is applied to every
# array loaded from the training cache, so the fields stay aligned row by row.
train_arrays = {
    'text_vec': X_text_vec,
    'text_ids': X_text_ids,
    'titl_vec': X_titl_vec,
    'titl_ids': X_titl_ids,
    'ratg': X_ratg,
    'y': y,
}
for split_name, split_rows in (('train', train_set), ('val', val_set)):
    for field, values in train_arrays.items():
        data[split_name][field] = values[split_rows,:]

# The test set is used whole and has no labels.
for field in ('text_vec', 'text_ids', 'titl_vec', 'titl_ids', 'ratg'):
    data['test'][field] = np.load('cache/test/{}.npy'.format(field))

print('Train: {}'.format(data['train']['text_vec'].shape))
print('Val: {}'.format(data['val']['text_vec'].shape))
# Was mislabelled 'Val:' although it reports the test split.
print('Test: {}'.format(data['test']['text_vec'].shape))

In [None]:
class AUC(Callback):
    """Keras callback that prints the validation ROC AUC after every epoch.

    Parameters
    ----------
    data_val : sequence
        Two-element sequence. ``data_val[0]`` is passed to ``model.predict``
        and ``data_val[1]`` is passed to ``roc_auc_score`` as the true labels.
    """

    def __init__(self, data_val):
        # The first argument of super() must be *this* class. The previous
        # super(Callback, self) started the lookup after Callback in the MRO,
        # so Callback.__init__ itself was never executed.
        super(AUC, self).__init__()
        self.data = data_val[0]
        self.labl = data_val[1]

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored validation inputs and print their ROC AUC."""
        y_pred = self.model.predict(self.data, verbose=0)
        score = roc_auc_score(self.labl, y_pred)
        print(' - val_auc - {:.4f}'.format(score))

In [None]:
def train(model, data_train, data_val, nb_epochs=100, batch_size=128, is_earlyStopping=True):
    """Compile `model` and fit it, reporting validation AUC after every epoch.

    Parameters
    ----------
    model : Keras model
        Compiled in place with binary cross-entropy and an Adam optimizer.
    data_train : sequence
        ``data_train[0]`` are the inputs and ``data_train[1]`` the targets
        passed to ``model.fit``.
    data_val : sequence
        ``data_val[0]`` / ``data_val[1]`` are used as ``validation_data`` and
        by the `AUC` callback.
    nb_epochs : int, optional
        Maximum number of epochs (default 100, the former hard-coded value).
    batch_size : int, optional
        Mini-batch size (default 128, the former hard-coded value).
    is_earlyStopping : bool, optional
        When true (default), stop once ``val_loss`` has not improved for
        4 consecutive epochs.

    Returns
    -------
    The same `model` object, after training. If the user interrupts the fit
    with Ctrl-C, the model is returned in its current state.
    """
    # Compile model
    optimizer = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0, clipnorm=1.)
    model.compile(loss='binary_crossentropy', optimizer=optimizer)

    # Callbacks
    # NOTE(review): newer Keras releases appear to rename ReduceLROnPlateau's
    # `epsilon` argument to `min_delta` - confirm against the installed version.
    callbacks = [
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, verbose=1, mode='auto', epsilon=0.0015),
        AUC(data_val),
    ]
    if is_earlyStopping:
        callbacks.append(EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4))

    # Train model
    try:
        model.fit(data_train[0], data_train[1],
                  batch_size=batch_size,
                  epochs=nb_epochs,
                  validation_data=(data_val[0], data_val[1]),
                  callbacks=callbacks,
                  verbose=True)
    except KeyboardInterrupt:
        # Deliberate: a manual stop keeps the partially trained model.
        print('\nTraining interrupted by user.')

    return model

In [None]:
# Choose the architecture to train: exactly one of the builder calls below
# should be active. Each builder is called with the split dictionary and the
# embedding matrix, and its result is unpacked into the model plus the
# train / val / test inputs used by the following cells.
#model, data_train, data_val, data_test = cbow(data, embedding_weights, verbose=True)
#model, data_train, data_val, data_test = lstm(data, embedding_weights, verbose=True)
#model, data_train, data_val, data_test = bilstm(data, embedding_weights, verbose=True)
model, data_train, data_val, data_test = attention_lstm(data, embedding_weights, verbose=True)

In [None]:
# Fit the selected model; `train` returns the same model object it was given.
model = train(model, data_train, data_val)

In [None]:
# Final check of the trained model: predict on the validation inputs and
# compare against the validation labels with ROC AUC.
pred = model.predict(data_val[0], batch_size=128, verbose=True)
val_auc = roc_auc_score(data_val[1], pred)
print("\nValidation AUC: {}".format(val_auc))

# Output prediction file

In [None]:
# Predict on the test inputs returned by the model builder; the result is
# written to the submission file in the next cell.
test_pred = model.predict(data_test, batch_size=128, verbose=True)

In [None]:
# Write the single-model test predictions as a ';'-separated file with one
# row per test sample.
# The file is opened in binary mode because this notebook is Python 2 code,
# where the csv module expects it.
with open('output/output.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=';')
    csvwriter.writerow(['ID', 'Target'])
    # Iterate over rows: `.shape[0]` is the number of samples, whereas `.size`
    # counts every element and is only equal to it for a single-column output.
    # NOTE(review): 80000 is presumably the ID of the first test sample - confirm.
    for i in range(test_pred.shape[0]):
        if np.amax(data['test']['text_vec'][i,:]) == 0:
            # If there are no embeddings for the review, it probably
            # means that the original message is not informative
            csvwriter.writerow([i+80000, 0.])
        else:
            csvwriter.writerow([i+80000, test_pred[i,0]])

# Ensemble of already trained models 

In [None]:
# Architectures to ensemble and their saved weight files. The two lists are
# parallel: weights[i] is the file loaded into the model built by archs[i].
archs = [cbow, lstm, bilstm]
weights = ['models/cbow_512.h5', 'models/lstm_CTR_64.h5', 'models/bilstm_CTR_32.h5']

In [None]:
# Validation-set predictions of every pre-trained architecture, in the same
# order as `archs`.
# Note: the loop rebinds the notebook-level names `model` and `data_val`, so
# after this cell they refer to the last architecture in `archs`.
preds = []
for idx,arch in enumerate(archs):
    model, _, data_val, _ = arch(data, embedding_weights, verbose=False)
    # Restore the trained parameters for this architecture from disk.
    model.load_weights(weights[idx], by_name=True)
    preds.append(model.predict(data_val[0], batch_size=128, verbose=True))

In [None]:
def find_best_ensemble(preds, y_true=None, samples=1000, seed=None):
    """Random-search the blending weights that maximise ROC AUC.

    Draws `samples` random weight vectors (non-negative, each normalised to
    sum to 1), scores the corresponding weighted sum of `preds` with
    ``roc_auc_score`` and returns the best vector.

    Parameters
    ----------
    preds : list of arrays
        One prediction array per model, all with the same shape.
    y_true : array, optional
        Ground-truth labels. Defaults to the notebook-level
        ``data['val']['y']``, which the function previously always used.
    samples : int, optional
        Number of random weight vectors to evaluate (default 1000).
    seed : int, optional
        When given, the weights are drawn from a private
        ``np.random.RandomState(seed)`` so the search is reproducible.
        By default the global numpy random state is used, as before.

    Returns
    -------
    1-D array of ``len(preds)`` weights: the row of the sampled grid with the
    highest AUC.
    """
    if y_true is None:
        y_true = data['val']['y']
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One candidate weight vector per row, normalised so each row sums to 1.
    grid = rng.uniform(size=(samples,len(preds)))
    grid = grid / np.sum(grid, axis=1)[:,np.newaxis]

    results = np.zeros((samples,))

    for i in range(samples):
        ensemble = np.zeros(preds[0].shape)
        for weight, pred in zip(grid[i], preds):
            ensemble += weight * pred
        results[i] = roc_auc_score(y_true, ensemble)

    # Locate the winner once instead of re-running argmax for every use.
    best = np.argmax(results)
    print('Best ensemble AUC: {:.6f} with weights {}'.format(results[best], grid[best,:]))
    return grid[best,:]

# Search the blending weights on the validation predictions; they are reused
# below to combine the test-set predictions.
ensemble_weights = find_best_ensemble(preds)

In [None]:
# Test-set predictions of every pre-trained architecture, in `archs` order.
test_preds = []
for idx,arch in enumerate(archs):
    model, _, _, data_test = arch(data, embedding_weights, verbose=False)
    model.load_weights(weights[idx], by_name=True)
    test_preds.append(model.predict(data_test, batch_size=128, verbose=True))

# Blend with the weights found on the validation set. The loop is bounded by
# `test_preds` itself; it used to be bounded by `len(preds)`, the validation
# list built in another cell.
test_ensemble = np.zeros(test_preds[0].shape)
for k in range(len(test_preds)):
    test_ensemble += ensemble_weights[k] * test_preds[k]

# Write the blended predictions as a ';'-separated file with one row per test
# sample. Binary mode is used because this notebook is Python 2 code, where
# the csv module expects it.
with open('output/ensemble_output.csv', 'wb') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=';')
    csvwriter.writerow(['ID', 'Target'])
    # `.shape[0]` is the number of samples; `.size` would count every element.
    # NOTE(review): 80000 is presumably the ID of the first test sample - confirm.
    for i in range(test_ensemble.shape[0]):
        if np.amax(data['test']['text_vec'][i,:]) == 0:
            # If there are no embeddings for the review, it probably
            # means that the original message is not informative
            csvwriter.writerow([i+80000, 0.])
        else:
            csvwriter.writerow([i+80000, test_ensemble[i,0]])