In [None]:
%matplotlib inline

import os
import csv
import codecs
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import datetime
seed = 111
np.random.seed(seed)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Concatenate, LSTM, Lambda, Dropout, Multiply
from keras.layers import Conv1D, MaxPooling1D, Embedding, SpatialDropout1D, GRU
from keras.layers.merge import _Merge
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import backend as K
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
import sys
from keras_tqdm import TQDMNotebookCallback

In [None]:
# Locations of the train/test CSV files and of the pre-trained GloVe vectors.
BASE_DIR = 'input/'
GLOVE_DIR = 'WordEmbeddings/Glove/'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
# Every question is padded/truncated to this many token ids (pad_sequences maxlen).
MAX_SEQUENCE_LENGTH = 80
# Upper bound on the vocabulary size handed to the Keras Tokenizer.
MAX_NUM_WORDS = 500000
# Width of one embedding row; has to match the 300d GloVe file that is loaded.
EMBEDDING_DIM = 300
# Number of units in each GRU of the siamese encoder.
STATE_DIM = 300
# Rate used for both the input dropout and the recurrent dropout of the GRUs.
DROP = 0.2
# Number of stratified cross-validation folds.
NFOLDS = 10

In [None]:
# Load the pre-trained GloVe vectors into a dict: token -> float32 vector.
print('Indexing word vectors.')
embeddings_index = {}
# `with` releases the file handle even when a line fails to parse.
with codecs.open(os.path.join(GLOVE_DIR, 'glove.840B.300d.txt'), encoding='utf-8') as f:
    for line in f:
        # Split from the right: the last EMBEDDING_DIM fields are the numbers and
        # whatever remains on the left is the token. A plain split(' ') breaks on
        # tokens that themselves contain spaces, turning part of the token into a
        # "coefficient" and making the float conversion raise.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Blank or truncated line: there is no complete vector to store.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Read the question pairs. Train rows carry a label; test rows get a dummy 0.
print('Processing text dataset')
train = pd.read_csv(TRAIN_DATA_FILE, encoding='utf-8')
test = pd.read_csv(TEST_DATA_FILE, encoding='utf-8')
ids = train.id
# Cast with the builtin `str`: `np.str` was only an alias for it and has been
# removed from NumPy, where it now raises AttributeError. The cast also turns
# missing questions into ordinary strings so the tokenizer can consume them.
texts_1 = train.question1.astype(str).tolist()
texts_2 = train.question2.astype(str).tolist()
labels = train.is_duplicate.tolist()  # list of label ids
print('Found %s texts.' % len(texts_1))

test_texts_1 = test.question1.astype(str).tolist()
test_texts_2 = test.question2.astype(str).tolist()
test_labels = [0] * len(test)  # placeholder labels, one per test row
print('Found %s texts.' % len(test_texts_1))

In [None]:
# Fit one vocabulary over train and test questions so both share the same word ids.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)
word_index = tokenizer.word_index


def _encode_and_pad(texts):
    """Map raw strings to a fixed-width integer matrix using the fitted tokenizer.

    The intermediate list of id sequences only lives inside this call, so it
    never needs to be deleted by hand afterwards.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)


data_1 = _encode_and_pad(texts_1)
data_2 = _encode_and_pad(texts_2)
print('Found %s unique tokens.' % len(word_index))

labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = _encode_and_pad(test_texts_1)
test_data_2 = _encode_and_pad(test_texts_2)
test_labels = np.array(test_labels)

# Reclaim the memory held by the temporary sequence lists right away.
import gc
gc.collect()

In [None]:
print('Preparing embedding matrix.')
# prepare embedding matrix
# The Keras Tokenizer numbers words from 1 (id 0 is the padding value), so the
# largest id that can occur is len(word_index). The matrix therefore needs
# len(word_index) + 1 rows; without the +1 the highest word id has no row and is
# out of range for an Embedding layer built with input_dim=num_words.
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
count = 0  # in-vocabulary words for which a pre-trained vector was found
for word, i in word_index.items():
    if i >= num_words:
        # Beyond the vocabulary cap: the tokenizer never emits this id.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be zeros
        embedding_matrix[i] = embedding_vector
        count += 1
# Row 0 is padding, so the vocabulary holds num_words - 1 real words.
print('Unknown word embeddings:', num_words - 1 - count)

In [None]:
# def euc_dist(x):
#     'Merge function: euclidean_distance(u,v)'
#     s = x[0] - x[1]
#     output = (s ** 2).sum(axis=1)
#     output = K.reshape(output, (output.shape[0],1))
#     return output

# def euc_dist_shape(input_shape):
#     'Merge output shape'
#     shape = list(input_shape)
#     outshape = (shape[0][0],1)
#     return tuple(outshape)

class Subtract(_Merge):
    """Layer that computes the element-wise squared difference of two inputs.

    It takes as input a list of exactly two tensors of the same shape and
    returns a single tensor (also of the same shape) holding
    ``(inputs[0] - inputs[1]) ** 2``. Note that, despite the class name, the
    result is the *squared* difference and is therefore symmetric in its
    two inputs.

    Raises:
        ValueError: if the layer is called on anything other than two tensors.
    """
    def _merge_function(self, inputs):
        # Anything beyond the first two tensors would be silently ignored, so
        # reject it explicitly instead of returning a misleading result.
        if len(inputs) != 2:
            raise ValueError('Subtract layer expects exactly 2 inputs, '
                             'got %d' % len(inputs))
        return K.square(inputs[0] - inputs[1])

# Frozen lookup table initialised from the pre-trained vectors; the single layer
# instance is reused for both question inputs of every model that gets built.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

def siamese_architecture(seq_len, embed_len, state_len):
    """Build the sentence encoder: two stacked bidirectional GRUs.

    Args:
        seq_len: number of time steps in the embedded input sequence.
        embed_len: size of each embedding vector.
        state_len: number of units of every GRU (per direction).

    Returns:
        A Keras ``Model`` that maps a ``(seq_len, embed_len)`` sequence to the
        output of the second bidirectional GRU. The first GRU returns the
        whole sequence so the second one can run on top of it.
    """
    # Options common to both recurrent layers; dropout rates come from DROP.
    gru_options = dict(units=state_len, dropout=DROP, recurrent_dropout=DROP,
                       implementation=2)

    embedded = Input(shape=(seq_len, embed_len))

    lower_rnn = Bidirectional(GRU(return_sequences=True, **gru_options))
    per_step = lower_rnn(embedded)

    upper_rnn = Bidirectional(GRU(**gru_options))
    encoded = upper_rnn(per_step)

    return Model(inputs=embedded, outputs=encoded)

In [None]:
# Model Architecture #
def create_model():
    """Assemble and compile the siamese duplicate-question classifier.

    Both questions go through the module-level ``embedding_layer`` and one
    shared encoder from ``siamese_architecture``. The two encodings are
    combined as [x, y, (x - y)^2, x * y] and fed to a batch-normalised dense
    head that ends in a single sigmoid unit.

    Returns:
        A compiled Keras ``Model`` taking ``[question1_ids, question2_ids]``
        and trained with binary cross-entropy / Adam, reporting accuracy.
    """
    encoder = siamese_architecture(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, STATE_DIM)

    def encode_question():
        # One input branch: token ids -> frozen embeddings -> shared encoder.
        token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        return token_ids, encoder(embedding_layer(token_ids))

    sequence_1_input, x1 = encode_question()
    sequence_2_input, y1 = encode_question()

    # Pairwise interaction features next to the raw encodings. (A plain
    # Concatenate()([x1, y1]) is the simpler alternative this replaces.)
    squared_diff = Subtract()([x1, y1])
    product = Multiply()([x1, y1])
    hidden = Concatenate()([x1, y1, squared_diff, product])

    # Dense head: BatchNorm before every Dense layer. A Dropout(DROP) after the
    # 128-unit layer is intentionally left out.
    for width in (512, 128):
        hidden = BatchNormalization()(hidden)
        hidden = Dense(width, activation='relu')(hidden)
    hidden = BatchNormalization()(hidden)
    preds = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [None]:
# Build one instance up front so the input shapes and the layer summary can be
# inspected; the cross-validation loop further down creates a fresh model for
# every fold and rebinds `model`.
model = create_model()
print(data_1.shape, data_2.shape, labels.shape)
model.summary()

In [None]:
# Render the layer graph to a PNG on disk, then show that image inline on a
# canvas large enough to keep the layer labels readable.
diagram_path = 'model.png'
plot_model(model, to_file=diagram_path)
fig = plt.figure(figsize=(15, 15))
img = mpimg.imread(diagram_path)
plt.imshow(img)

In [None]:
# Stratified K-fold training: every fold trains a fresh model, restores its best
# checkpoint, and then predicts its held-out rows and the whole test set.
BATCH_SIZE = 512
skf = StratifiedKFold(n_splits=NFOLDS, shuffle=True, random_state=seed).split(data_1, labels)
test_preds = np.zeros((len(test_data_1), NFOLDS))  # one column of test predictions per fold
val_preds = np.zeros(len(data_1))                  # out-of-fold prediction for every train row
# Buffers for the hidden-state export, which is currently commented out below.
train_rnn_vals = np.zeros((len(data_1), STATE_DIM * 4), dtype=np.float32)
test_rnn_vals =  np.zeros((len(test_data_1), STATE_DIM * 4), dtype=np.float32)

X_test = [test_data_1, test_data_2]
now = datetime.datetime.now()  # timestamp reused in the names of the output files

for i, (idx_train, idx_val) in enumerate(skf):
    print('Fold', i+1)
    X_train = [data_1[idx_train], data_2[idx_train]]
    X_val = [data_1[idx_val], data_2[idx_val]]

    y_train = labels[idx_train]
    y_val = labels[idx_val]

    weights_path = 'weights_gru{}.hdf5'.format(i)
    # patience=0 ends training at the first epoch without improvement; the
    # checkpoint keeps only the best weights seen up to that point.
    callbacks = [TQDMNotebookCallback(),
                 EarlyStopping(patience=0),
                 ModelCheckpoint(weights_path, save_best_only=True, save_weights_only=True)]

    model = create_model()
    model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=BATCH_SIZE,
              callbacks=callbacks, shuffle=True, verbose=0)
    # Go back to the best checkpoint instead of the weights of the last epoch.
    model.load_weights(weights_path)

#     feature_extractor = Model(inputs=[model.layers[0].input, model.layers[1].input],
#                               outputs=[model.layers[6].input[0], model.layers[6].input[1], model.layers[-1].output])
#     feature_extractor.predict_function = K.function(feature_extractor._feed_inputs + [K.learning_phase()],
#                                                     feature_extractor.outputs,
#                                                     updates=feature_extractor.state_updates,
#                                                     name='predict_function',
#                                                     **(getattr(feature_extractor, '_function_kwargs', {})))

#     outs = feature_extractor.predict(X_val, batch_size=BATCH_SIZE)
#     train_rnn_vals[idx_val, :600] = outs[0]
#     train_rnn_vals[idx_val, 600:] = outs[1]
    # predict() returns shape (n, 1); flatten it, because an (n, 1) array cannot
    # be assigned into the (n,)-shaped selection val_preds[idx_val].
    val_preds[idx_val] = model.predict(X_val, batch_size=BATCH_SIZE).reshape((-1,))
#     outs = feature_extractor.predict(X_test, batch_size=BATCH_SIZE)
#     test_rnn_vals[:, :600] = outs[0]
#     test_rnn_vals[:, 600:] = outs[1]
    test_preds[:, i] = model.predict(X_test, batch_size=BATCH_SIZE).reshape((-1,))
#     pd.to_pickle(test_rnn_vals, 'GRU_hidden_states_test_{}_{:%Y%m%d_%H%M}.pkl'.format(i+1, now))
#     del model, feature_extractor, outs, test_rnn_vals
#     gc.collect()

In [None]:
# Re-weighting factors for the positive (a) and negative (b) class. With these
# values a prediction of 0.369191399096 is mapped to 0.165.
# NOTE(review): the two rates look like the positive-class share of the training
# data and of the target distribution -- confirm where they come from.
a = 0.165 / 0.369191399096
b = (1 - 0.165) / (1 - 0.369191399096)


def pred_transform(preds):
    """Rescale predicted probabilities with the class weights ``a`` and ``b``.

    Args:
        preds: a probability or an array of probabilities in [0, 1].

    Returns:
        ``a*p / (a*p + b*(1-p))`` evaluated element-wise; 0 and 1 are mapped
        to themselves.
    """
    weighted_pos = a * preds
    weighted_neg = b * (1 - preds)
    return weighted_pos / (weighted_pos + weighted_neg)

In [None]:
from sklearn.metrics import log_loss

# Create the output folder first: without it the first to_csv call fails and
# the predictions of the whole cross-validation run are never written.
if not os.path.isdir('model_out'):
    os.makedirs('model_out')

# Test predictions are averaged over the folds, once as-is and once after the
# pred_transform rescaling.
# NOTE(review): ids are generated as 0..n-1, which assumes the rows of the CSVs
# are already in id order -- confirm against the data files.
test_df = pd.DataFrame({"test_id": np.arange(len(test_data_1)), "is_duplicate":test_preds.mean(1)})
test_preds_mod = pred_transform(test_preds)
test_df_mod = pd.DataFrame({"test_id": np.arange(len(test_data_1)), "is_duplicate":test_preds_mod.mean(1)})
val_df = pd.DataFrame({"id": np.arange(len(data_1)), "is_duplicate": val_preds})

# Out-of-fold log loss over the complete training set.
loss = log_loss(train.is_duplicate, val_preds)
print('Log Loss:', loss)

# now = datetime.datetime.now()

# All file names carry the validation loss and the timestamp taken when the
# cross-validation run was started (`now`).
test_pred_filename = "model_out/test_preds_gru_{:.4f}_{:%Y%m%d_%H%M}.csv.gz".format(loss, now)
test_df.to_csv(test_pred_filename, index=False, compression='gzip')

test_pred_mod_filename = "model_out/test_preds_gru_{:.4f}_{:%Y%m%d_%H%M}_mod.csv.gz".format(loss, now)
test_df_mod.to_csv(test_pred_mod_filename, index=False, compression='gzip')

val_pred_filename = "model_out/train_preds_gru_{:.4f}_{:%Y%m%d_%H%M}.csv.gz".format(loss, now)
val_df.to_csv(val_pred_filename, index=False, compression='gzip')

# pd.to_pickle(train_rnn_vals, 'GRU_hidden_states_train_{:%Y%m%d_%H%M}.pkl'.format(now))