https://www.kaggle.com/lystdo/quora-question-pairs/lstm-with-word2vec-embeddings

In [None]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import re
import csv
import codecs
import numpy as np
import pandas as pd
import gc
import sys


from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

import gensim
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.advanced_activations import PReLU
from sklearn.model_selection import train_test_split

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten, Concatenate, LSTM, Lambda, Dropout, Multiply
from keras.layers import Conv1D, MaxPooling1D, Embedding, SpatialDropout1D, GRU
from keras.layers.merge import _Merge
from keras.models import Model
from keras.layers.wrappers import TimeDistributed, Bidirectional
from keras.layers.normalization import BatchNormalization
from keras import backend as K
from keras.utils import plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
def lstm_model(ncols):
    """Build and compile a siamese single-layer LSTM classifier.

    Two integer token sequences are passed through the same frozen
    embedding (initialised from ``word_embedding_matrix``) and the same
    LSTM layer. The two LSTM outputs are concatenated and fed to a dense
    head that ends in a single sigmoid unit.

    Reads the module-level settings ``nb_words``, ``embedding_dim``,
    ``word_embedding_matrix``, ``num_lstm``, ``rate_drop_lstm``,
    ``rate_drop_dense``, ``num_dense`` and ``act``.

    Args:
        ncols: Number of token ids in each input sequence.

    Returns:
        A compiled Keras ``Model`` whose inputs are
        ``[sequence_1, sequence_2]``.
    """
    shared_embedding = Embedding(
        nb_words,
        embedding_dim,
        weights=[word_embedding_matrix],
        input_length=ncols,
        trainable=False,
    )
    shared_lstm = LSTM(
        num_lstm,
        dropout=rate_drop_lstm,
        recurrent_dropout=rate_drop_lstm,
        go_backwards=False,
        implementation=2,
    )

    def encode_branch():
        # One siamese branch: token ids -> shared embedding -> shared LSTM.
        tokens = Input(shape=(ncols,), dtype='int32')
        embedded = shared_embedding(tokens)
        return tokens, shared_lstm(embedded)

    left_input, left_encoded = encode_branch()
    right_input, right_encoded = encode_branch()

    # Classification head on top of the concatenated branch encodings.
    head = concatenate([left_encoded, right_encoded])
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    head = Dense(num_dense, activation=act)(head)
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    probability = Dense(1, activation='sigmoid')(head)

    network = Model(inputs=[left_input, right_input], outputs=probability)
    network.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    return network


def deep_lstm_model():
    """Build and compile a siamese classifier with two stacked LSTM layers.

    Each of the two token sequences (length ``MAX_SEQUENCE_LENGTH``) goes
    through the shared frozen embedding, then a first LSTM that returns
    the full sequence, then a second 96-unit LSTM. The two branch outputs
    are concatenated and classified by a dense head with a sigmoid output.

    Reads the module-level settings ``nb_words``, ``embedding_dim``,
    ``word_embedding_matrix``, ``MAX_SEQUENCE_LENGTH``, ``num_lstm``,
    ``rate_drop_lstm``, ``rate_drop_dense``, ``num_dense`` and ``act``.

    Returns:
        A compiled Keras ``Model`` whose inputs are
        ``[sequence_1, sequence_2]``.
    """
    shared_embedding = Embedding(
        nb_words,
        embedding_dim,
        weights=[word_embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )

    # The lower LSTM must emit every timestep so the upper one can consume it.
    lower_lstm = LSTM(
        num_lstm,
        dropout=rate_drop_lstm,
        recurrent_dropout=rate_drop_lstm,
        go_backwards=False,
        implementation=2,
        return_sequences=True,
    )
    upper_lstm = LSTM(
        96,
        dropout=rate_drop_lstm,
        recurrent_dropout=rate_drop_lstm,
        go_backwards=False,
        implementation=2,
    )

    def encode_branch():
        # One siamese branch: token ids -> embedding -> LSTM -> LSTM.
        tokens = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded = shared_embedding(tokens)
        per_step = lower_lstm(embedded)
        return tokens, upper_lstm(per_step)

    left_input, left_encoded = encode_branch()
    right_input, right_encoded = encode_branch()

    head = concatenate([left_encoded, right_encoded])
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    head = Dense(num_dense, activation=act)(head)
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    probability = Dense(1, activation='sigmoid')(head)

    network = Model(inputs=[left_input, right_input], outputs=probability)
    network.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    return network


def merged_lstm():
    """Build and compile a siamese LSTM combined with a dense feature tower.

    Two token sequences share a frozen embedding and one LSTM layer. A
    third input of width ``ncols`` (module-level) goes through three
    Dense -> PReLU -> BatchNormalization -> Dropout blocks. The two LSTM
    outputs and the tower output are concatenated and classified by a
    dense head with a sigmoid output.

    Reads the module-level settings ``nb_words``, ``embedding_dim``,
    ``word_embedding_matrix``, ``MAX_SEQUENCE_LENGTH``, ``ncols``,
    ``num_lstm``, ``rate_drop_lstm``, ``rate_drop_dense`` and
    ``num_dense``.

    Returns:
        A compiled Keras ``Model`` whose inputs are
        ``[sequence_1, sequence_2, dense_features]``.
    """
    shared_embedding = Embedding(
        nb_words,
        embedding_dim,
        weights=[word_embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )
    shared_lstm = LSTM(
        num_lstm,
        dropout=rate_drop_lstm,
        recurrent_dropout=rate_drop_lstm,
        go_backwards=False,
        implementation=2,
    )

    def encode_branch():
        # One siamese branch: token ids -> shared embedding -> shared LSTM.
        tokens = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedded = shared_embedding(tokens)
        return tokens, shared_lstm(embedded)

    left_input, left_encoded = encode_branch()
    right_input, right_encoded = encode_branch()

    # Dense tower over the hand-crafted feature columns.
    feature_input = Input(shape=(ncols,))
    tower = feature_input
    for units, drop_rate in ((256, 0.4), (512, 0.2), (512, 0.2)):
        tower = Dense(units, kernel_initializer='he_normal')(tower)
        tower = PReLU()(tower)
        tower = BatchNormalization()(tower)
        tower = Dropout(drop_rate)(tower)

    head = concatenate([left_encoded, right_encoded, tower])
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    head = Dense(num_dense)(head)
    head = PReLU()(head)
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    probability = Dense(1, activation='sigmoid')(head)

    network = Model(inputs=[left_input, right_input, feature_input],
                    outputs=probability)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return network


class Subtract(_Merge):
    """Merge layer returning the element-wise squared difference of two inputs.

    Despite the name, the output is not a plain difference: for an input
    list ``[a, b]`` the layer returns ``(a - b) ** 2``. Only the first two
    tensors of the list are used.
    """
    def _merge_function(self, inputs):
        first = inputs[0]
        second = inputs[1]
        return K.square(first - second)

def siamese_architecture(seq_len, embed_len, state_len):
    """Build the two-layer bidirectional GRU encoder shared by both branches.

    The returned model takes already-embedded sequences and runs them
    through a bidirectional GRU that returns every timestep, followed by
    a second bidirectional GRU. Dropout rates come from the module-level
    ``rate_drop_dense`` and ``rate_drop_lstm``.

    Args:
        seq_len: Number of timesteps in each input sequence.
        embed_len: Size of each timestep's embedding vector.
        state_len: Number of units of each GRU direction.

    Returns:
        An uncompiled Keras ``Model`` mapping an embedded sequence to its
        encoding.
    """
    # NOTE(review): the input dropout uses rate_drop_dense rather than
    # rate_drop_lstm -- confirm this is intentional.
    gru_options = dict(
        units=state_len,
        dropout=rate_drop_dense,
        recurrent_dropout=rate_drop_lstm,
        implementation=2,
    )

    embedded = Input(shape=(seq_len, embed_len))
    per_step = Bidirectional(GRU(return_sequences=True, **gru_options))(embedded)
    encoded = Bidirectional(GRU(**gru_options))(per_step)
    return Model(inputs=embedded, outputs=encoded)

def create_model():
    """Build and compile the siamese bidirectional-GRU duplicate classifier.

    Both token sequences share a frozen embedding (initialised from
    ``word_embedding_matrix``) and the encoder returned by
    ``siamese_architecture``. The two encodings ``x`` and ``y`` are
    combined as ``[x, y, (x - y) ** 2, x * y]`` and classified by a
    512 -> 128 -> 1 dense head with batch normalisation between layers.

    Reads the module-level settings ``nb_words``, ``EMBEDDING_DIM``,
    ``MAX_SEQUENCE_LENGTH``, ``word_embedding_matrix`` and ``num_lstm``.

    Returns:
        A compiled Keras ``Model`` whose inputs are
        ``[sequence_1, sequence_2]``.
    """
    # The embedding must agree with the Input layers and with the encoder
    # below, so its size comes from the shared constants rather than from
    # literals (previously 300 / 170, which disagreed with
    # MAX_SEQUENCE_LENGTH).
    embedding_layer = Embedding(nb_words, EMBEDDING_DIM,
                                weights=[word_embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    siamese_arch = siamese_architecture(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM, num_lstm)

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = siamese_arch(embedded_sequences_1)

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences_2 = embedding_layer(sequence_2_input)
    y1 = siamese_arch(embedded_sequences_2)

    # Interaction features between the two encodings.
    merged_sub = Subtract()([x1, y1])      # squared difference
    merged_mult = Multiply()([x1, y1])     # element-wise product
    merged_comb = Concatenate()([x1, y1, merged_sub, merged_mult])

    merged = BatchNormalization()(merged_comb)
    merged = Dense(512, activation='relu')(merged)
    merged = BatchNormalization()(merged)
    merged = Dense(128, activation='relu')(merged)
    merged = BatchNormalization()(merged)
    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model

In [None]:
def create_mergevalidset(data_1, data_2, datafeats, labels,
                         validation_split=None, seed=1234):
    """Split paired data into mirrored training and validation sets.

    Samples are shuffled with a fixed seed and split into a training and
    a validation part. Each part is then doubled by appending the same
    pairs with the two sides swapped, so ``(a, b)`` and ``(b, a)`` both
    appear; features and labels are repeated accordingly.

    Args:
        data_1: Array of first-side samples, indexable by an integer array.
        data_2: Array of second-side samples, same length as ``data_1``.
        datafeats: Per-pair feature array, same length as ``data_1``.
        labels: Per-pair labels (array-like or pandas Series); converted
            to a NumPy array so selection is always positional.
        validation_split: Fraction of samples held out for validation.
            Defaults to the module-level ``VALIDATION_SPLIT``.
        seed: Seed for the shuffle. Defaults to 1234.

    Returns:
        Tuple ``(data_1_train, data_2_train, dataf_train, labels_train,
        data_1_val, data_2_val, dataf_val, labels_val)``.

    Raises:
        ValueError: If the four inputs do not have the same length.
    """
    if validation_split is None:
        validation_split = VALIDATION_SPLIT

    n_samples = len(data_1)
    if not (len(data_2) == len(datafeats) == len(labels) == n_samples):
        raise ValueError('data_1, data_2, datafeats and labels must have '
                         'the same length')

    # A pandas Series would be indexed by label, not by position.
    labels = np.asarray(labels)

    # NOTE: this reseeds NumPy's global RNG, so later np.random draws in
    # the same session are affected as well.
    np.random.seed(seed)
    perm = np.random.permutation(n_samples)
    n_train = int(n_samples * (1 - validation_split))

    def _mirrored(idx):
        # Stack the pairs with their swapped counterparts.
        left = np.vstack((data_1[idx], data_2[idx]))
        right = np.vstack((data_2[idx], data_1[idx]))
        feats = np.vstack((datafeats[idx], datafeats[idx]))
        targets = np.concatenate((labels[idx], labels[idx]))
        return left, right, feats, targets

    return _mirrored(perm[:n_train]) + _mirrored(perm[n_train:])


def create_stratified_split(data_1, data_2, labels):
    """Split both sides of the pairs into stratified train/validation sets.

    All three arrays are split in a single ``train_test_split`` call, so
    they share the same 80/20 partition, stratified on ``labels`` with a
    fixed ``random_state``.

    Args:
        data_1: Array of first-side samples.
        data_2: Array of second-side samples, same length as ``data_1``.
        labels: Per-pair labels used both as targets and for stratification.

    Returns:
        Tuple ``(data1_tr, data2_tr, y_tr, data1_val, data2_val, y_val)``.
    """
    (data1_tr, data1_val,
     data2_tr, data2_val,
     y_tr, y_val) = train_test_split(data_1, data_2, labels,
                                     stratify=labels,
                                     test_size=0.2,
                                     random_state=111)
    # The fourth item used to be data2_tr again; it must be the first-side
    # validation data.
    return data1_tr, data2_tr, y_tr, data1_val, data2_val, y_val

In [None]:
# --- Locations on disk -------------------------------------------------------
BASE_DIR = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/'
TRAIN_DATA_FILE = os.path.join(BASE_DIR, 'train.csv')
TEST_DATA_FILE = os.path.join(BASE_DIR, 'test.csv')
# Model checkpoints are written next to the data directory.
CHECK_DIR = os.path.join(BASE_DIR, '../scripts/models/checkpoints/')

# --- Sequence / embedding settings -------------------------------------------
MAX_SEQUENCE_LENGTH = 128
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

# Lower-case names used by the model builders.
embedding_dim = EMBEDDING_DIM
nb_words = 120594

In [None]:
# --- Cleaned question frames (only the test ids are kept from these) ---------
src_train = '../features/df_train_spacylemmat_fullclean.csv'
src_test = '../features/df_test_spacylemmat_fullclean.csv'

df_train = pd.read_csv(src_train)
df_test = pd.read_csv(src_test)

# Missing values are replaced by the literal string 'NULL'.
df_train.fillna('NULL', inplace=True)
df_test.fillna('NULL', inplace=True)

test_ids = np.array(df_test['test_id'])

# --- Embedding matrix --------------------------------------------------------
data_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/transformed/keras_tokenizer/'
word_embedding_matrix = np.load(data_src + 'embedding_matrix.npy')

# --- Padded training sequences, extra features and labels --------------------
q_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/NER/'
feats_src = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/features/uncleaned/'

q1 = np.load(q_src + 'q1train_NER_128len.npy')
q2 = np.load(q_src + 'q2train_NER_128len.npy')

X = pd.read_pickle('Xtrain_500bestCols.pkl')
y = pd.read_csv(feats_src + '/the_1owl/owl_train.csv')['is_duplicate']

# The raw frames are no longer needed; release them before training.
del df_train, df_test
gc.collect()

In [None]:
# Build the mirrored train/validation arrays, then drop the unsplit inputs.
(data_1_train, data_2_train, dataf_train, labels_train,
 data_1_val, data_2_val, dataf_val, labels_val) = create_mergevalidset(
    q1, q2, X.values, y)

del q1, q2, X, y
gc.collect()

In [None]:
# --- Class re-weighting ------------------------------------------------------
# When enabled, label 0 is weighted 1.309028344 and label 1 is weighted
# 0.472001959, both for the validation sample weights and for training.
re_weight = True
if re_weight:
    weight_val = np.where(labels_val == 0, 1.309028344, 0.472001959)
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    weight_val = np.ones(len(labels_val))
    class_weight = None

# --- Randomly drawn hyper-parameters (draw order matters for the RNG) --------
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
act = 'relu'
ncols = dataf_train.shape[1]

# --- Run identifier and callbacks --------------------------------------------
STAMP = 'BiGRU_fred_%d_%d_%.2f_%.2f' % (
    num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)
print('Model stamp:', STAMP)

early_stopping = EarlyStopping(monitor='val_loss', patience=2)
check_path = CHECK_DIR + STAMP + '.h5'
model_checkpoint = ModelCheckpoint(check_path,
                                   save_best_only=True,
                                   save_weights_only=True)

In [None]:
# Train the siamese model; early stopping and checkpointing are handled by
# the callbacks, and the validation set carries per-sample weights.
model = create_model()
hist = model.fit(
    [data_1_train, data_2_train],
    labels_train,
    validation_data=([data_1_val, data_2_val], labels_val, weight_val),
    epochs=200,
    batch_size=512,
    shuffle=True,
    class_weight=class_weight,
    callbacks=[early_stopping, model_checkpoint],
)

In [None]:
# Restore the best weights saved by ModelCheckpoint. The checkpoint is
# written to check_path; bst_model_path is not defined in this notebook.
model.load_weights(check_path)
bst_val_score = min(hist.history['val_loss'])

print('Start making the submission before fine-tuning')
# NOTE(review): test_data_1 / test_data_2 are not defined in the visible
# cells -- presumably the padded test sequences must be loaded first.
# Predictions are averaged over both question orders.
preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
preds /= 2

# The submission file name starts with the best validation loss.
submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

In [None]:
# Blend two earlier submissions by averaging their predicted probabilities.
sub1 = pd.read_csv('0.2615_lstm_223_141_0.31_0.20.csv')
sub2 = pd.read_csv('0.2647_lstm_198_127_0.20_0.19.csv')

sub_avg = sub1.copy()
blended = sub1['is_duplicate'] + sub2['is_duplicate']
sub_avg['is_duplicate'] = blended / 2
sub_avg['test_id'] = sub1['test_id']
sub_avg.to_csv('submission_first_two_avg.csv', index=False)