https://www.kaggle.com/lystdo/quora-question-pairs/lstm-with-word2vec-embeddings

In [1]:
import os
# Environment variables are set before the Keras/TensorFlow imports that follow.
# PYTHONIOENCODING expects "encodingname[:errorhandler]"; the earlier value
# '=utf8' carried a stray '=' and therefore did not name a valid codec.
# NOTE(review): the running interpreter reads this variable at start-up, so
# setting it here presumably only affects child processes -- confirm intent.
os.environ['PYTHONIOENCODING'] = 'utf8'
# Order CUDA devices by PCI bus id, then expose only device "1" to this process.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import re
import csv
import codecs
import numpy as np
import pandas as pd
import gc

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

import gensim
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers.advanced_activations import PReLU
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# --- Paths ---
# NOTE(review): absolute, machine-specific location; adjust BASE_DIR when
# running on another machine.
BASE_DIR = '/media/w/1c392724-ecf3-4615-8f3c-79368ec36380/DS Projects/Kaggle/Quora/data/'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
# Directory where ModelCheckpoint writes the '<STAMP>.h5' weight files.
CHECK_DIR = BASE_DIR + '../scripts/models/checkpoints/'

# --- Data / split settings ---
MAX_SEQUENCE_LENGTH = 36   # width of the padded token arrays (see the printed data shape)
MAX_NB_WORDS = 200000
# Fraction of rows held out by create_validset / create_validset2. Those
# functions read this name, but it was never defined in the notebook; 0.1
# reproduces the logged "727722 train / 80858 validation" sample counts
# (2 * int(404290 * 0.9) and 2 * the remaining 40429 rows).
VALIDATION_SPLIT = 0.1

# --- Embedding settings ---
embedding_dim = 300
# The Embedding layers are built with nb_words + 1 rows.
nb_words = 120593

In [3]:
def get_train():
    """Assemble the dense training-feature matrix from precomputed files.

    Loads the two tokenised-question arrays and four feature CSVs from paths
    relative to the current working directory, trims bookkeeping columns, and
    concatenates everything column-wise.

    Returns
    -------
    numpy.ndarray
        ``[keras_q1 | keras_q2 | xgb_feats | abhishek_feats | text_feats |
        img_feats]`` joined along axis 1. All sources must have the same
        number of rows, otherwise ``np.concatenate`` raises ``ValueError``.
    """
    keras_q1 = np.load('../../data/transformed/keras_tokenizer/train_q1_transformed.npy')
    keras_q2 = np.load('../../data/transformed/keras_tokenizer/train_q2_transformed.npy')
    xgb_feats = pd.read_csv('../../data/features/the_1owl/owl_train.csv')
    # The first two columns of the Abhishek file are skipped.
    abhishek_feats = pd.read_csv('../../data/features/abhishek/train_features.csv',
                                 encoding='ISO-8859-1').iloc[:, 2:]
    text_feats = pd.read_csv('../../data/features/other_features/text_features_train.csv',
                             encoding='ISO-8859-1')
    img_feats = pd.read_csv('../../data/features/other_features/img_features_train.csv')

    # NOTE(review): an earlier version also read
    # '../../data/features/srk/SRK_grams_features_train.csv' here but never
    # used the result; the dead read was removed. Add it to the list below if
    # those features were meant to be included.

    # Remove the four z_* length columns, then skip the first eight remaining
    # columns. A non-inplace drop keeps the freshly read frame untouched.
    xgb_feats = xgb_feats.drop(
        ['z_len1', 'z_len2', 'z_word_len1', 'z_word_len2'], axis=1
    ).iloc[:, 8:]

    return np.concatenate(
        [keras_q1, keras_q2, xgb_feats, abhishek_feats, text_feats, img_feats],
        axis=1)

def concat_feats(data1, data2):
    """Append the ``get_train()`` feature matrix to each of two arrays.

    Parameters
    ----------
    data1, data2 : array-like
        2-D arrays with the same number of rows as the matrix returned by
        ``get_train()``.

    Returns
    -------
    tuple
        ``(data1 | feats, data2 | feats)``, each joined along axis 1.
    """
    feats = get_train()
    widened_first = np.concatenate((data1, feats), axis=1)
    widened_second = np.concatenate((data2, feats), axis=1)

    # Drop the local references and run a collection pass before returning.
    del data1, data2, feats
    gc.collect()

    return widened_first, widened_second

In [4]:
def lstm_model(ncols):
    """Build and compile a siamese single-layer LSTM classifier.

    Both inputs pass through the same frozen embedding layer and the same
    LSTM layer; the two encodings are concatenated and fed to a dense head
    that ends in one sigmoid unit.

    Parameters
    ----------
    ncols : int
        Length of each integer token sequence (used for both ``Input`` shapes
        and the embedding ``input_length``).

    Returns
    -------
    keras.models.Model
        Two-input model compiled with binary cross-entropy, the ``nadam``
        optimizer and accuracy as metric.

    Reads the module-level names ``nb_words``, ``embedding_dim``,
    ``word_embedding_matrix``, ``num_lstm``, ``rate_drop_lstm``,
    ``rate_drop_dense``, ``num_dense`` and ``act``.
    """
    # Layers shared by both branches; embedding weights are fixed.
    shared_embedding = Embedding(
        nb_words + 1,
        embedding_dim,
        weights=[word_embedding_matrix],
        input_length=ncols,
        trainable=False,
    )
    shared_lstm = LSTM(
        num_lstm,
        dropout=rate_drop_lstm,
        recurrent_dropout=rate_drop_lstm,
        go_backwards=False,
        implementation=2,
    )

    def encode_question():
        """Create one token input and return it with its LSTM encoding."""
        tokens = Input(shape=(ncols,), dtype='int32')
        return tokens, shared_lstm(shared_embedding(tokens))

    q1_tokens, q1_encoded = encode_question()
    q2_tokens, q2_encoded = encode_question()

    # Dense head: dropout + batch-norm, hidden layer, dropout + batch-norm.
    head = concatenate([q1_encoded, q2_encoded])
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    head = Dense(num_dense, activation=act)(head)
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    probability = Dense(1, activation='sigmoid')(head)

    siamese = Model(inputs=[q1_tokens, q2_tokens], outputs=probability)
    siamese.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    return siamese


def deep_lstm_model():
    """Build and compile a siamese two-layer (stacked) LSTM classifier.

    Each question goes through a shared frozen embedding, a shared LSTM that
    returns the full sequence, and a second shared LSTM with 96 units. The
    two encodings are concatenated and passed to a dense head ending in one
    sigmoid unit.

    Returns
    -------
    keras.models.Model
        Two-input model compiled with binary cross-entropy, the ``nadam``
        optimizer and accuracy as metric.

    Reads the module-level names ``MAX_SEQUENCE_LENGTH``, ``nb_words``,
    ``embedding_dim``, ``word_embedding_matrix``, ``num_lstm``,
    ``rate_drop_lstm``, ``rate_drop_dense``, ``num_dense`` and ``act``.
    """
    shared_embedding = Embedding(
        nb_words + 1,
        embedding_dim,
        weights=[word_embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )
    # First recurrent layer emits every timestep so a second LSTM can follow.
    lower_lstm = LSTM(
        num_lstm,
        dropout=rate_drop_lstm,
        recurrent_dropout=rate_drop_lstm,
        go_backwards=False,
        implementation=2,
        return_sequences=True,
    )
    upper_lstm = LSTM(
        96,
        dropout=rate_drop_lstm,
        recurrent_dropout=rate_drop_lstm,
        go_backwards=False,
        implementation=2,
    )

    def encode_question():
        """Create one token input and return it with its stacked encoding."""
        tokens = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        per_step = lower_lstm(shared_embedding(tokens))
        return tokens, upper_lstm(per_step)

    q1_tokens, q1_encoded = encode_question()
    q2_tokens, q2_encoded = encode_question()

    head = concatenate([q1_encoded, q2_encoded])
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    head = Dense(num_dense, activation=act)(head)
    head = Dropout(rate_drop_dense)(head)
    head = BatchNormalization()(head)

    probability = Dense(1, activation='sigmoid')(head)

    stacked = Model(inputs=[q1_tokens, q2_tokens], outputs=probability)
    stacked.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    return stacked


def merged_lstm(n_features=None):
    """Build and compile the siamese LSTM with an extra dense-feature branch.

    The two token inputs share one frozen embedding and one LSTM. A third
    input carries hand-crafted features through three Dense/PReLU/BatchNorm/
    Dropout blocks (512, 256, 256 units). The three branch outputs are
    concatenated and fed to a dense head ending in one sigmoid unit.

    Parameters
    ----------
    n_features : int, optional
        Width of the dense-feature input. When omitted, the module-level
        ``ncols`` is used, which is what the function always did before this
        parameter existed; ``merged_lstm()`` therefore behaves as before.

    Returns
    -------
    keras.models.Model
        Model with inputs ``[sequence_1, sequence_2, dense_features]``,
        compiled with binary cross-entropy, the ``adam`` optimizer and
        accuracy as metric.

    Also reads the module-level names ``MAX_SEQUENCE_LENGTH``, ``nb_words``,
    ``embedding_dim``, ``word_embedding_matrix``, ``num_lstm``,
    ``rate_drop_lstm``, ``rate_drop_dense`` and ``num_dense``.
    """
    if n_features is None:
        # Backward-compatible fallback to the notebook-level variable.
        n_features = ncols

    def _dense_block(tensor, units):
        """Dense(he_normal) -> PReLU -> BatchNormalization -> Dropout(0.4)."""
        tensor = Dense(units, kernel_initializer='he_normal')(tensor)
        tensor = PReLU()(tensor)
        tensor = BatchNormalization()(tensor)
        return Dropout(0.4)(tensor)

    embedding_layer = Embedding(nb_words + 1,
                                embedding_dim,
                                weights=[word_embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,
                      go_backwards=False, implementation=2)

    # Siamese text branches: identical layers applied to each question.
    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    x1 = lstm_layer(embedding_layer(sequence_1_input))

    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    y1 = lstm_layer(embedding_layer(sequence_2_input))

    # Dense-feature branch.
    dense_input = Input(shape=(n_features,))
    d3 = dense_input
    for units in (512, 256, 256):
        d3 = _dense_block(d3, units)

    merged = concatenate([x1, y1, d3])
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    merged = Dense(num_dense)(merged)
    merged = PReLU()(merged)
    merged = Dropout(rate_drop_dense)(merged)
    merged = BatchNormalization()(merged)

    preds = Dense(1, activation='sigmoid')(merged)

    model = Model(inputs=[sequence_1_input, sequence_2_input, dense_input], outputs=preds)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    return model


In [5]:
def create_validset(data_1, data_2, datafeats, labels, validation_split=None):
    """Split into train/validation sets and mirror every question pair.

    Rows are shuffled with a fixed seed and cut once into a training and a
    validation part. Each part is then doubled: the second half holds the
    same pairs with question 1 and question 2 swapped, while features and
    labels are simply repeated.

    Parameters
    ----------
    data_1, data_2 : numpy.ndarray
        Row-aligned arrays for the first and second question.
    datafeats : numpy.ndarray
        Row-aligned dense features.
    labels : numpy.ndarray
        Row-aligned targets.
    validation_split : float, optional
        Fraction of rows held out for validation. Defaults to the
        module-level ``VALIDATION_SPLIT``.

    Returns
    -------
    tuple
        ``(data_1_train, data_2_train, dataf_train, labels_train,
        data_1_val, data_2_val, dataf_val, labels_val)``.

    Raises
    ------
    ValueError
        If the four inputs do not have the same number of rows.

    Notes
    -----
    Calls ``np.random.seed(1234)``, i.e. it reseeds NumPy's global random
    state as a side effect.
    """
    if validation_split is None:
        validation_split = VALIDATION_SPLIT

    n_rows = len(data_1)
    if not (len(data_2) == len(datafeats) == len(labels) == n_rows):
        raise ValueError(
            'data_1, data_2, datafeats and labels must have the same length; '
            'got %d, %d, %d and %d'
            % (n_rows, len(data_2), len(datafeats), len(labels)))

    np.random.seed(1234)
    perm = np.random.permutation(n_rows)
    # Compute the cut point once so both slices are guaranteed to agree.
    n_train = int(n_rows * (1 - validation_split))
    idx_train, idx_val = perm[:n_train], perm[n_train:]

    # Second half of every output is the swapped (q2, q1) version of the first.
    data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
    data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
    labels_train = np.concatenate((labels[idx_train], labels[idx_train]))
    dataf_train = np.vstack((datafeats[idx_train], datafeats[idx_train]))

    data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
    data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
    labels_val = np.concatenate((labels[idx_val], labels[idx_val]))
    dataf_val = np.vstack((datafeats[idx_val], datafeats[idx_val]))
    return (data_1_train, data_2_train, dataf_train, labels_train,
            data_1_val, data_2_val, dataf_val, labels_val)


def create_validset2(data_1, data_2, labels, validation_split=None):
    """Split into train/validation sets and mirror every question pair.

    Same procedure as ``create_validset`` but without a dense-feature array:
    rows are shuffled with a fixed seed, cut once, and each part is doubled
    by appending the pairs with question 1 and question 2 swapped.

    Parameters
    ----------
    data_1, data_2 : numpy.ndarray
        Row-aligned arrays for the first and second question.
    labels : numpy.ndarray
        Row-aligned targets.
    validation_split : float, optional
        Fraction of rows held out for validation. Defaults to the
        module-level ``VALIDATION_SPLIT``.

    Returns
    -------
    tuple
        ``(data_1_train, data_2_train, labels_train,
        data_1_val, data_2_val, labels_val)``.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of rows.

    Notes
    -----
    Calls ``np.random.seed(1234)``, i.e. it reseeds NumPy's global random
    state as a side effect.
    """
    if validation_split is None:
        validation_split = VALIDATION_SPLIT

    n_rows = len(data_1)
    if not (len(data_2) == len(labels) == n_rows):
        raise ValueError(
            'data_1, data_2 and labels must have the same length; '
            'got %d, %d and %d' % (n_rows, len(data_2), len(labels)))

    np.random.seed(1234)
    perm = np.random.permutation(n_rows)
    # Compute the cut point once so both slices are guaranteed to agree.
    n_train = int(n_rows * (1 - validation_split))
    idx_train, idx_val = perm[:n_train], perm[n_train:]

    # Second half of every output is the swapped (q2, q1) version of the first.
    data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
    data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
    labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

    data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
    data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
    labels_val = np.concatenate((labels[idx_val], labels[idx_val]))
    return data_1_train, data_2_train, labels_train, data_1_val, data_2_val, labels_val


In [5]:
# Pre-tokenised arrays live under BASE_DIR; deriving the path from it avoids a
# second hard-coded absolute path (the resulting string is unchanged).
data_src = BASE_DIR + 'transformed/keras_tokenizer/'

# Training question arrays and their labels.
data_1 = np.load(data_src + 'train_q1_transformed.npy')
data_2 = np.load(data_src + 'train_q2_transformed.npy')
labels = np.load(data_src + 'train_labels.npy')
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

# Test question arrays and the ids used for the submission file.
test_data_1 = np.load(data_src + 'test_q1_transformed.npy')
test_data_2 = np.load(data_src + 'test_q2_transformed.npy')
test_ids = np.load(data_src + 'test_ids.npy')
print('Shape of test data tensor:', test_data_1.shape)

# Embedding weights passed to the Embedding layers of the model builders.
word_embedding_matrix = np.load(data_src + 'embedding_matrix.npy')
# The model builders create Embedding(nb_words + 1, embedding_dim) with these
# weights, so fail early with a readable message if the constants are stale.
assert word_embedding_matrix.shape == (nb_words + 1, embedding_dim), (
    'embedding matrix shape %r does not match (nb_words + 1, embedding_dim) = %r'
    % (word_embedding_matrix.shape, (nb_words + 1, embedding_dim)))

Shape of data tensor: (404290, 36)
Shape of label tensor: (404290,)


In [7]:
def create_validset3(data_1, data_2, labels):
    """Stratified 80/20 split with every question pair mirrored.

    Uses a single ``train_test_split`` call over both question arrays and the
    labels, so the three outputs are row-aligned by construction rather than
    relying on two separate calls sharing the same ``random_state``. Each
    part is then doubled by appending the pairs with question 1 and
    question 2 swapped; labels are repeated accordingly.

    Parameters
    ----------
    data_1, data_2 : numpy.ndarray
        Row-aligned arrays for the first and second question.
    labels : numpy.ndarray
        Row-aligned targets; also used for stratification.

    Returns
    -------
    tuple
        ``(data_1_train, data_2_train, labels_train,
        data_1_val, data_2_val, labels_val)``.
    """
    data1_tr, data1_val, data2_tr, data2_val, y_tr, y_val = train_test_split(
        data_1, data_2, labels,
        stratify=labels, test_size=0.2, random_state=111)

    # Second half of every output is the swapped (q2, q1) version of the first.
    data_1_train = np.vstack((data1_tr, data2_tr))
    data_2_train = np.vstack((data2_tr, data1_tr))
    labels_train = np.concatenate((y_tr, y_tr))

    data_1_val = np.vstack((data1_val, data2_val))
    data_2_val = np.vstack((data2_val, data1_val))
    labels_val = np.concatenate((y_val, y_val))
    return data_1_train, data_2_train, labels_train, data_1_val, data_2_val, labels_val

In [8]:
# Build the dense feature matrix, then split everything into mirrored
# train / validation sets in one go.
datafeats = get_train()
(data_1_train, data_2_train, dataf_train, labels_train,
 data_1_val, data_2_val, dataf_val, labels_val) = create_validset(
    data_1, data_2, datafeats, labels)

# The unsplit arrays are no longer needed; drop them and collect garbage.
# Note: this makes the cell non-re-runnable without reloading the data first.
del data_1, data_2, labels, datafeats
gc.collect()

96

In [11]:
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

# Class weights and the matching per-sample validation weights are derived
# from one place so the two cannot drift apart.
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
    weight_val = np.where(labels_val == 0, class_weight[0], class_weight[1])
else:
    class_weight = None
    weight_val = np.ones(len(labels_val))

# Randomly sampled hyper-parameters (draw order matters for reproducibility).
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25
act = 'relu'
# Width of the dense-feature matrix; read by merged_lstm().
ncols = dataf_train.shape[1]

# Run identifier built from the sampled hyper-parameters.
STAMP = 'lstm_mergedfullfeats_%d_%d_%.2f_%.2f' % (
    num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)
print('Model stamp:', STAMP)

# Stop after 5 epochs without val_loss improvement; keep only the best weights.
check_path = CHECK_DIR + STAMP + '.h5'
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
model_checkpoint = ModelCheckpoint(check_path, save_best_only=True, save_weights_only=True)

Model stamp: lstm_mergedfullfeats_220_141_0.36_0.27


Reference result: val_loss 0.2488, val_acc 0.8175 for the standard LSTM model concatenated with Abhishek's features.

In [12]:
# Train the three-input model (two token sequences + dense features).
# Validation samples carry weight_val; training uses class_weight.
model = merged_lstm()
hist = model.fit(
    [data_1_train, data_2_train, dataf_train],
    labels_train,
    validation_data=([data_1_val, data_2_val, dataf_val], labels_val, weight_val),
    epochs=200,
    batch_size=1024,
    shuffle=True,
    class_weight=class_weight,
    callbacks=[early_stopping, model_checkpoint],
)

Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200


In [None]:
# Train the two-input siamese LSTM (token sequences only, no dense features).
# lstm_model's argument is the length of each token sequence, so it is taken
# from the token array itself rather than from ncols, which holds the width
# of the dense-feature matrix and would not match the data fed to fit().
# NOTE(review): model_checkpoint is shared with the merged_lstm cell, so this
# run writes its best weights to the same check_path file.
model = lstm_model(data_1_train.shape[1])
hist = model.fit(
    [data_1_train, data_2_train], labels_train,
    validation_data=([data_1_val, data_2_val], labels_val, weight_val),
    epochs=200, batch_size=2048, shuffle=True,
    class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

In [None]:
# Restore the best weights written by ModelCheckpoint. The checkpoint callback
# saves to check_path; the previously referenced bst_model_path is not defined
# anywhere in this notebook.
model.load_weights(check_path)
bst_val_score = min(hist.history['val_loss'])

print('Start making the submission before fine-tuning')
# Average the predictions for (q1, q2) and (q2, q1) so the result does not
# depend on question order.
# NOTE(review): this feeds two inputs, so `model` must be one of the two-input
# models (lstm_model / deep_lstm_model), not merged_lstm.
preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
preds /= 2

# File name: '<best val_loss>_<STAMP>.csv' in the current working directory.
submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

In [None]:
# Blend two earlier submissions by taking the mean of their predictions.
# Rows are combined by index, so both files must list test ids in the same order.
sub1 = pd.read_csv('0.2615_lstm_223_141_0.31_0.20.csv')
sub2 = pd.read_csv('0.2647_lstm_198_127_0.20_0.19.csv')

sub_avg = sub1.assign(
    is_duplicate=(sub1['is_duplicate'] + sub2['is_duplicate']) / 2,
    test_id=sub1['test_id'],
)
sub_avg.to_csv('submission_first_two_avg.csv', index=False)