In [None]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import keras.backend as K

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
# Read the train and test CSVs, then report the dimensions of each frame.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
for shape_label, loaded_frame in (("Train shape : ", train_df), ("Test shape : ", test_df)):
    print(shape_label, loaded_frame.shape)

In [None]:
# Show the first training row as a quick sanity check of the columns.
train_df.iloc[:1]

In [None]:
# English stop words from NLTK, materialised as a set; remove_stops tests
# each token for membership in this collection.
stop = set(stopwords.words('english'))
def remove_stops(sentence, stop_words=None):
    """Return ``sentence`` with stop words removed.

    The sentence is split on single spaces, tokens found in the stop-word
    collection are dropped, and the remaining tokens are re-joined with
    single spaces.

    Parameters
    ----------
    sentence : str
        Text to filter.
    stop_words : collection of str, optional
        Tokens to drop. Defaults to the module-level ``stop`` set.

    Returns
    -------
    str
        The filtered sentence.
    """
    if stop_words is None:
        stop_words = stop
    # Join the kept tokens. The previous version joined the loop variable,
    # which produced the characters of the last token instead.
    return " ".join(w for w in sentence.split(" ") if w not in stop_words)
    
def preprocess_questions(questions, remove_stop_words=True):
    """Normalise a Series of question strings.

    Text is lower-cased, missing entries are replaced with "_na_", and,
    when ``remove_stop_words`` is true, each entry is passed through
    ``remove_stops``.
    """
    cleaned = questions.str.lower().fillna("_na_")
    return cleaned.apply(remove_stops) if remove_stop_words else cleaned
    

In [None]:
# Normalise the question text of both frames; stop words are kept.
for question_frame in (train_df, test_df):
    question_frame['question_text'] = preprocess_questions(
        question_frame['question_text'], remove_stop_words=False
    )

In [None]:
## Hold out 10% of the training rows as a validation split
train_df, val_df = train_test_split(train_df, random_state=2018, test_size=0.1)

## Configuration values
embed_size = 300      # length of each word vector
max_features = 50000  # vocabulary cap (number of rows in the embedding matrix)
maxlen = 100          # number of tokens kept per question

## Fit the tokenizer on the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_df["question_text"].values))

## Convert every split to integer sequences and pad them to maxlen
train_X, val_X, test_X = (
    pad_sequences(
        tokenizer.texts_to_sequences(split_df["question_text"].values),
        maxlen=maxlen,
    )
    for split_df in (train_df, val_df, test_df)
)

## Targets for the labelled splits
train_y, val_y = train_df['target'].values, val_df['target'].values

In [None]:
# Row cap for quick development runs. None is falsy, so the `if dev_size:`
# truncation of test_df before writing the submission is skipped.
dev_size = None

In [None]:
## Uncomment this cell for a quick development run on a small subset; keep it commented out for the full run
# dev_size = 500
# train_X = train_X[:dev_size]
# val_X = val_X[:dev_size]
# test_X = test_X[:dev_size]
# train_y = train_y[:dev_size]
# val_y = val_y[:dev_size]

In [None]:
def f1_score(y_true, y_pred):
    """F1 metric computed with Keras backend ops.

    Predictions and labels are clipped to [0, 1] and rounded, so a
    prediction counts as positive when it rounds to 1.

    Parameters
    ----------
    y_true : tensor
        Ground-truth labels.
    y_pred : tensor
        Predicted probabilities.

    Returns
    -------
    tensor
        Scalar F1 value; 0 when there are no true positives.
    """
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))

    # A Python-level `if count == 0` cannot branch on a backend tensor, so
    # the zero-denominator case is handled numerically: adding epsilon keeps
    # every division finite and yields 0 when there are no true positives.
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (actual_positives + K.epsilon())
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

In [None]:
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into (token, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')

# Read inside a context manager so the file handle is closed as soon as the
# index has been built.
with open(EMBEDDING_FILE) as embedding_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is reserved), so the matrix needs
# len(word_index) + 1 rows when the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words missing from the pretrained file keep a random vector drawn from the
# overall mean/std of the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [None]:
# Bidirectional GRU classifier on top of the pretrained embedding matrix.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model_glove = Model(inputs=inp, outputs=x)
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_glove.summary()

In [None]:
# Train the GloVe model, evaluating on the validation split after each epoch.
model_glove.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=4,
    validation_data=(val_X, val_y),
)

In [None]:
def find_best_threshold(preds, y):
    """Scan thresholds in [0.10, 0.50] and return the one with the best F1.

    Parameters
    ----------
    preds : array-like
        Predicted probabilities.
    y : array-like
        Ground-truth binary labels aligned with ``preds``.

    Returns
    -------
    float
        The threshold (rounded to 2 decimals) with the highest F1 score;
        the lowest such threshold wins ties. The score at every threshold
        is printed as a side effect.
    """
    best_thresh = -100
    best_score = -100
    for thresh in np.arange(0.1, 0.501, 0.01):
        thresh = np.round(thresh, 2)
        # Score against the labels passed in, not a module-level variable.
        score = metrics.f1_score(y, (preds > thresh).astype(int))
        if score > best_score:
            # Track the running best score as well; without it every
            # threshold "wins" and the last one scanned is always returned.
            best_thresh, best_score = thresh, score
        print("F1 score at threshold {0} is {1}".format(thresh, score))
    return best_thresh

In [None]:
# Score the validation split with the GloVe model.
pred_glove_val_y = model_glove.predict(
    [val_X],
    batch_size=1024,
    verbose=1,
)
# thresh = find_best_threshold(pred_glove_val_y, val_y)

Results seem to be better than the model without pretrained embeddings.

In [None]:
# Score the test split with the GloVe model.
pred_glove_test_y = model_glove.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Drop the GloVe-specific objects, force a garbage-collection pass, then pause.
import gc

del word_index, embeddings_index, all_embs, embedding_matrix, inp, x
gc.collect()
time.sleep(10)

**Wiki News FastText Embeddings:**

Now let us use the FastText embeddings trained on Wiki News corpus in place of Glove embeddings and rebuild the model.

In [None]:
EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into (token, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')

# Read inside a context manager so the file handle is closed once the index
# is built. Lines of 100 characters or fewer are skipped (presumably the
# .vec header line — confirm against the file).
with open(EMBEDDING_FILE) as embedding_file:
    embeddings_index = dict(
        get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100
    )

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is reserved), so the matrix needs
# len(word_index) + 1 rows when the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words missing from the pretrained file keep a random vector drawn from the
# overall mean/std of the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same bidirectional GRU architecture as the other embedding variants.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model_fasttext = Model(inputs=inp, outputs=x)
model_fasttext.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_score])

In [None]:
# Train the FastText model, evaluating on the validation split after each epoch.
model_fasttext.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=4,
    validation_data=(val_X, val_y),
)

In [None]:
# Score the validation split with the FastText model and scan thresholds.
pred_fasttext_val_y = model_fasttext.predict(
    [val_X],
    batch_size=1024,
    verbose=1,
)
thresh = find_best_threshold(pred_fasttext_val_y, val_y)

In [None]:
# Score the test split with the FastText model.
pred_fasttext_test_y = model_fasttext.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Drop the FastText-specific objects, force a garbage-collection pass, then pause.
import gc

del word_index, embeddings_index, all_embs, embedding_matrix, inp, x
gc.collect()
time.sleep(10)

**Paragram Embeddings:**

In this section, we can use the paragram embeddings and build the model and make predictions.

In [None]:
EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'

def get_coefs(word, *arr):
    """Turn the fields of one embedding line into (token, float32 vector)."""
    return word, np.asarray(arr, dtype='float32')

# Read inside a context manager so the file handle is closed once the index
# is built. Undecodable bytes are dropped (errors='ignore') and lines of 100
# characters or fewer are skipped.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
    embeddings_index = dict(
        get_coefs(*o.split(" ")) for o in embedding_file if len(o) > 100
    )

# np.stack needs a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is reserved), so the matrix needs
# len(word_index) + 1 rows when the vocabulary is smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)
# Words missing from the pretrained file keep a random vector drawn from the
# overall mean/std of the pretrained embeddings.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Same bidirectional GRU architecture as the other embedding variants.
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
x = GlobalMaxPool1D()(x)
x = Dense(16, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(1, activation="sigmoid")(x)
model_paragram = Model(inputs=inp, outputs=x)
model_paragram.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy', f1_score])

In [None]:
# Train the Paragram model, evaluating on the validation split after each epoch.
model_paragram.fit(
    train_X,
    train_y,
    batch_size=512,
    epochs=4,
    validation_data=(val_X, val_y),
)

In [None]:
# Score the validation split with the Paragram model and scan thresholds.
pred_paragram_val_y = model_paragram.predict(
    [val_X],
    batch_size=1024,
    verbose=1,
)
thresh = find_best_threshold(pred_paragram_val_y, val_y)

In [None]:
# Score the test split with the Paragram model.
pred_paragram_test_y = model_paragram.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Drop the Paragram-specific objects, force a garbage-collection pass, then pause.
import gc

del word_index, embeddings_index, all_embs, embedding_matrix, inp, x
gc.collect()
time.sleep(10)

**Observations:**
 * Overall, pretrained embeddings seem to give better results compared to a non-pretrained model. 
 * The performance of the different pretrained embeddings is similar.
 
**Final Blend:**

Though the results of the models with different pre-trained embeddings are similar, there is a good chance that they capture different types of information from the data. So let us do a blend of these three models by averaging their predictions.

In [None]:
# Weighted average of the three models' validation predictions.
pred_val_y = (
    0.33 * pred_glove_val_y
    + 0.33 * pred_fasttext_val_y
    + 0.34 * pred_paragram_val_y
)

The result seems to be better than the individual pre-trained models, so let us create a submission file using this model blend.

In [None]:
# Blend the test predictions with the same weights as the validation blend.
pred_test_y = (
    0.33 * pred_glove_test_y
    + 0.33 * pred_fasttext_test_y
    + 0.34 * pred_paragram_test_y
)
# Pick the cut-off on the blended validation predictions, then binarise.
thresh = find_best_threshold(pred_val_y, val_y)
pred_test_y = (pred_test_y > thresh).astype(int)

In [None]:
# Stack all inputs row-wise; the labelled targets are reshaped to column
# vectors so they line up with the (n, 1) pseudo-labels for the test rows.
full_X = np.vstack([train_X, val_X, test_X])
train_y_col = train_y.reshape(-1, 1)
val_y_col = val_y.reshape(-1, 1)
full_y = np.vstack([train_y_col, val_y_col, pred_test_y])

In [None]:
# Continue training the GloVe model on the pseudo-labelled test rows, then
# re-score the test split.
# NOTE(review): full_X / full_y are built earlier but not used here — confirm
# whether training on the test rows alone is intended.
model_glove.fit(
    test_X,
    pred_test_y,
    batch_size=512,
    epochs=4,
    validation_data=(val_X, val_y),
)
pred_glove_test_y_pseudo = model_glove.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Continue training the FastText model on the pseudo-labelled test rows, then
# re-score the test split.
# NOTE(review): full_X / full_y are built earlier but not used here — confirm
# whether training on the test rows alone is intended.
model_fasttext.fit(
    test_X,
    pred_test_y,
    batch_size=512,
    epochs=4,
    validation_data=(val_X, val_y),
)
pred_fasttext_test_y_pseudo = model_fasttext.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Continue training the Paragram model on the pseudo-labelled test rows, then
# re-score the test split.
# NOTE(review): full_X / full_y are built earlier but not used here — confirm
# whether training on the test rows alone is intended.
model_paragram.fit(
    test_X,
    pred_test_y,
    batch_size=512,
    epochs=4,
    validation_data=(val_X, val_y),
)
pred_paragram_test_y_pseudo = model_paragram.predict(
    [test_X],
    batch_size=1024,
    verbose=1,
)

In [None]:
# Blend the post-pseudo-labelling test predictions with the usual weights.
pred_test_y_pseudo = (
    0.33 * pred_glove_test_y_pseudo
    + 0.33 * pred_fasttext_test_y_pseudo
    + 0.34 * pred_paragram_test_y_pseudo
)
# NOTE(review): the threshold is chosen on pred_val_y, the validation blend
# computed before the pseudo-label fits — confirm this is intended.
thresh = find_best_threshold(pred_val_y, val_y)

In [None]:
# In a dev run only the first dev_size test rows were scored, so trim the
# frame to match before pairing ids with predictions.
if dev_size:
    test_df = test_df.head(dev_size)

# Binarise the blended probabilities at the selected threshold.
pred_test_y_pseudo = (pred_test_y_pseudo > thresh).astype(int)

# Assemble the two-column submission and write it without the index.
out_df = pd.DataFrame({"qid": test_df["qid"].values})
out_df['prediction'] = pred_test_y_pseudo
out_df.to_csv("submission.csv", index=False)