In [1]:
import os
import time
import numpy as np # linear algebra                                                                                                                                                                         
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)                                                                                                                                      
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


import matplotlib.pylab as plt

Using TensorFlow backend.


**Load data**

In [2]:
# Read the train and test CSVs (train first, then test) and report their shapes.
train_df, test_df = map(pd.read_csv, ("../input/train.csv", "../input/test.csv"))
print("Train shape : ", train_df.shape)
print("Test shape : ", test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


**Set up the training dataset and hold-out validation split**

In [3]:
# Hold-out split (a single train/validation split, not k-fold cross validation):
# 10% of the rows of train_df go to val_df, with a fixed random_state.
# NOTE(review): this rebinds train_df to the remaining 90%, so re-running this
# cell without first re-running the load cell splits the already-reduced frame again.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2018)

**Preprocess the data**

In [4]:
# Turn the raw question text into fixed-length integer sequences.

## Configuration
embed_size = 300      # dimensionality of each word vector
max_features = 50000  # vocabulary cap passed to the Tokenizer (rows of the embedding matrix)
maxlen = 20           # every question is padded/truncated to this many tokens

## Raw text per split, with missing questions replaced by the "_na_" placeholder
train_X, val_X, test_X = (
    frame["question_text"].fillna("_na_").values
    for frame in (train_df, val_df, test_df)
)

## Fit the vocabulary on the training questions only
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))

## Map each split to word-index sequences, then pad to maxlen
train_X, val_X, test_X = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)
    for texts in (train_X, val_X, test_X)
)

## Labels for the two labelled splits
train_y = train_df['target'].values
val_y = val_df['target'].values

**Build your model**

In [5]:
# Location of the pre-trained GloVe vectors (one token followed by its coefficients per line).
EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'


def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Args:
        word: the token (first field of an embedding-file line).
        *arr: the remaining fields, convertible to float.

    Returns:
        A ``(word, vector)`` tuple where ``vector`` is a 1-D float32 ndarray.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding file into {token: float32 vector}. The context manager
# closes the file handle once parsing is done, and the explicit encoding avoids
# depending on the platform's default locale.
with open(EMBEDDING_FILE, encoding="utf8") as embedding_file:
    embeddings_index = dict(get_coefs(*line.split(" ")) for line in embedding_file)

# np.stack needs a real sequence; a dict_values view is deprecated/rejected by NumPy.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean/std of the pre-trained vectors, used to initialise words without a vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()
# The embedding width is taken from the file itself (overrides the earlier config value).
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
# Keras word indices start at 1 (row 0 is the padding index), so the number of rows
# that can actually be referenced is len(word_index) + 1, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# The Embedding layer is created with input_dim=max_features, so the weight matrix
# must always have exactly max_features rows, even when the vocabulary is smaller.
embedding_matrix = np.random.normal(emb_mean, emb_std, (max_features, embed_size))
for word, i in word_index.items():
    # Words ranked beyond the vocabulary cap are never emitted by the tokenizer.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from the pre-trained file keep their random initialisation.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        
        
def get_model():
    """Build and compile the question classifier.

    Architecture: Embedding (initialised from ``embedding_matrix``) ->
    bidirectional CuDNNGRU(64) returning sequences -> global max pooling ->
    Dense(16, relu) -> Dropout(0.1) -> Dense(1, sigmoid).

    Reads the notebook globals ``maxlen``, ``max_features``, ``embed_size``
    and ``embedding_matrix``.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss, the
        'adam' optimizer and the accuracy metric.
    """
    tokens = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix])(tokens)
    encoded = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)
    pooled = GlobalMaxPool1D()(encoded)
    hidden = Dense(16, activation="relu")(pooled)
    regularised = Dropout(0.1)(hidden)
    probability = Dense(1, activation="sigmoid")(regularised)

    classifier = Model(inputs=tokens, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier


In [6]:
# Three identically built models, one per embedding-training schedule:
#   'Embedding_Always_Trainable' - embedding layer trainable throughout
#   'Embedding_Frozen_At_End'    - trainable at the beginning of training, frozen at the end
#   'Embedding_Frozen_At_Start'  - frozen at the beginning of training, trainable at the end
# The tuple order fixes both the construction order and the dict iteration order.
models = {
    schedule: get_model()
    for schedule in (
        'Embedding_Always_Trainable',
        'Embedding_Frozen_At_End',
        'Embedding_Frozen_At_Start',
    )
}


In [7]:
def set_embedding_trainable(model, trainable):
    """Toggle the first embedding layer of ``model`` and recompile it.

    The first layer whose name starts with 'embedding' gets its ``trainable``
    flag set and a confirmation line is printed. The model is then recompiled
    with the same loss/optimizer/metrics settings, whether or not a matching
    layer was found.

    Args:
        model: object exposing ``layers`` and ``compile`` (a Keras model here).
        trainable: value to assign to the layer's ``trainable`` attribute.
    """
    target = next(
        (candidate for candidate in model.layers if candidate.name.startswith('embedding')),
        None,
    )
    if target is not None:
        target.trainable = trainable
        print("Set '%s' layer trainable=%s" % (target.name, str(trainable)))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [8]:
# Phase 1 setup: freeze the embedding layer of the 'Embedding_Frozen_At_Start'
# model (trainable=False) before the first round of training.
set_embedding_trainable(model=models['Embedding_Frozen_At_Start'], trainable=False)


Set 'embedding_3' layer trainable=False


**Train models**

In [9]:
# First training phase: every model gets 2 epochs on the training split and is
# evaluated on the hold-out validation split after each epoch.
for key, model in models.items():
    print("Training model: %s..." % key)
    model.fit(
        train_X,
        train_y,
        batch_size=3000,
        epochs=2,
        validation_data=(val_X, val_y),
    )

Training model: Embedding_Always_Trainable...
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2
Training model: Embedding_Frozen_At_End...
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2
Training model: Embedding_Frozen_At_Start...
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


In [10]:
# Phase 2 setup: swap the schedules of the two non-baseline models.
# 'Embedding_Frozen_At_End' now gets its embedding layer frozen ...
set_embedding_trainable(model=models['Embedding_Frozen_At_End'], trainable=False)

# ... while 'Embedding_Frozen_At_Start' gets its embedding layer unfrozen.
set_embedding_trainable(model=models['Embedding_Frozen_At_Start'], trainable=True)


Set 'embedding_2' layer trainable=False
Set 'embedding_3' layer trainable=True


In [11]:
# Second training phase: 2 further epochs per model, with the same batch size
# and validation data as the first phase.
for key, model in models.items():
    print("Continue training model: %s..." % key)
    model.fit(
        train_X,
        train_y,
        batch_size=3000,
        epochs=2,
        validation_data=(val_X, val_y),
    )

Continue training model: Embedding_Always_Trainable...
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2
Continue training model: Embedding_Frozen_At_End...
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2
Continue training model: Embedding_Frozen_At_Start...
Train on 1175509 samples, validate on 130613 samples
Epoch 1/2
Epoch 2/2


**Prediction on validation dataset**

In [12]:
# Predict on the hold-out validation split with each model, keyed by model name.
predictions = {}
for key, model in models.items():
    predictions[key] = model.predict([val_X], batch_size=1024, verbose=1)




In [13]:
def tweak_threshold(pred, truth):
    """Sweep decision thresholds and return the best F1 score with its threshold.

    Candidate thresholds come from ``np.arange(0.1, 0.501, 0.01)``, each rounded
    to two decimals. A prediction counts as positive when it is strictly greater
    than the threshold. On ties the first (lowest) threshold wins.

    Args:
        pred: array of predicted scores.
        truth: ground-truth labels passed to ``metrics.f1_score``.

    Returns:
        ``(best_f1, best_threshold)``.
    """
    candidates = [np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)]
    f1_values = [
        metrics.f1_score(truth, (pred > cutoff).astype(int))
        for cutoff in candidates
    ]
    best = np.argmax(f1_values)
    return np.max(f1_values), candidates[best]

In [14]:
# Report, for each model, the best F1 score on the validation split and the
# threshold that achieved it.
for key, pred_val in predictions.items():
    score_val, threshold_val = tweak_threshold(pred_val, val_y)
    print(f"Model '{key}' scored {round(score_val, 4)} for threshold {threshold_val} on validation data")


Model 'Embedding_Always_Trainable' scored 0.6475 for threshold 0.26 on validation data
Model 'Embedding_Frozen_At_End' scored 0.6455 for threshold 0.35 on validation data
Model 'Embedding_Frozen_At_Start' scored 0.6578 for threshold 0.34 on validation data
