# Zelros technical test : improved version (word embedding)
## Word embedding using the GloVe vectors provided with the competition data
Import the necessary libraries

In [1]:
import sys
import os
dir_path = os.path.dirname(os.path.realpath("./src"))
sys.path.insert(0, dir_path)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf
from tqdm._tqdm_notebook import tqdm_notebook 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import re
import heapq
from sklearn import metrics
from keras_tqdm_mod.tqdm_notebook_callback import TQDMNotebookCallback

# Register tqdm with pandas so that `progress_apply` is available on Series/DataFrames.
tqdm_notebook.pandas()

# Short aliases for the Keras layers used to build the model.
Input = tf.keras.layers.Input
Bidirectional = tf.keras.layers.Bidirectional
CuDNNLSTM = tf.compat.v1.keras.layers.CuDNNLSTM
Dense = tf.keras.layers.Dense
Dropout = tf.keras.layers.Dropout
GlobalMaxPool1D = tf.keras.layers.GlobalMaxPool1D

print("Tensorflow version : {}".format(tf.__version__))
gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs available: ", len(gpus))
if gpus:
    # Allocate GPU memory on demand instead of reserving it all up front.
    tf.config.experimental.set_memory_growth(gpus[0], True)
else:
    # Indexing gpus[0] on an empty list would raise an opaque IndexError here;
    # report the actual problem instead.
    print("No GPU detected: the CuDNNLSTM layers used in this notebook require a GPU.")


Tensorflow version : 2.0.0
Num GPUs available:  1


Global variables to use in this code

In [2]:
# --- Tunable settings shared by the cells below ---

# Questions are truncated / padded to this many words.
maxlen = 30

# Number of samples per batch while training.
batch_size = 200

# Number of samples per batch while validating.
validation_batch_size = 200

# Number of passes over the training data.
epochs = 2

Create the network

In [3]:
# Binary classifier over sequences of `maxlen` word vectors of size 300.
model = tf.keras.Sequential()
# Two stacked bidirectional LSTM layers (64 units per direction), each returning
# the full sequence so the next layer still sees one vector per word.
model.add(Bidirectional(CuDNNLSTM(64, return_sequences = True), input_shape=(maxlen,300)))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences = True)))
# Collapse the word axis by keeping the maximum of each feature.
model.add(GlobalMaxPool1D())
model.add(Dense(16, activation="relu"))
model.add(Dropout(0.1))
# Single sigmoid unit: the output is a score in [0, 1].
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# would emit a stray "None" line after the table.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 30, 128)           187392    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 30, 128)           99328     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 288,801
Trainable params: 288,801
Non-trainable params: 0
__________________________________________________

Get the Glove pretrained data

In [4]:
# Source https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
# Build a dictionary mapping each word to its pretrained vector (float32 array).
embeddings_index = {}
# The context manager guarantees the file is closed even if parsing a line fails.
with open("../dataset/embeddings/glove.840B.300d/glove.840B.300d.txt", "r",  encoding="utf8") as f:
    for line in tqdm_notebook(f, desc="Reading Glove data"):
        # Each line is "<word> <coef> <coef> ...". The split is on a single
        # space rather than on any whitespace.
        # NOTE(review): presumably deliberate for this file - confirm before changing.
        values = line.split(" ")
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

HBox(children=(IntProgress(value=1, bar_style='info', description='Reading Glove data', max=1, style=ProgressS…


Found 2196016 word vectors.


Read the dataset

In [5]:
# Load the training CSV into a DataFrame and report how many rows it holds.
data_df = pd.read_csv("../dataset/train.csv")
print("{} training data available".format(len(data_df)))

1306122 training data available


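Remove unnecessary spaces in the sentences (collapse runs of whitespace into a single space and drop a trailing space). Note: the cell below only normalises whitespace; punctuation removal and lowercasing are not performed here.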

In [6]:
def rm_double_spaces(sentence):
    """Normalise the whitespace of a sentence.

    Every run of whitespace characters (spaces, tabs, newlines, ...) is
    replaced by a single space, then one trailing space, if any, is removed.
    A leading space is left untouched.

    Args:
        sentence: the text to clean (str).

    Returns:
        The cleaned text. An empty string is returned unchanged.
    """
    sentence = re.sub(r'\s+', ' ', sentence) # Collapse each whitespace run into one space
    # endswith() is safe on an empty string, whereas indexing sentence[-1] would raise IndexError.
    if sentence.endswith(" "):
        sentence = sentence[:-1]
    return sentence

# Clean every question (with a progress bar), then store the result back in the same column.
cleaned_questions = data_df["question_text"].progress_apply(rm_double_spaces)
data_df["question_text"] = cleaned_questions

# Sanity check on the first row.
first_question = data_df["question_text"][0]
print("Check the result on the first sentence : {}".format(first_question))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))


Check the result on the first sentence : How did Quebec nationalists see their province as a nation in the 1960s?


Split the dataset in train data and validation data

In [7]:
# Hold out 10% of the data for validation.
# - random_state makes the split (and therefore every downstream number)
#   reproducible across kernel restarts.
# - stratify keeps the share of each "target" class identical in both splits,
#   which matters because the positive class is rare.
train_df, val_df = train_test_split(data_df, test_size=0.1, random_state=42,
                                    stratify=data_df["target"])
del data_df #no longer needed

Check the class distribution in the train and validation splits

In [8]:
# Share of rows with target == 1 in each split: count the rows per target value
# (via the "qid" column) and divide the count for label 1 by the split size.
percentage_in_train = train_df.groupby("target").count()["qid"][1]/train_df.shape[0]
percentage_in_val = val_df.groupby("target").count()["qid"][1]/val_df.shape[0]
print(f"Train dataset size: {train_df.shape[0]}, validation size: {val_df.shape[0]}, "
      f"{math.floor(val_df.shape[0]*100/train_df.shape[0])}% of the training dataset size")
# The values above are fractions in [0, 1]; the '%' format spec multiplies by 100
# so that the printed number really is the percentage the message announces.
print("Percentage of positives in train = {:.2%} and in val {:.2%}".format(percentage_in_train,percentage_in_val))

Train dataset size: 1175509, validation size: 130613, 11% of the training dataset size
Percentage of positives in train = 0.06 and in val 0.06


Create the ```GloVe embedding``` of a question (each word is represented by a 300-dimensional vector)

In [9]:
# Turn a question into a fixed-size matrix of word vectors
def question_to_vect(question):
    """Embed a question as an array of shape (maxlen, 300) with dtype float16.

    The question is split on whitespace and only the first `maxlen` words are
    kept. Each word is looked up in `embeddings_index`; words that are not
    found, and the positions after the end of a short question, are filled
    with a zero vector.
    """
    padding = np.zeros(300)
    tokens = question.split()[:maxlen]

    vectors = []
    for token in tokens:
        # Unknown words fall back to the zero vector.
        vectors.append(embeddings_index.get(token, padding))

    # Pad short questions up to maxlen rows.
    missing = maxlen - len(vectors)
    vectors.extend([padding] * missing)

    return np.array(vectors, dtype=np.float16)

Define a training generator to feed data to the network, and a validation data generator to check the progress

In [26]:
def training_generator(_train_df):
    """Endlessly yield (features, labels) training batches.

    The frame is reshuffled at the start of every pass. Each pass yields
    `_train_df.shape[0] // batch_size` full batches, so rows left over after
    the last full batch are not used in that pass. Questions are read from the
    column at position 1 and labels from the "target" column.
    """
    nb_batches = _train_df.shape[0] // batch_size
    while True:
        # New random row order for this pass.
        _train_df = _train_df.sample(frac=1)
        for batch_idx in range(nb_batches):
            start = batch_idx * batch_size
            stop = start + batch_size
            batch_questions = _train_df.iloc[start:stop, 1]
            features = np.asarray([question_to_vect(text) for text in batch_questions])
            labels = np.asarray(_train_df["target"][start:stop].values)
            yield (features, labels)

def validation_generator(_val_df, predict=False):
    """Endlessly yield validation batches, in the frame's row order (no shuffling).

    Each pass yields `_val_df.shape[0] // validation_batch_size` full batches;
    rows left over after the last full batch are not yielded.

    Args:
        _val_df: DataFrame whose column at position 1 holds the question text
            and which has a "target" column.
        predict: when False (default) yield `(features, labels)` tuples;
            when True yield the features only.
    """
    nb_batches = _val_df.shape[0]//validation_batch_size
    while True:
        for i in range(nb_batches):
            # Slice with validation_batch_size, the same size used to compute
            # nb_batches. Slicing with the training batch_size would skip or
            # repeat rows as soon as the two settings differ.
            start = i*validation_batch_size
            stop = start + validation_batch_size
            questions = _val_df.iloc[start:stop, 1]
            vect_questions = np.asarray([question_to_vect(question) for question in questions])
            if not predict:
                yield (vect_questions, np.asarray(_val_df["target"][start:stop].values))
            else:
                yield vect_questions

Train the network

In [11]:
# Number of full training batches in one pass over the data.
steps_per_epoch = train_df.shape[0] // batch_size
generator = training_generator(train_df)

print("steps per epoch = {}, epochs = {}, batch_size = {}".format(steps_per_epoch, epochs, batch_size))

# Keras' own console logging is silenced (verbose=0); progress is reported by the callback instead.
progress_callbacks = [TQDMNotebookCallback()]
model.fit_generator(generator,
                    steps_per_epoch=steps_per_epoch,
                    epochs=epochs,
                    verbose=0,
                    callbacks=progress_callbacks)

steps per epoch = 5877, epochs = 2, batch_size = 200


HBox(children=(IntProgress(value=0, description='Training', max=2, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', max=5877, style=ProgressStyle(description_width='in…




HBox(children=(IntProgress(value=0, description='Epoch 1', max=5877, style=ProgressStyle(description_width='in…





<tensorflow.python.keras.callbacks.History at 0x24bfac42320>

Evaluate the network

In [12]:
# Evaluate on every full validation batch; results holds [loss, accuracy].
validation_steps = val_df.shape[0] // validation_batch_size
results = model.evaluate_generator(validation_generator(val_df), validation_steps)
val_loss, val_accuracy = results[0], results[1]
print("On the validation dataset the loss is {:.3f} and accuracy is {:.3f}".format(val_loss, val_accuracy))

On the validation dataset the loss is 0.109 and accuracy is 0.958


Compute the predictions for all validation data

In [28]:
# Score every full validation batch; the generator yields features only in predict mode.
prediction_steps = val_df.shape[0] // validation_batch_size
prediction_batches = validation_generator(val_df, predict=True)
predictions_val = model.predict_generator(prediction_batches, steps=prediction_steps)

Use the F1 score to compute the threshold for insincere questions

In [31]:
# Predictions only exist for the rows covered by full validation batches,
# so the true labels (column at position 2) are truncated to the same length.
n_scored = (val_df.shape[0]//validation_batch_size)*validation_batch_size
true_labels = val_df.iloc[:n_scored, 2]

# Scan candidate thresholds and keep the one with the highest F1 score.
_max = 0
threshold = 0
for thresh_test in np.arange(0.01, 0.51, 0.01):
    thresh_test = np.round(thresh_test, 2)
    # A question is flagged when its score is strictly above the threshold.
    predicted_labels = (predictions_val > thresh_test).astype(int)
    F1_score = metrics.f1_score(true_labels, predicted_labels)
    if F1_score > _max:
        _max = F1_score
        threshold = thresh_test
    print("F1 score at the threshold {} is {}".format(thresh_test,F1_score))

print("\nBest results for a threshold of {} with F1 score of {}".format(threshold, _max))

F1 score at the threshold 0.01 is 0.35574196874194525
F1 score at the threshold 0.02 is 0.43605919462295856
F1 score at the threshold 0.03 is 0.4836002541041159
F1 score at the threshold 0.04 is 0.5143624940988488
F1 score at the threshold 0.05 is 0.537293831356947
F1 score at the threshold 0.06 is 0.5561475576524179
F1 score at the threshold 0.07 is 0.571392200958968
F1 score at the threshold 0.08 is 0.5831172825943666
F1 score at the threshold 0.09 is 0.5931834662799129
F1 score at the threshold 0.1 is 0.6010338564709169
F1 score at the threshold 0.11 is 0.6091399646626235
F1 score at the threshold 0.12 is 0.6151144069863882
F1 score at the threshold 0.13 is 0.6216162420382166
F1 score at the threshold 0.14 is 0.6250760186499088
F1 score at the threshold 0.15 is 0.628285743737759
F1 score at the threshold 0.16 is 0.6298783046579941
F1 score at the threshold 0.17 is 0.6333209173628479
F1 score at the threshold 0.18 is 0.6374553716325868
F1 score at the threshold 0.19 is 0.641087480815