# Zelros technical test : version simple (no word embedding)
Import of necessary libraries

In [1]:
import sys
import os
dir_path = os.path.dirname(os.path.realpath("./src"))
sys.path.insert(0, dir_path)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf
from tqdm._tqdm_notebook import tqdm_notebook 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import re
import heapq
from sklearn import metrics
from keras_tqdm_mod.tqdm_notebook_callback import TQDMNotebookCallback

tqdm_notebook.pandas()
Input = tf.keras.layers.Input
Bidirectional = tf.keras.layers.Bidirectional
CuDNNLSTM = tf.compat.v1.keras.layers.CuDNNLSTM
Dense = tf.keras.layers.Dense
Dropout = tf.keras.layers.Dropout
GlobalMaxPool1D = tf.keras.layers.GlobalMaxPool1D

print("Tensorflow version : {}".format(tf.__version__))
gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs available: ", len(gpus))
tf.config.experimental.set_memory_growth(gpus[0], True)


Tensorflow version : 2.0.0
Num GPUs available:  1


Global variables to use in this code

In [2]:
max_vocabulary_length = 500 # The maximum number of words in the vocabulary (171 000 words in english dictionary)
batch_size = 2000 # Training batch size
validation_batch_size = 3000 # Validation batch size
epochs = 2 # number of epoch

Create the network

In [3]:
model = tf.keras.Sequential()
model.add(Bidirectional(CuDNNLSTM(64, return_sequences = True), input_shape=(1,max_vocabulary_length)))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(16, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print(model.summary())


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 1, 128)            187392    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 1, 128)            99328     
_________________________________________________________________
global_max_pooling1d (Global (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 16)                2064      
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 288,801
Trainable params: 288,801
Non-trainable params: 0
__________________________________________________

Read the dataset

In [4]:
data_df = pd.read_csv("../dataset/train.csv")
print("{} training data available".format(data_df.shape[0]))

1306122 training data available


Remove ponctuations and unnecessary spaces in sentences as well as transfer to lowercase

In [5]:
def to_lower_case_and_rm_double_spaces_poncuation(sentence):
    sentence = sentence.lower()
    sentence = re.sub(r'\W',' ',sentence) # Remove punctuation (non-word caracters)
    sentence = re.sub(r'\s+',' ',sentence) # Remove multiple space
    if sentence[-1]==" ": # Remove useless space at the end of the sentence
        sentence = sentence[:-1]
    return sentence

data_df["question_text"] = data_df["question_text"].progress_apply(to_lower_case_and_rm_double_spaces_poncuation)
print("Check the result on the first sentence : {}".format(data_df["question_text"][0]))

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))


Check the result on the first sentence : how did quebec nationalists see their province as a nation in the 1960s


Split the dataset in train data and validation data

In [6]:
train_df, val_df = train_test_split(data_df, test_size=0.1)


Test the data repartition

In [7]:
percentage_in_train = train_df.groupby("target").count()["qid"][1]/train_df.shape[0]
percentage_in_val = val_df.groupby("target").count()["qid"][1]/val_df.shape[0]
print(f"Train dataset size: {train_df.shape[0]}, validation size: {val_df.shape[0]}, "
      f"{math.floor(val_df.shape[0]*100/train_df.shape[0])}% of the training dataset size")
print("Percentage of positives in train = {:.2f} and in val {:.2f}".format(percentage_in_train,percentage_in_val))

Train dataset size: 1175509, validation size: 130613, 11% of the training dataset size
Percentage of positives in train = 0.06 and in val 0.06


Create the ```bag-of-word``` from the train data

In [8]:
voc = {} # Contain every word with their number of occurrences
for index, row in tqdm_notebook(train_df.iterrows(),total=train_df.shape[0]):
    question = row["question_text"]
    for word in question.split(" "):
        if word not in voc.keys():
            voc[word] = 1
        else:
            voc[word] += 1

print("The vocabulary contains {} words".format(len(voc)))

HBox(children=(IntProgress(value=0, max=1175509), HTML(value='')))


The vocabulary contains 184950 words


Reduce the size of the vocabulary to match the maximum value pre-defined.
We will keep the most frequent words.

In [9]:
voc_most_freq = heapq.nlargest(max_vocabulary_length, voc, key=voc.get)
for i in range(50):
    print(voc_most_freq[i])

the
what
is
a
to
in
of
i
how
and
do
are
for
you
can
why
it
my
that
if
with
on
or
have
be
does
s
from
your
an
which
should
when
get
best
would
as
people
t
there
some
who
will
like
not
at
about
they
by
was


Define the function that will vectorize a question using the vocabulary of the most frequently used words. </br>
Also define the function that will create the array of vectors from a subset of the dataframe

In [10]:
def vectorize_question(question): # We assume the question as already been formatted (lower case, no punctuation, spaces)
    vector = np.zeros(max_vocabulary_length) # Initial vector filled with zeros
    for _word in question.split(" "):
        try:
            _index = voc_most_freq.index(_word)
            vector[_index]+=1
        except ValueError:
            pass # If the word is not in the vocabulary we do nothing
    return vector

def create_vectors_from_dataframe(data):
    vectors = np.zeros((data.shape[0],1, max_vocabulary_length), dtype=np.int)
    i = 0 
    for _index, _row in data.iterrows():
        vectors[i][0] = vectorize_question(_row[1])
        i+=1
    return vectors

Define a training generator to feed data to the network, and a validation data generator to check the progress

In [11]:
def training_generator(_train_df):
    nb_batches = _train_df.shape[0]//batch_size
#     print("nb batches : ",nb_batches)
    while True:
#         print("New epoch")
        _train_df = _train_df.sample(frac=1) # shuffle the data
        for i in range(nb_batches):
            vectors = create_vectors_from_dataframe(_train_df.iloc[i*batch_size:(i+1)*batch_size])
            yield (np.asarray(vectors), np.asarray(_train_df["target"][i*batch_size:(i+1)*batch_size].values))

def validation_generator(_val_df, predict=False):
    nb_batches = _val_df.shape[0]//validation_batch_size
    
    while True:
        for i in range(nb_batches):
            vectors = create_vectors_from_dataframe(_val_df.iloc[i*batch_size:(i+1)*batch_size])
            if not predict:
                yield (np.asarray(vectors), np.asarray(_val_df["target"][i*batch_size:(i+1)*batch_size].values))
            else:
                yield np.asarray(vectors)
            


Train the network

In [12]:
generator = training_generator(train_df)
# a, b = generator.__next__()
# print(a.shape)
# print(b.shape)

print("steps per epoch = {}, epochs = {}, batch_size = {}".format(train_df.shape[0] // batch_size, epochs, batch_size))
model.fit_generator(generator, steps_per_epoch=100, epochs=epochs, verbose=0,
                   callbacks=[TQDMNotebookCallback()])

results = model.evaluate_generator(validation_generator(val_df),val_df.shape[0]//validation_batch_size)
print("On the validation dataset the loss is {:.3f} and accuracy is {:.3f}".format(results[0], results[1]))


steps per epoch = 783, epochs = 1, batch_size = 1500


HBox(children=(IntProgress(value=0, description='Training', max=1, style=ProgressStyle(description_width='init…

HBox(children=(IntProgress(value=0, description='Epoch 0', style=ProgressStyle(description_width='initial')), …



On the validation dataset the loss is 0.171 and accuracy is 0.940


Compute the predictions for all validation data

In [21]:
predictions_val = model.predict(np.asarray(create_vectors_from_dataframe(val_df), dtype=np.float16))


F1 score at the threshold 0.01 is 0.14816535498917519
F1 score at the threshold 0.02 is 0.19299128946175922
F1 score at the threshold 0.03 is 0.22976014903361022
F1 score at the threshold 0.04 is 0.25722408147108566
F1 score at the threshold 0.05 is 0.2806646893917848
F1 score at the threshold 0.06 is 0.30303167934597475
F1 score at the threshold 0.07 is 0.3210587738683485
F1 score at the threshold 0.08 is 0.33565798263193053
F1 score at the threshold 0.09 is 0.3494334439739041
F1 score at the threshold 0.1 is 0.3627145562274754
F1 score at the threshold 0.11 is 0.3743832707074008
F1 score at the threshold 0.12 is 0.383871744590085
F1 score at the threshold 0.13 is 0.39282998370450845
F1 score at the threshold 0.14 is 0.40066702038960056
F1 score at the threshold 0.15 is 0.4064365634760606
F1 score at the threshold 0.16 is 0.4119948922848787
F1 score at the threshold 0.17 is 0.4179411386711219
F1 score at the threshold 0.18 is 0.4217741935483871
F1 score at the threshold 0.19 is 0.4240

Use the F1 score to compute the threshold for insincere questions

In [None]:
_max=0
threshold = 0
for thresh_test in np.arange(0.01, 0.51, 0.01):
    thresh_test = np.round(thresh_test,2)
    F1_score = metrics.f1_score(val_df["target"],(predictions_val>thresh_test).astype(int))
    if F1_score>_max: _max,threshold = F1_score, thresh_test
    print("F1 score at the threshold {} is {}".format(thresh_test,F1_score))

print("\nBest results for a threshold of {} with F1 score of {}".format(threshold, _max))