# Zelros technical test: simple version (no word embeddings)
Import the necessary libraries

In [None]:
import sys
import os
dir_path = os.path.dirname(os.path.realpath("./src"))
sys.path.insert(0, dir_path)
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf
from tqdm._tqdm_notebook import tqdm_notebook 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import math
import re
import heapq
from keras_tqdm_mod.tqdm_notebook_callback import TQDMNotebookCallback

# Enable the tqdm progress-bar integration for pandas (used later through `progress_apply`).
tqdm_notebook.pandas()

# Short aliases for the Keras layers used to build the network.
_keras_layers = tf.keras.layers
Input = _keras_layers.Input
Bidirectional = _keras_layers.Bidirectional
Dense = _keras_layers.Dense
Dropout = _keras_layers.Dropout
GlobalMaxPool1D = _keras_layers.GlobalMaxPool1D
# The cuDNN-backed LSTM is taken from the TF1 compatibility namespace.
CuDNNLSTM = tf.compat.v1.keras.layers.CuDNNLSTM

# Report the runtime environment, then let TensorFlow allocate GPU memory on demand
# instead of reserving the whole device up front.
print("Tensorflow version : {}".format(tf.__version__))
gpus = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
if not gpus:
    # Indexing gpus[0] on an empty list would fail here with an opaque IndexError.
    print("WARNING: no GPU detected; the CuDNNLSTM layers used below need one to run.")
for gpu in gpus:
    # Memory growth has to be configured before the device is first used.
    tf.config.experimental.set_memory_growth(gpu, True)


Global variables used throughout this notebook

In [None]:
# Maximum number of words kept in the vocabulary
# (for reference, an English dictionary holds about 171 000 words).
max_vocabulary_length = 300
# Number of questions per training batch.
batch_size = 3_000
# Number of questions per validation batch.
validation_batch_size = 3_000
# Number of training epochs.
epochs = 6

Create the network

In [None]:
# Two stacked bidirectional LSTM layers, max-pooled over the sequence axis, followed by
# a small dense classifier that ends in a single sigmoid unit (binary output).
model = tf.keras.Sequential()
# Each sample is declared as a sequence of length 1 holding one vector of
# `max_vocabulary_length` values.
model.add(Bidirectional(CuDNNLSTM(64, return_sequences = True), input_shape=(1,max_vocabulary_length)))
model.add(Bidirectional(CuDNNLSTM(64, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(16, activation="relu"))
model.add(Dropout(0.1))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in print()
# would append a stray "None" line to the cell output.
model.summary()


Read the dataset

In [None]:
# Load the labelled questions and report how many rows were read.
data_df = pd.read_csv("../dataset/train.csv")
print("{} training data available".format(len(data_df)))

Remove punctuation and unnecessary spaces from the sentences, and convert them to lowercase

In [None]:
def to_lower_case_and_rm_double_spaces_poncuation(sentence):
    """Normalize a sentence into lowercase words separated by single spaces.

    Every non-word character (anything outside letters, digits and underscore) is
    replaced by a space, runs of whitespace are collapsed into one space, and the
    spaces left at either end of the result are removed.

    Args:
        sentence (str): The raw sentence.

    Returns:
        str: The normalized sentence; an empty string when the input holds no word
        character.
    """
    sentence = sentence.lower()
    sentence = re.sub(r'\W', ' ', sentence)  # Replace punctuation (non-word characters) with spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Collapse runs of whitespace into a single space
    # strip() is safe on an empty string (indexing sentence[-1] is not) and also drops
    # the leading space left behind when the sentence starts with punctuation.
    return sentence.strip(" ")

# Clean every question (with a progress bar), store the result back in the same column,
# then display the first cleaned question as a sanity check.
cleaned_questions = data_df["question_text"].progress_apply(to_lower_case_and_rm_double_spaces_poncuation)
data_df["question_text"] = cleaned_questions
first_question = data_df["question_text"][0]
print("Check the result on the first sentence : {}".format(first_question))

Split the dataset into training data and validation data

In [None]:
# Hold out 10% of the questions for validation. The fixed seed keeps the split, and
# every number derived from it, identical across kernel restarts.
train_df, val_df = train_test_split(data_df, test_size=0.1, random_state=42)


Check the class distribution in the training and validation sets

In [None]:
# Share of positive questions (target == 1) in each split, kept as a fraction in [0, 1].
# The mean of a boolean mask stays valid (0.0) when a split holds no positive row,
# whereas indexing the grouped counts with [1] would raise a KeyError.
percentage_in_train = (train_df["target"] == 1).mean()
percentage_in_val = (val_df["target"] == 1).mean()
print(f"Train dataset size: {train_df.shape[0]}, validation size: {val_df.shape[0]}, "
      f"{math.floor(val_df.shape[0]*100/train_df.shape[0])}% of the training dataset size")
# The values are fractions: the "%" presentation type scales them by 100 so that the
# printed numbers really are the percentages announced by the message.
print("Percentage of positives in train = {:.2%} and in val {:.2%}".format(percentage_in_train,percentage_in_val))

Create the `bag-of-words` vocabulary from the training data

In [None]:
voc = {} # Maps every word to its number of occurrences in the training questions
# Iterate over the text column directly: iterrows() would build a full Series per row
# only for a single field to be read from it.
for question in tqdm_notebook(train_df["question_text"], total=train_df.shape[0]):
    for word in question.split(" "):
        if not word:
            continue # An empty token (empty question, stray space) is not a word
        voc[word] = voc.get(word, 0) + 1

print("The vocabulary contains {} words".format(len(voc)))

Reduce the size of the vocabulary to match the maximum value pre-defined.
We will keep the most frequent words.

In [None]:
# Keep the `max_vocabulary_length` words with the highest occurrence counts,
# ordered from most to least frequent.
voc_most_freq = heapq.nlargest(max_vocabulary_length, voc, key=voc.get)
# Preview at most 50 entries: slicing avoids the IndexError that a fixed range(50)
# raises when fewer than 50 words were kept.
for word in voc_most_freq[:50]:
    print(word)

Define the function that will vectorize a question using the vocabulary of the most frequently used words. <br>
Also define the function that will create the array of vectors from a subset of the dataframe

In [None]:
def vectorize_question(question):
    """Turn a question into a vector of word counts over `voc_most_freq`.

    The question is expected to be already normalized (lowercase, no punctuation,
    words separated by single spaces).

    Args:
        question (str): The normalized question.

    Returns:
        numpy.ndarray: Vector of length `max_vocabulary_length`; entry k is the number
        of times the k-th word of `voc_most_freq` occurs in the question.
    """
    counts = np.zeros(max_vocabulary_length)
    for token in question.split(" "):
        try:
            position = voc_most_freq.index(token)
        except ValueError:
            # Words outside the vocabulary are simply ignored.
            continue
        counts[position] += 1
    return counts

def create_vectors_from_dataframe(data):
    """Vectorize the `question_text` column of a dataframe.

    Args:
        data (pandas.DataFrame): Rows to vectorize; must contain a `question_text`
            column of already normalized questions.

    Returns:
        numpy.ndarray: Integer array of shape `(len(data), 1, max_vocabulary_length)`
        holding one word-count vector per row, in row order.
    """
    # `np.int` was only an alias of the builtin `int` and has been removed from recent
    # NumPy releases; the builtin selects the same dtype.
    vectors = np.zeros((data.shape[0], 1, max_vocabulary_length), dtype=int)
    # Read the text by column name rather than by its position inside each row, so
    # the function does not depend on the column order of the dataframe.
    for i, question in enumerate(data["question_text"]):
        vectors[i][0] = vectorize_question(question)
    return vectors

Define a training generator to feed data to the network, and a validation data generator to check the progress

In [None]:
def training_generator(_train_df):
    """Yield `(vectors, targets)` training batches forever.

    The dataframe is reshuffled before every pass, and each pass yields
    `len(_train_df) // batch_size` batches of `batch_size` rows.

    Args:
        _train_df (pandas.DataFrame): Training rows, including the `target` column.

    Yields:
        tuple: The vectorized questions of the batch and the matching targets, both
        as NumPy arrays.
    """
    batches_per_pass = _train_df.shape[0] // batch_size
    while True:
        _train_df = _train_df.sample(frac=1)  # New random row order for this pass
        for batch_number in range(batches_per_pass):
            start = batch_number * batch_size
            stop = start + batch_size
            features = create_vectors_from_dataframe(_train_df.iloc[start:stop])
            labels = _train_df["target"][start:stop].values
            yield (np.asarray(features), np.asarray(labels))

def validation_generator(_val_df):
    """Yield `(vectors, targets)` validation batches forever, in dataframe order.

    Each pass yields `len(_val_df) // validation_batch_size` batches of
    `validation_batch_size` rows.

    Args:
        _val_df (pandas.DataFrame): Validation rows, including the `target` column.

    Yields:
        tuple: The vectorized questions of the batch and the matching targets, both
        as NumPy arrays.
    """
    nb_batches = _val_df.shape[0]//validation_batch_size

    while True:
        for i in range(nb_batches):
            # Slice with the validation batch size, the same one used to count the
            # batches: slicing with the training `batch_size` would only be correct
            # while both sizes happen to be equal.
            start = i*validation_batch_size
            stop = start + validation_batch_size
            vectors = create_vectors_from_dataframe(_val_df.iloc[start:stop])
            yield (np.asarray(vectors), np.asarray(_val_df["target"][start:stop].values))


Train the network

In [None]:
# Feed the training batches to the model; only whole batches are counted per epoch.
generator = training_generator(train_df)
steps_per_epoch = train_df.shape[0] // batch_size

print("steps per epoch = {}, epochs = {}, batch_size = {}".format(steps_per_epoch, epochs, batch_size))
model.fit_generator(
    generator,
    steps_per_epoch=steps_per_epoch,
    epochs=epochs,
    verbose=0,  # Built-in Keras logging is off; the tqdm callback is passed instead
    callbacks=[TQDMNotebookCallback()],
)

# Evaluate on the validation set. This call stays the last expression of the cell so
# that its return value is displayed as the cell output.
validation_steps = val_df.shape[0] // validation_batch_size
model.evaluate_generator(validation_generator(val_df), validation_steps)


Compute the F1 score