In [1]:
# script made on 16gb of ram
import os
import io
import os.path
import random
import pickle
import zipfile
from math import ceil
from copy import deepcopy
import numpy as np
from tqdm import tqdm
import pandas as pd
import tensorflow as tf      #the progress bar
import en_core_web_sm as en  #from the spaCy library, https://spacy.io/usage/
from sklearn.metrics import f1_score

from  tensorflow.keras.backend import binary_crossentropy
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers


Using TensorFlow backend.


In [2]:
# Locations of the input CSV files and of the directory prefix used for the
# pickled intermediate artifacts written/read by later cells.
filename = "./data/train.csv"
testname = "./data/test.csv"
output = "./data/output/"

In [3]:
# Path of the pre-trained embedding text file read by load_embeddings.
# The commented-out lines are alternative embedding sets that are not used.
# enb_path = 'input/embeddings/wiki-news-300d-1M/' + 'wiki-news-300d-1M.vec'
# enb_path = 'input/embeddings/GoogleNews-vectors-negative300/'
# enb_path = 'input/embeddings/paragram_300_sl999/'
enb_path = "./data/embeddings/glove.840B.300d/glove.840B.300d.txt"

In [4]:
class Batch():
    """Sequential mini-batch iterator over two parallel sequences.

    Each call to ``next()`` returns a ``(data_slice, labels_slice)`` pair of at
    most ``size`` consecutive items; the final batch may be shorter.

    Attributes:
        data: the sliceable sequence of samples.
        labels: the sliceable sequence of labels, parallel to ``data``.
        lenght: number of samples (attribute name kept for compatibility).
        size: the configured batch size; it is never modified by iteration.
        index: position of the first sample of the next batch.
    """

    def __init__(self, data, labels, size = 64):
        self.data = data
        self.lenght = len(self.data)
        self.labels = labels
        self.size = size
        self.index = 0

    def __next__(self):
        """Return the next ``(data, labels)`` slice or raise StopIteration."""
        if self.index >= self.lenght:
            raise StopIteration()

        # Clamp the end of the slice instead of shrinking self.size, so that
        # __len__ and reset() keep working with the configured batch size.
        start = self.index
        stop = min(start + self.size, self.lenght)
        self.index = stop

        return self.data[start:stop], self.labels[start:stop]

    def __len__(self):
        """Number of batches in one full pass over the data."""
        return ceil(self.lenght / self.size)

    def reset(self):
        """Rewind the iterator to the first batch."""
        self.index = 0

    def __iter__(self):
        return self

In [None]:
# Module-level placeholders; every one of them is re-assigned by a later cell.
# They are kept global so the large structures are not copied between functions.
embeddings = []      # becomes the numpy matrix of word vectors
vocab = set([])          # first the words of the questions, later the words found in the embedding file
questions_train = []    # every training question as a list of tokens (later: of word indexes)
questions_test = []     # same as questions_train, for the test questions
#text_t = []         #all the questions concatenated and tokenized
word_to_index = {}      # word -> row index in `embeddings`
index_to_word = {}      # row index in `embeddings` -> word

In [None]:
def tokenize_questions(text,nr_to_delete = 0):
    """Tokenize every question with the spaCy English pipeline.

    Args:
        text: sized sequence of question strings (``len(text)`` sizes the
            progress bar).
        nr_to_delete: unused; kept so existing callers keep working.

    Returns:
        A list with one list of token strings per question, in input order.
        Each token string is stripped of surrounding whitespace.
    """
    nlp = en.load()      #load the tokenizer

    questions_tokenized = []

    # The context manager closes the progress bar even if tokenization raises.
    with tqdm(total = len(text)) as bar:
        for question in text:
            # Token.string no longer exists in spaCy 3; stripping Token.text
            # yields the same value (the trailing whitespace was stripped anyway).
            questions_tokenized.append([token.text.strip() for token in nlp(question)])
            bar.update(1)
    return questions_tokenized

In [None]:
train_df = pd.read_csv(filename)    # the training CSV loaded as a pandas DataFrame

In [None]:
test_df = pd.read_csv(testname)    # the test CSV loaded as a pandas DataFrame

In [None]:
# NOTE(review): column 1 of test_df is the same column that is tokenized as the
# question text further down; if `hashes` is meant to hold the question ids this
# probably should read another column (column 0?) - confirm against the CSV.
hashes = test_df.values[:,1].tolist()

In [None]:
# Column 2 of the training frame as a plain list; it is passed to split_on_class
# as the per-question class label (presumably 0/1 targets - verify against the CSV).
y = train_df.values[:,2].tolist()

In [None]:
# Load the tokenized questions from the pickle cache, tokenizing (and caching)
# whichever set is missing. The two caches are handled independently so that a
# single missing file neither leaves a variable empty nor raises on load.
# NOTE: pickle.load can execute code embedded in the file; only load caches
# that were written by this notebook.

questions_train_cache = output + "questions_t"
questions_test_cache = output + "questions_test"

if os.path.isfile(questions_train_cache) or os.path.isfile(questions_test_cache):
    print("Loading data...")

if os.path.isfile(questions_train_cache):
    with open(questions_train_cache, "rb") as file:
        questions_train = pickle.load(file)
else:
    questions_train = tokenize_questions(train_df.values[:,1].tolist())
    os.makedirs(output, exist_ok=True)   # the cache directory may not exist yet
    with open(questions_train_cache, 'wb') as file:
        pickle.dump(questions_train, file)

if os.path.isfile(questions_test_cache):
    with open(questions_test_cache, "rb") as file:
        questions_test = pickle.load(file)
else:
    questions_test = tokenize_questions(test_df.values[:,1].tolist())
    os.makedirs(output, exist_ok=True)
    with open(questions_test_cache, 'wb') as file:
        pickle.dump(questions_test, file)

print("Nr of questions train: ",len(questions_train))
print("Nr of questions test : ",len(questions_test))

print('The fist question tokenized from train: ', questions_train[0][:8])
print('The fist question tokenized from test : ', questions_test[0][:10])

Loading data...


In [None]:
# Collect every word used in the questions so that embeddings of words which
# never occur can be dropped to save memory.

def get_questions_vocab(*args):
    """Return the set of all words found in the given question collections.

    Each positional argument is an iterable of questions and each question is
    an iterable of words. The marker 'UNK' is always part of the result; it
    stands for words of future, unseen questions.
    """
    words = {
        word
        for questions_set in args
        for question in questions_set
        for word in question
    }
    words.add('UNK')
    return words
    
# Vocabulary of both question sets (plus 'UNK'); it is passed to load_embeddings
# to decide which embedding rows are kept.
vocab = get_questions_vocab(questions_train,questions_test)
print("Nr of unique word in questions",len(vocab))

In [None]:
def load_embeddings(enb_path,vocab):
    """Read the embedding vectors of the words that occur in ``vocab``.

    The file is expected to hold one entry per line: the word followed by its
    vector components, all separated by single spaces.

    Args:
        enb_path: path of the UTF-8 encoded embedding text file.
        vocab: container of words to keep (membership is tested per line).

    Returns:
        ``(vocab_enb, embeddings)``: the list of kept words in file order and a
        float32 numpy matrix whose row ``i`` is the vector of ``vocab_enb[i]``.

    Raises:
        ValueError: if no word of ``vocab`` is present in the file, or if a
            kept line contains a non-numeric component.
    """
    print("Loading embeddings...")

    vocab_enb = []
    vectors = []

    with open(enb_path, 'r', encoding ='utf-8') as file:

        # First pass only counts the lines so the progress bar has a total.
        num_lines = sum(1 for _ in file)
        file.seek(0)

        # The context manager closes the bar even if a line fails to parse.
        with tqdm(total = num_lines) as bar:
            for line in file:
                bar.update(1)
                # Strip only the newline: slicing off the last character would
                # eat a digit when the final line has no trailing newline.
                fields = line.rstrip('\n').split(' ')
                word = str(fields[0])
                if word in vocab:
                    vocab_enb.append(word)
                    vectors.append(np.array(fields[1:]).astype('float32'))

    if not vectors:
        raise ValueError("No word of the vocabulary was found in " + str(enb_path))

    return vocab_enb, np.stack(vectors)

In [None]:
# Build the filtered embedding matrix once and cache it together with its
# vocabulary; later runs load both pickles instead of re-reading the big file.
# NOTE: pickle.load can execute code embedded in the file; only load caches
# that were written by this notebook.
embeddings_cache = output + "embeddings"
vocabulary_cache = output + "vocabulary"

# Both files are needed: the vocabulary gives the word of every matrix row.
if not (os.path.isfile(embeddings_cache) and os.path.isfile(vocabulary_cache)):
    vocab, embeddings = load_embeddings(enb_path,vocab)
    os.makedirs(output, exist_ok=True)   # the cache directory may not exist yet
    with open(embeddings_cache,'wb') as file:
        pickle.dump(embeddings, file)
    with open(vocabulary_cache,'wb') as file:
        # write through the handle opened by `with` (no second, leaked handle)
        pickle.dump(vocab, file)
else:
    print("Loading embeddings...")
    with open(embeddings_cache, "rb" ) as file:
        embeddings = pickle.load( file )
    with open(vocabulary_cache, "rb" ) as file:
        vocab = pickle.load( file )
print('Embedings size:  ',embeddings.shape)
print('Vocabulary size: ', len(vocab))

print('\nThe word "'+ vocab[2]+'" with his enmeding: \n' , embeddings[2][:5], "... ", embeddings[2][-5:] )

In [None]:
def initialze_padding(vocab,embeddings):
    """Prepend the padding word and its all-zero vector.

    Args:
        vocab: list of words, parallel to the rows of ``embeddings``.
        embeddings: 2-D array-like of shape (len(vocab), dim).

    Returns:
        ``(vocab, embeddings)`` where ``'/pad'`` is the new word 0 and row 0 of
        the matrix is all zeros; every original entry moves one position down.
        The matrix keeps the dtype of the input embeddings.
    """
    print("Adding the padding char to vocab and embeddings...")
    embeddings = np.asarray(embeddings)
    vocab = ['/pad'] + list(vocab)
    # Create the zero row with the dtype of the matrix: an integer row would
    # make vstack promote a float32 matrix to float64 and double its memory.
    pad_emb = np.zeros((1, embeddings.shape[1]), dtype=embeddings.dtype)
    embeddings = np.vstack([pad_emb, embeddings])

    return vocab, embeddings

def create_dictionaries(vocab):
    """Build the two lookup tables between words and their positions.

    Returns ``(word_to_index, index_to_word)``. When a word occurs more than
    once in ``vocab`` its last position is the one stored in ``word_to_index``.
    """
    position_to_word = dict(enumerate(vocab))
    word_to_position = {word: position for position, word in position_to_word.items()}
    return word_to_position, position_to_word

def index_question_to_words(question_indexes):
    """Turn a sequence of word indexes back into text.

    Every index is looked up in the module-level ``index_to_word`` table and
    followed by a single space, so a non-empty result ends with a space.
    """
    return "".join(index_to_word[position] + ' ' for position in question_indexes)

def token_to_index(questions_tokenized, vocab):
    """Replace, in place, every token by its index in ``word_to_index``.

    Args:
        questions_tokenized: list of questions, each a mutable list of tokens.
            The inner lists are overwritten with integer indexes.
        vocab: unused; the lookup goes through the module-level
            ``word_to_index`` dictionary.

    Returns:
        The same ``questions_tokenized`` object, now holding indexes.
    """
    # Unknown tokens map to the index of 'UNK'. It is looked up instead of
    # hard-coded because the position depends on the loaded vocabulary; the
    # former constant is only the fallback when 'UNK' has no entry.
    unk_index = word_to_index.get('UNK', 87152)

    for question in questions_tokenized: #change the input from strings to ids
        question[:] = [word_to_index.get(word_token, unk_index) for word_token in question]
    return questions_tokenized

def pad_questions(data, max_len = 60):
    """Return copies of the questions, all exactly ``max_len`` items long.

    Shorter questions are right-padded with 0 (the padding index), longer ones
    are truncated. The input is not modified.

    Args:
        data: iterable of questions, each a list of word indexes.
        max_len: target length of every returned question.

    Returns:
        A new list of new lists, one per input question.
    """
    print("Padding questions...")

    padded_count = 0   # questions of length <= max_len (including exact fits)
    cut_count = 0      # questions that had to be truncated
    data_aux = []

    # New lists are built directly; no deep copy of the whole dataset is needed.
    for question in tqdm(data):
        question_len = len(question)
        if question_len <= max_len:
            padded_count += 1
            data_aux.append(list(question) + [0] * (max_len - question_len))
        else:
            cut_count += 1
            data_aux.append(list(question[:max_len]))

    print("Padded :", padded_count)
    print("Cut    :", cut_count)
    return data_aux

In [None]:
# Put the padding entry at index 0, then build the word <-> index tables.
# NOTE: this cell is not idempotent - every re-run prepends another '/pad'
# word and another zero row.
vocab,embeddings = initialze_padding(vocab,embeddings)

word_to_index, index_to_word = create_dictionaries(vocab)

In [None]:
# Sanity checks. 87152 is the hard-coded index that token_to_index uses for
# unknown words; the second line shows which word that index really maps to,
# and it should agree with the index printed by the first line.
print('The word UNK to index       :', word_to_index['UNK'])
print('The intex for UNK to string :', index_to_word[87152])
print('The shape of enbedings      :', embeddings.shape)
# despite the label, the first 10 words are shown
print('The first 5words from vocab :',vocab[:10])

In [None]:
# Replace tokens by word indexes (token_to_index overwrites the lists in place),
# then bring every question to the fixed default length of pad_questions.
questions_train = token_to_index(questions_train,vocab)
questions_test = token_to_index(questions_test,vocab)

questions_train = pad_questions(questions_train)
questions_test = pad_questions(questions_test)

In [None]:
# Show the first question of each set decoded back to words, followed by the
# raw indexes of its first 18 positions.
print("First train question to intdexes :")
print(index_question_to_words(questions_train[0][:17]))
print(questions_train[0][:18])

print("\nFirst test question to intdexes  :")
print(index_question_to_words(questions_test[0][:17])+"...")
print(questions_test[0][:18])

In [None]:
def embedding_lookup(batch):
    """Replace every word index of a batch by its embedding vector.

    Args:
        batch: non-empty sequence of questions, all of the same length, each a
            sequence of integer row indexes into the module-level ``embeddings``.

    Returns:
        A numpy array of shape (len(batch), question_length, embedding_dim).

    Raises:
        ValueError: if the batch is empty, ragged or not made of integers.
        IndexError: if an index is outside the embedding matrix.
    """
    indexes = np.asarray(batch)

    # A ragged or empty batch cannot become a rectangular tensor; fail here
    # with a clear message instead of later inside the model.
    if indexes.ndim != 2:
        raise ValueError("embedding_lookup expects a non-empty batch of equally long questions")
    if not np.issubdtype(indexes.dtype, np.integer):
        raise ValueError("embedding_lookup expects integer word indexes")

    # One vectorized gather; an invalid index raises instead of being silently
    # skipped (which would produce a question with missing rows).
    return np.asarray(embeddings)[indexes]

In [None]:
def confusion_matrix(label, prediction):
    """Count the four outcomes of a binary classification.

    A prediction counts as positive when it equals 1 and as correct when it
    equals the label at the same position. Positions ``0 .. len(label)-1`` are
    examined.

    Returns:
        ``(true_positives, false_positives, true_negatives, false_negatives)``
    """
    # keyed by (predicted positive?, prediction correct?)
    tally = {(True, True): 0, (True, False): 0, (False, True): 0, (False, False): 0}

    for position in range(len(label)):
        predicted_positive = bool(prediction[position] == 1)
        is_correct = bool(prediction[position] == label[position])
        tally[(predicted_positive, is_correct)] += 1

    return (
        tally[(True, True)],
        tally[(True, False)],
        tally[(False, True)],
        tally[(False, False)],
    )

def scores(thresholds,output,label):
    """Evaluate several decision thresholds and return the best one by F1.

    For every threshold the outputs strictly above it become 1 and the rest 0;
    accuracy, precision, recall and F1 are printed for each threshold.

    Args:
        thresholds: iterable of threshold values.
        output: array-like of predicted scores.
        label: array-like of 0/1 labels, parallel to ``output``.

    Returns:
        ``(thresh, best_f1, best_out)``: the best threshold, its F1 and the
        binarized outputs for it. When no threshold reaches an F1 above 0 the
        result is ``(0.0, 0.0, [])``.
    """
    output = np.asarray(output)
    label = np.asarray(label)

    best_f1 = 0.0
    best_out = []
    thresh = 0.0
    for th_val in thresholds:
        aux_output = np.where(output > th_val, 1, 0).astype(output.dtype)

        # Vectorized confusion counts: "positive" means the prediction is 1,
        # "correct" means it equals the label.
        predicted_positive = aux_output == 1
        correct = aux_output == label
        true_positives = int(np.count_nonzero(predicted_positive & correct))
        false_positives = int(np.count_nonzero(predicted_positive & ~correct))
        true_negatives = int(np.count_nonzero(~predicted_positive & correct))
        false_negatives = int(np.count_nonzero(~predicted_positive & ~correct))

        total = true_positives + true_negatives + false_positives + false_negatives
        accuracy = (true_positives + true_negatives) / total if total else 0.0

        # Without any true positive both ratios are floored at 0.0001. Testing
        # true_positives (rather than the number of predicted positives) also
        # covers tp == 0 with fp > 0, where precision would be exactly 0 and
        # the harmonic mean below would divide by zero.
        if true_positives == 0:
            precision = 0.0001
            recall = 0.0001
        else:
            precision = true_positives / (true_positives + false_positives)
            recall = true_positives / (true_positives + false_negatives)

        f1 = 2 / ((1 / precision) + (1 / recall))

        print('   Thresholds {:.2f}:: Acc: {:.4f} | Precision: {:.4f} | Recall: {:.4f} | F1: {:.4f}'.format(
        th_val, accuracy, precision, recall, f1))

        if f1 > best_f1:
            best_out = aux_output
            best_f1 = f1
            thresh = th_val
    return thresh,best_f1,best_out

In [None]:
def split_on_class(x,y, percentage = 80, random = True):
    """Split a dataset in two parts while keeping the class proportions.

    The samples of class 0 and of class 1 are split separately: the first
    ``percentage`` percent of each class go to the left part, the remainder to
    the right part.

    Args:
        x: indexable collection of samples.
        y: iterable of labels parallel to ``x``; ``int(label)`` must be 0 or 1.
        percentage: share (0-100) of every class placed in the left part.
        random: when true, both parts are shuffled with ``np.random.shuffle``.

    Returns:
        ``(x_left, x_right, y_left, y_right)``; the x parts are lists, the y
        parts are numpy arrays.
    """
    data_in_classes = [[],[]]

    for idx,val in enumerate(y):                    #separate the data with target 0 from 1
        data_in_classes[int(val)].append([idx,val]) #keep only (idx,val) to save memory

    left_split = []
    right_split = []
    for class_data in data_in_classes:          #split every class and add the halves to the two parts
        split = int(len(class_data)*(percentage/100))
        left_split += class_data[:split]
        right_split += class_data[split:]

    if random:                                  #shuffle so the order of the questions is lost
        np.random.shuffle(right_split)
        np.random.shuffle(left_split)

    def gather(pairs):
        # Indexes and labels are unpacked separately: packing both in a single
        # array turns the indexes into floats for float labels and fails on an
        # empty part.
        samples = [x[idx] for idx, _ in pairs]
        targets = np.array([val for _, val in pairs])
        return samples, targets

    x_left, y_left = gather(left_split)
    x_right, y_right = gather(right_split)

    return x_left,x_right,y_left,y_right

In [None]:
# Stratified split of the training questions into a train and a validation part
# (split_on_class defaults: 80% left part, shuffled). No separate test part is
# created; the commented-out lines belong to an earlier variant that had one.
#x_train, x_test, y_train, y_test  = split_on_class(questions_train,y)
x_train, x_val, y_train, y_val  = split_on_class(questions_train, y)
print("The leght of train data      :",len(x_train))
print("The leght of validation data :",len(x_val))
#print("The leght of test data       :",len(x_test))
print("There are",sum(y_train),"labeled positive in the train data")
print("There are",sum(y_val),"labeled positive in the validation data")
#print("There are",sum(y_test),"labeled positive in the test data")

In [None]:
def weights(x):
    """Build the classifier on top of the embedded questions.

    Inside the variable scope 'lreg' the input goes through a global max
    pooling layer, a 16-unit ReLU dense layer, dropout with rate 0.1 and a
    single sigmoid unit; the result is reshaped to a 1-D tensor.
    """
    with tf.variable_scope('lreg'):
        pooled = GlobalMaxPool1D()(x)
        hidden = Dense(16, activation="relu")(pooled)
        hidden = Dropout(0.1)(hidden)
        probability = Dense(1, activation="sigmoid")(hidden)
        flat_output = tf.reshape(probability, [-1])

    return flat_output

In [None]:
# Best validation result seen so far; the fields are filled in by run().
best_result = {'output':[],'f1_score':0,'threshold':0, 'epoch':0}

# Predictions for the test questions, produced by run().
q_output = []

def run():
    """Train the classifier and predict the test questions.

    Every epoch runs a training pass over ``x_train``/``y_train`` followed by
    an evaluation pass over ``x_val``/``y_val``; after each pass the average
    loss and the metrics printed by ``scores`` are shown. Whenever the
    validation F1 improves on ``best_result['f1_score']``, ``best_result`` is
    updated and the predictions for ``questions_test`` are recomputed with the
    current model.

    Returns:
        The raw (not thresholded) model outputs for ``questions_test`` from the
        best epoch; the same value is stored in the global ``q_output``.
    """
    global best_result
    global q_output

    batch_size = 900
    thresholds = np.arange(0.2, 0.5, 0.02)
    epochs = 15

    # A single graph serves training, validation and test prediction.
    x = tf.placeholder(tf.float32,shape=(None, 60, 300), name = 'x')
    y = tf.placeholder(tf.float32,shape=(None, ),name='y')
    predictions = weights(x)
    loss = binary_crossentropy(target = y, output = predictions)

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)
    init_op = tf.global_variables_initializer()

    with tf.Session() as session:

        session.run(init_op)

        for e in range(1, epochs + 1):
            print("\n" + "_" * 80)
            print("Epoch nr", e, ":")
            for phase in ['train', 'val']:
                average_loss = 0
                batch_count = 0
                output = np.array([])
                labels = np.array([])

                if phase == 'train':
                    batches = Batch(x_train, y_train, batch_size)
                else:
                    batches = Batch(x_val, y_val, batch_size)

                for batch in tqdm(batches):
                    batch, label =  batch
                    batch = embedding_lookup( batch )
                    label = np.array(label)

                    if phase == 'train':
                        _ ,batch_output ,batch_loss = session.run([optimizer, predictions, loss], {x:batch, y :label})
                    else:
                        batch_output ,batch_loss = session.run([predictions, loss], {x:batch, y :label})

                    labels = np.concatenate((labels, label), axis=0)
                    output = np.concatenate((output, batch_output), axis=0)
                    average_loss += np.average(batch_loss)
                    batch_count += 1

                print("The "+phase+" stats :")
                # Divide by the number of batches actually processed; the length
                # reported by the iterator is not reliable once it is consumed.
                print('   Loss: {:.4f}'.format(average_loss/max(batch_count, 1)))
                th,best_epoch_f1, best_epoch_output = scores(thresholds,output,labels)

                if phase == 'val' and best_epoch_f1 > best_result['f1_score']:
                    best_result['output'] = best_epoch_output
                    best_result['f1_score'] = best_epoch_f1
                    best_result['threshold'] = th
                    best_result['epoch'] = e
                    q_output = []
                    #get predictions for the test data from the competition
                    # (the test set has no labels, so the questions are passed
                    #  as labels too and the label slice is ignored)
                    batches = Batch(questions_test, questions_test, batch_size)
                    for batch in tqdm(batches):
                        batch, _ =  batch
                        batch = embedding_lookup( batch )
                        batch_output = session.run(predictions, {x: batch})
                        q_output = np.concatenate((q_output, batch_output), axis=0)
        return q_output

In [None]:
# Train the model. run() also returns the test predictions, so as the last
# expression of the cell the returned value is echoed in the cell output.
run()

In [None]:
# Summary of the best validation result recorded in best_result by run().
print('Thresholds {}:: F1: {:.4f}, epoch: {}'.format(best_result['threshold'], best_result['f1_score'],best_result['epoch']))