## Setting Up

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import re

# Use tqdm to show the progress of the pandas functions we use
tqdm.pandas()

from gensim.models import KeyedVectors as kv
from gensim.scripts.glove2word2vec import glove2word2vec

# Registry of the available pre-trained embeddings, keyed by embedding name.
#   'path'   : location of the embedding file; read by get_embeddings and glove_to_word2vec
#   'binary' : passed to gensim's load_word2vec_format; only read for 'googlenews'
#   'format' : descriptive only, not read by the code visible in this notebook
# 'glove_word2vec' is the output location used by glove_to_word2vec.
embedding_path_dict= {'googlenews':{
                            'path':'../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
                            'format':'word2vec',
                            'binary': True
                      },
                      'glove':{
                            'path':'../input/embeddings/glove.840B.300d/glove.840B.300d.txt',
                            'format': 'glove',
                            'binary': ''
                      },
                      'glove_word2vec':{
                            'path':'../input/glove.840B.300d.txt.word2vec',
                            'format': 'word2vec',
                            'binary': False
                      },
                      'wiki':{
                            'path': '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
                            'format': 'word2vec',
                            'binary': False
                      },
                      'paragram':{
                            'path': '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt',
                            'format': '',
                            'binary': False
                      }
                    }


## Get Training and Test Data

In [None]:
# Read the training and test CSV files and report their (rows, columns) shapes
train=pd.read_csv("../input/train.csv")
test= pd.read_csv("../input/test.csv")
print("Train shape:", train.shape)
print("Test shape:", test.shape)

In [None]:
train.head()

In [None]:
# Keep only the training questions that are longer than 100 characters.
# `test` is not filtered this way, so the length ranges of the two sets differ.
train = train.loc[train.question_text.str.len()>100]

In [None]:
len(train.loc[train['target']==0])

In [None]:
num_pos= len(train.loc[train['target']==1])
print(num_pos)

In [None]:
len(train['target'])

The positive and negative examples in the training set are very unbalanced, so the cells below downsample the negative class to the number of positive examples.

In [None]:
balanced_train= train.loc[train['target']==1]

In [None]:
# Balance the classes: all positive rows plus an equally sized sample of negative rows.
# pd.concat replaces DataFrame.append (removed in pandas 2.0). The frame is rebuilt
# from `train`, so re-running this cell does not append the negatives a second time.
# The fixed random_state makes the sample reproducible.
balanced_train = pd.concat(
    [train.loc[train['target']==1],
     train.loc[train['target']==0].sample(n=num_pos, random_state=42)],
    ignore_index=True)

In [None]:
len(balanced_train.loc[balanced_train['target']==1])

In [None]:
del train
import gc
gc.collect()

## Choose Word Embeddings

### Functions: Embedding-Related Functions

In [None]:
# Get word embeddings
def get_embeddings(embedding_path_dict, emb_name):
    """
    Load a pre-trained embedding as a word -> vector dictionary.

    :params embedding_path_dict: a dictionary mapping embedding names to their settings;
                'path' is read for every embedding and 'binary' for 'googlenews',
            emb_name: the name of the embedding to retrieve
                ('googlenews', 'glove', 'wiki' or 'paragram')
    :return embeddings_index: a dictionary mapping each word to its float32 vector;
            empty when emb_name is none of the names above
    """

    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

    def read_text_embedding(path, min_line_len=0, **open_kwargs):
        # One "word v1 v2 ..." entry per line. Lines of at most min_line_len characters
        # are skipped (used for glove/wiki, e.g. to drop a short header line).
        # The with-block guarantees the file handle is closed.
        with open(path, **open_kwargs) as emb_file:
            return dict(get_coefs(*line.rstrip("\n").split(" "))
                        for line in emb_file if len(line) > min_line_len)

    embeddings_index = {}
    if (emb_name == 'googlenews'):
        emb_path = embedding_path_dict[emb_name]['path']
        bin_flag = embedding_path_dict[emb_name]['binary']
        model = kv.load_word2vec_format(emb_path, binary=bin_flag)
        # Return a word -> vector mapping like the other branches; the bare `.vectors`
        # array has no words, so lookups and get_emb_stats cannot work on it.
        # gensim >= 4 names the vocabulary list index_to_key, older releases index2word.
        words = model.index_to_key if hasattr(model, 'index_to_key') else model.index2word
        embeddings_index = dict(zip(words, model.vectors))
    elif (emb_name in ['glove', 'wiki']):
        embeddings_index = read_text_embedding(embedding_path_dict[emb_name]['path'],
                                               min_line_len=100, encoding="utf8")
    elif (emb_name == 'paragram'):
        embeddings_index = read_text_embedding(embedding_path_dict[emb_name]['path'],
                                               encoding="utf8", errors='ignore')
    return embeddings_index

#Convert GLoVe format into word2vec format
def glove_to_word2vec(embedding_path_dict, emb_name='glove', output_emb='glove_word2vec'):
    """
    Run the glove2word2vec conversion on two files registered in embedding_path_dict.
    :params embedding_path_dict: a dictionary mapping embedding names to their settings,
            emb_name: name of the GloVe-format embedding; its 'path' is the input file
            output_emb: name of the converted embedding; its 'path' is the output file
    :return the value returned by the glove2word2vec script
    """
    source_path, target_path = (embedding_path_dict[name]['path']
                                for name in (emb_name, output_emb))
    return glove2word2vec(source_path, target_path)


In [None]:
# Get stats of a given embeddings index
def get_emb_stats(embeddings_index):
    """
    Summarise an embeddings index.

    :params embeddings_index: a dictionary mapping words to equally sized vectors
    :return emb_mean, emb_std: mean and standard deviation over all vector components,
            num_embs: number of vectors,
            emb_size: length of one vector
    :raises ValueError: (from np.stack) when embeddings_index is empty
    """

    # Put all embeddings in a numpy matrix.
    # np.stack needs a real sequence; recent numpy releases reject a dict view.
    all_embs = np.stack(list(embeddings_index.values()))

    # Get embedding stats
    emb_mean = all_embs.mean()
    emb_std = all_embs.std()

    num_embs, emb_size = all_embs.shape[0], all_embs.shape[1]

    return emb_mean, emb_std, num_embs, emb_size

### Functions: Tokenize Training Sentences

In [None]:
# Replacement table used by replace_contractions: every key found in a sentence is
# replaced by a space followed by the mapped value. Entries are applied in insertion
# order, so "won't" is expanded before the more general "n't" rule can touch it.
contr_dict={"I\'m": "I am",
            "won\'t": "will not",
            "\'s" : "", 
            "\'ll":"will",
            "\'ve":"have",
            "n\'t":"not",
            "\'re": "are",
            "\'d": "would",
            "y'all": "all of you",
            "Quoran": "Quora contributor",
            "quoran": "quora contributor"
            }

# Turn raw sentences into lists of tokens.
# A hand-written tokenizer gives us control over what counts as a word and lets us
# explore ways to match more of the pre-defined word embeddings.

def tokenize(sentences, restrict_to_len=-1):
    """
    :params sentences: iterable of strings,
            restrict_to_len: when positive, only sentences with more than this many
                             characters are tokenized; otherwise all sentences are kept
    :returns tok_sentences: list of list of tokens
    """
    token_pattern = r"[\w]+[']*[\w]+|[\w]+|[.,!?;]"
    keep_all = not (restrict_to_len > 0)

    tok_sentences = []
    for text in sentences:
        # the length is only inspected when a positive limit was requested
        if keep_all or len(text) > restrict_to_len:
            tok_sentences.append(re.findall(token_pattern, text))
    return tok_sentences

#Build the vocabulary given a list of sentence words
def get_vocab(sentences, verbose= True):
    """
    Count how often every word occurs.
    :param sentences: a list of list of words
    :param verbose: when False the tqdm progress bar is disabled
    :return: a dictionary of words and their frequency, in order of first appearance
    """
    counts = {}
    hide_bar = not verbose
    for tokens in tqdm(sentences, disable=hide_bar):
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def repl(m):
    """Return a run of '#' characters as long as the text matched by m."""
    matched_text = m.group()
    return '#' * len(matched_text)

#Convert numerals to a # sign
def convert_num_to_pound(sentences):
    """
    Replace every digit of numbers with two or more digits (not starting with 0) by '#'.

    :param sentences: pandas Series of strings; tqdm.pandas() must have been called
                      so that progress_apply is available
    :return: numpy array of the converted strings
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence
    number_pattern = re.compile(r"[1-9][\d]+")
    return sentences.progress_apply(lambda text: number_pattern.sub(repl, text)).values

def replace_contractions(sentences, contr_dict=contr_dict):
    """
    Expand contractions in every sentence.

    :param sentences: iterable of strings
    :param contr_dict: mapping contraction -> expansion; every occurrence of a key is
                       replaced by a space followed by its expansion, keys are applied
                       in the mapping's iteration order
    :return: list of the rewritten sentences
    """
    def expand(text):
        for contraction, expansion in contr_dict.items():
            text = text.replace(contraction, " " + expansion)
        return text

    return [expand(sentence) for sentence in sentences]

def convert_height(sentences):
    """
    Rewrite heights written as <digits>'<digits> (e.g. 5'4) as "<digits> foot <digits>".

    :param sentences: iterable of strings
    :return: list of the rewritten sentences
    """
    # Raw strings are required here: in a plain literal "\1" and "\2" are the control
    # characters chr(1) and chr(2), not backreferences to the captured groups.
    height_pattern = re.compile(r"(\d+)'(\d+)")
    return [height_pattern.sub(r"\1 foot \2", sent) for sent in sentences]

def convert_to_lower(sentences):
    """Return a list holding the lower-cased version of every sentence."""
    return [sent.lower() for sent in sentences]


### Functions: Train for an embedding

In [None]:
def get_emb_matrix(sentences, embeddings_index, emb_mean, emb_std,\
                   emb_size, max_num_tokens = 300000, word_index=None):
    """
    Build the embedding matrix for a vocabulary.

    Rows of words found in embeddings_index receive the pre-trained vector. Every other
    row keeps a value drawn from a normal distribution with emb_mean and emb_std, which
    is a less arbitrary initialization than zeros.

    :params sentences: list of token lists used to build the vocabulary
                       (ignored when word_index is given),
            embeddings_index: mapping word -> pre-trained vector,
            emb_mean, emb_std: parameters of the normal distribution for missing words,
            emb_size: length of one embedding vector,
            max_num_tokens: vocabulary size limit (maximum number of rows),
            word_index: optional mapping word -> row number. By default row i belongs
                        to the i-th distinct word in order of first appearance.
                        NOTE(review): the rows must line up with the integer ids fed to
                        the Embedding layer; pass the word -> id mapping of the
                        tokenizer that encodes the model input to guarantee that.
    :return embedding_matrix: numpy array of shape (number of rows, emb_size)
    """
    if word_index is None:
        # Default numbering: order of first appearance in the training vocabulary
        word_index = {word: row for row, word in enumerate(get_vocab(sentences).keys())}
        num_words = min(max_num_tokens, len(word_index))
    else:
        # Rows are addressed by the given numbers, so the matrix must reach the largest one
        num_words = min(max_num_tokens, max(word_index.values(), default=-1) + 1)

    embedding_matrix = np.random.normal(emb_mean, emb_std, (num_words, emb_size))

    # Copy the pre-trained vectors of the known words into their rows
    for word, row in word_index.items():
        # words beyond the size limit have no row
        if row >= num_words: continue
        if (word in embeddings_index):
            embedding_vector = embeddings_index[word]
            if embedding_vector is not None: embedding_matrix[row] = embedding_vector

    return embedding_matrix

In [None]:
def choose_emb_and_train(embedding_name,train, maxlen=100): #train should be balanced_train
    """
    Load an embedding and build the embedding matrix for the training questions.

    Despite its name this function does not train a model.

    :params embedding_name: key of the embedding in embedding_path_dict,
            train: DataFrame with a "question_text" column (should be balanced_train),
            maxlen: forwarded to tokenize as restrict_to_len, i.e. only questions with
                    more than this many characters contribute to the vocabulary
    :return embedding_matrix, emb_mean, emb_std, num_embs, emb_size
    """
    # Load the pre-trained vectors and report their statistics
    embeddings_index = get_embeddings(embedding_path_dict, embedding_name)
    emb_mean, emb_std, num_embs, emb_size = get_emb_stats(embeddings_index)
    print("mean: %5.5f\nstd: %5.5f\nnumber of embeddings: %d\nembedding vector size:%d"
          % (emb_mean, emb_std, num_embs, emb_size))

    # Google replaces digits in numbers > 9 with # signs, so mirror that for its vectors
    raw_questions = train["question_text"]
    if embedding_name == 'googlenews':
        raw_questions = convert_num_to_pound(raw_questions)

    # Heights such as 5'4 become the longer form "5 foot 4"
    sentences = convert_height(raw_questions)

    # Only paragram is lower-cased; GloVe has both cases in its embeddings
    if embedding_name == 'paragram':
        sentences = convert_to_lower(sentences)

    # Expand contractions, then split every question into tokens
    # (restrict_to_len is approximately the mean sentence length + 0.5 std)
    sentences = tokenize(replace_contractions(sentences), restrict_to_len=maxlen)

    # One row per vocabulary word
    embedding_matrix = get_emb_matrix(sentences, embeddings_index, emb_mean, emb_std,
                                      emb_size, max_num_tokens=300000)

    return embedding_matrix, emb_mean, emb_std, num_embs, emb_size

In [None]:
import statistics as st

# Get the mean, median, and maximum question length, as well as the standard deviation
def get_set_stats(given_sent_set):
    """
    Compute character-length statistics of a set of questions.

    :param given_sent_set: iterable of strings
    :return maxlen, minlen, mean_len, std_len, median: maximum, minimum, mean, sample
            standard deviation and median of the question lengths
    :raises statistics.StatisticsError: when fewer than two questions are given
    """
    # Wrap the input rather than the finished list: a single progress bar then tracks
    # the one pass that measures the questions, instead of replaying for every statistic.
    question_len = [len(x) for x in tqdm(given_sent_set)]

    maxlen = max(question_len)
    minlen = min(question_len)
    mean_len = st.mean(question_len)
    std_len = st.stdev(question_len)
    median = st.median(question_len)

    return maxlen, minlen, mean_len, std_len, median

In [None]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

def preprocess_input_sets(input_set, keras_tokenizer,\
                          num_words=0, maxlen=300, \
                          test=False, lower=False,\
                          conv_height=False, contractions=False):
    """
    Turn a question DataFrame into padded integer sequences (and labels).

    :params input_set: DataFrame with a "question_text" column and, unless test is
                       True, a "target" column,
            keras_tokenizer: tokenizer to reuse; when None a new Tokenizer is created
                             with num_words and fitted on input_set,
            num_words: vocabulary size of a newly created tokenizer (must be > 0 then),
            maxlen: length passed to pad_sequences,
            test: when True no labels are read,
            lower, conv_height, contractions: apply convert_to_lower, convert_height
                             and replace_contractions to the questions first
    :return X, Y, keras_tokenizer: padded sequences, labels (None when test is True)
            and the tokenizer that was used; (None, None, None) when no tokenizer
            could be created or used
    """
    # Missing questions become the placeholder text "_na_"
    X = input_set["question_text"].fillna("_na_").values

    if conv_height:
        # start by replacing heights such as 5'4 to a longer format (5 foot 4)
        X = convert_height(X)

    if lower:
        #convert capitals to lower case
        X = convert_to_lower(X)

    if contractions:
        # replace contractions
        X = replace_contractions(X)

    Y=None
    if not(test):
        # Get target classes
        # NOTE(review): a missing target becomes the string "_na_" — confirm that
        # the data never contains one, since the labels would no longer be numeric.
        Y = input_set["target"].fillna("_na_").values

    if (keras_tokenizer is None):
        if (num_words > 0):
            keras_tokenizer = Tokenizer(num_words= num_words)
            keras_tokenizer.fit_on_texts(list(X))
        else:
            print("Num words is required to create Keras Tokenizer object")
            return None, None, None

    try:
        X = keras_tokenizer.texts_to_sequences(X)
    except NameError:
        print("Tokenizer object not defined!")
        # every exit returns three values so that callers can always unpack
        # X, Y and the tokenizer
        return None, None, None

    ## Pad the sentences 
    X = pad_sequences(X, maxlen=maxlen)

    return X, Y, keras_tokenizer


## Model-related functions

In [None]:
def build_model (embedding_matrix, emb_size=300, max_len=100, voc_size=50000):
    """
    Build the two-branch recurrent classifier (not compiled).

    Both branches read the same embedded sequence. One stacks a bidirectional LSTM
    followed by a bidirectional GRU, the other uses the reverse order. Each branch is
    max-pooled over time and reduced by a small dense layer with dropout. The branches
    are concatenated and fed to a single sigmoid unit.

    :params embedding_matrix: initial weights of the Embedding layer,
            emb_size: embedding vector size,
            max_len: length of the input sequences,
            voc_size: number of rows of the Embedding layer
    :return the Keras Model
    """
    def recurrent_branch(first_cell, second_cell, features):
        # two stacked bidirectional layers -> max over time -> dense head with dropout
        branch = Bidirectional(first_cell(64, return_sequences=True))(features)
        branch = Bidirectional(second_cell(64, return_sequences=True))(branch)
        branch = GlobalMaxPool1D()(branch)
        branch = Dense(16, activation="relu")(branch)
        return Dropout(0.1)(branch)

    tokens_in = Input(shape=(max_len,))
    embedded = Embedding(voc_size, emb_size, weights=[embedding_matrix])(tokens_in)

    lstm_first = recurrent_branch(CuDNNLSTM, CuDNNGRU, embedded)
    gru_first = recurrent_branch(CuDNNGRU, CuDNNLSTM, embedded)

    merged = Concatenate()([lstm_first, gru_first])
    prediction = Dense(1, activation="sigmoid")(merged)

    return Model(inputs=tokens_in, outputs=prediction)

In [None]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

def train_network(model_name, model, train_X, train_Y,\
                  val_X, val_Y, \
                  batch_size = 1500, epochs = 10,\
                  monitor='val_loss', mode='min'):
    """
    Fit a compiled model with early stopping, checkpointing and learning-rate reduction.

    :params model_name: path the best model is saved to by ModelCheckpoint,
            model: compiled Keras model,
            train_X, train_Y: training inputs and labels,
            val_X, val_Y: validation inputs and labels,
            batch_size, epochs: passed to model.fit,
            monitor, mode: quantity watched by all three callbacks and whether it
                           should be minimised ('min') or maximised ('max')
    :return hist: the value returned by model.fit
    """
    early_stopping = EarlyStopping(patience=3, verbose=1, monitor=monitor, mode=mode)
    model_checkpoint = ModelCheckpoint(model_name, save_best_only=True, verbose=1, \
                                       monitor=monitor, mode=mode)
    # Watch the same quantity as the other callbacks; previously this callback ignored
    # the monitor/mode arguments and always used its own defaults.
    reduce_lr = ReduceLROnPlateau(monitor=monitor, mode=mode, factor=0.5, patience=3, \
                                  min_lr=0.0001, verbose=1)

    hist = model.fit(train_X, train_Y, batch_size=batch_size, epochs=epochs,\
                     validation_data=(val_X, val_Y), verbose=True, \
                     callbacks=[early_stopping,model_checkpoint, reduce_lr])

    return hist

### GloVe

#### Get embeddings and  Create Training Embedding Matrices

In [None]:
# maxlen: sequence length passed to preprocess_input_sets (padding) and build_model (input length)
# num_words: vocabulary size passed to preprocess_input_sets when a new tokenizer is created
maxlen = 100
num_words=50000

In [None]:
# Note: choose_emb_and_train forwards maxlen to tokenize(restrict_to_len=...), so here
# it is a minimum question length in characters, not a padded sequence length.
embedding_matrix,emb_mean,emb_std, num_embs, emb_size = \
choose_emb_and_train('glove',balanced_train, maxlen=100)

In [None]:
# print embedding stats
print("mean: %5.5f\nstd: %5.5f\nnumber of embeddings: %d\nembedding vector size:%d" \
      %(emb_mean,emb_std, num_embs, emb_size))

In [None]:
embedding_matrix.shape

#### Create Training and  Validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: the split is redone for every embedding, and the ensemble section combines
# the validation predictions of all models, so each split must select the same rows.
training_set, val_set = train_test_split(balanced_train, test_size=0.1, random_state=42)


### Get training, test, and val set stats

In [None]:
maxlen_train, minlen_train, mean_len_train, std_len_train, median_train = get_set_stats(training_set["question_text"])
print("Question Length Stats in Training set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_train, minlen_train, mean_len_train, median_train, std_len_train))

In [None]:
maxlen_val, minlen_val, mean_len_val, std_len_val, median_val = get_set_stats(val_set["question_text"])
print("Question Length Stats in Validation set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_val, minlen_val, mean_len_val, median_val, std_len_val))

In [None]:
maxlen_test, minlen_test, mean_len_test, std_len_test, median_test = get_set_stats(test["question_text"])
print("Question Length Stats in Test set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_test, minlen_test, mean_len_test, median_test, std_len_test))

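Hmm... the sets show the same mean, median, and std but different length ranges! One thing to check: `train` was restricted to questions longer than 100 characters before balancing, whereas `test` is unfiltered. Needs more investigation (**TODO**).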

### Tokenize, preprocess, and pad training, val, and test sets

In [None]:
# Convert token lists into sequences
train_X, train_Y, keras_tokenizer = preprocess_input_sets(training_set, None,\
                                                          num_words=50000,\
                                                          maxlen=maxlen, \
                                                          lower=False,\
                                                          conv_height=True, \
                                                          contractions=True)
# The validation set must be padded to the same length as the training set
# (without maxlen the function pads to its default of 300)
val_X, val_Y, _ = preprocess_input_sets(val_set, keras_tokenizer, maxlen=maxlen, \
                                        lower=False,\
                                        conv_height=True, contractions=True)


## Model

### Build the network

In [None]:
# Free up memory
#del embeddings_index
#import time, gc; gc.collect()
#time.sleep(10)

In [None]:
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.layers import Bidirectional, GlobalMaxPool1D, LSTM, Dense, Input
from keras.layers import Flatten
from keras.layers import CuDNNLSTM, CuDNNGRU, Concatenate, Dense,  Dropout
from keras.models import Model


# Of these settings only batch_size is referenced by the visible cells (it is passed to
# model.predict on the validation sets); the other names are not used in the visible notebook.
data_dim = 16
timesteps = 8
num_classes = 1 #10
batch_size = 32
num_mem_units = 100

In [None]:
import os
from keras.models import load_model

model_name='./glove_model.model'
if (os.path.isfile(model_name)):
    # Reuse the checkpoint written by an earlier run (`load` was never defined)
    model = load_model(model_name)
else:
    model = build_model(embedding_matrix,  emb_size = embedding_matrix.shape[1],\
                        max_len=maxlen, voc_size=embedding_matrix.shape[0])
    model.compile(loss='binary_crossentropy',   
                  optimizer='adam', #rmsprop',            
                  metrics=['accuracy'])

# summarize the model (summary() prints by itself; wrapping it in print() adds "None")
model.summary()

### Train the network

In [None]:
hist = train_network(model_name, model, train_X, train_Y,\
                      val_X, val_Y, \
                      batch_size = 1500, epochs = 10,\
                      monitor='val_loss', mode='min')

In [None]:
#model = load_model('./glove_model.model')

In [None]:
# count_nonzero(val_Y) counts the positive labels (target == 1); the rest are negative
print("total preds:%d negative:%d pos:%d\n"% (len(val_Y), len(val_Y)-np.count_nonzero(val_Y), np.count_nonzero(val_Y)))

### Find best threshold cutoff

In [None]:
from sklearn import metrics

pred_val_Y= model.predict([val_X], batch_size=batch_size, verbose=1)

# Scan the thresholds and remember the one with the highest validation F1 score
max_f1 = 0.0
max_thresh = 0.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    curr_f1 = metrics.f1_score(val_Y, (pred_val_Y>thresh).astype(int))
    # the comparison was inverted before, so the best score was never recorded
    if curr_f1>max_f1:
        max_f1 = curr_f1
        max_thresh = thresh
    print("Threshold:%1.2f F1 Score:%5.5f"%(thresh, curr_f1))
print("Best threshold:%1.2f F1 Score:%5.5f"%(max_thresh, max_f1))

## Run on Test

In [None]:
test= pd.read_csv("../input/test.csv")
print("Test shape:", test.shape)

In [None]:
# Preprocess the test questions exactly like the training questions of this model:
# same padding length and same text clean-up steps
pred_X, _, _ = preprocess_input_sets(test, keras_tokenizer, maxlen=maxlen, test=True,\
                                     lower=False, conv_height=True, contractions=True)

In [None]:
# Predict in large batches; one question per batch makes the test pass needlessly slow
pred_Y= model.predict([pred_X], batch_size=1024, verbose=1)

In [None]:
len(pred_Y[pred_Y<0.5])

In [None]:
len(pred_Y[pred_Y>=0.5])

In [None]:
tmp_pred = (pred_Y>(0.5)).astype(int)

In [None]:
len(tmp_pred[tmp_pred==0])

### Save Test

In [None]:
len(pred_X)

In [None]:
len(pred_Y)

In [None]:
len(test["qid"])

In [None]:
test_res= pd.DataFrame({"qid":test["qid"].values})
test_pred = (pred_Y>0.5).astype(int)
test_res['prediction'] = test_pred
test_res.head()


In [None]:
#test_res.to_csv("glove_submission.csv", index=False)

## LSTM with Paragram Embeddings

### Get Embeddings Index and  Tokenize Training Sentences

In [None]:
embedding_name = 'paragram'
embedding_matrix,emb_mean,emb_std, num_embs, emb_size = \
choose_emb_and_train(embedding_name,balanced_train, maxlen=100)

In [None]:
# print embedding stats
print("mean: %5.5f\nstd: %5.5f\nnumber of embeddings: %d\nembedding vector size:%d" \
      %(emb_mean,emb_std, num_embs, emb_size))

In [None]:
embedding_matrix.shape

#### Create Training and  Validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: the split is redone for every embedding, and the ensemble section combines
# the validation predictions of all models, so each split must select the same rows.
training_set, val_set = train_test_split(balanced_train, test_size=0.1, random_state=42)


### Get training, test, and val set stats

In [None]:
maxlen_train, minlen_train, mean_len_train, std_len_train, median_train = get_set_stats(training_set["question_text"])
print("Question Length Stats in Training set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_train, minlen_train, mean_len_train, median_train, std_len_train))

In [None]:
maxlen_val, minlen_val, mean_len_val, std_len_val, median_val = get_set_stats(val_set["question_text"])
print("Question Length Stats in Validation set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_val, minlen_val, mean_len_val, median_val, std_len_val))

In [None]:
maxlen_test, minlen_test, mean_len_test, std_len_test, median_test = get_set_stats(test["question_text"])
print("Question Length Stats in Test set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_test, minlen_test, mean_len_test, median_test, std_len_test))

### Tokenize, preprocess, and pad training, val, and test sets

In [None]:
# Convert token lists into sequences
# Pad to maxlen: the model below is built with max_len=maxlen, while the function's
# default padding length is 300
train_X, train_Y, keras_tokenizer = preprocess_input_sets(training_set, None, num_words, \
                                                         maxlen=maxlen, \
                                                         lower=True,\
                                                         conv_height=True, \
                                                         contractions=True)
val_X, val_Y, _ = preprocess_input_sets(val_set, keras_tokenizer, maxlen=maxlen, \
                                        lower=True,\
                                        conv_height=True, contractions=True)


### Build the Network

In [None]:
from keras.models import load_model

model_name='./para_model.model'
if (os.path.isfile(model_name)):
    # Reuse the checkpoint written by an earlier run (`load` was never defined)
    model = load_model(model_name)
else:
    # Assign to `model`: the new network used to be stored in `model_p`, so the
    # previous section's model was compiled and trained again instead.
    model = build_model(embedding_matrix, emb_size = embedding_matrix.shape[1],\
                    max_len=maxlen, voc_size=embedding_matrix.shape[0])
    model.compile(loss='binary_crossentropy',   
                  optimizer='adam', #rmsprop',            
                  metrics=['accuracy'])
# summarize the model (summary() prints by itself; wrapping it in print() adds "None")
model.summary()

### Train the Network

In [None]:
hist = train_network(model_name, model, train_X, train_Y,\
                  val_X, val_Y, \
                  batch_size = 1500, epochs = 10,\
                  monitor='val_loss', mode='min')

In [None]:
# count_nonzero(val_Y) counts the positive labels (target == 1); the rest are negative
print("total preds:%d negative:%d pos:%d\n"% (len(val_Y), len(val_Y)-np.count_nonzero(val_Y), np.count_nonzero(val_Y)))

### Find best threshold cutoff

In [None]:
from sklearn import metrics

para_pred_val_Y= model.predict([val_X], batch_size=batch_size, verbose=1)

# Scan the thresholds and remember the one with the highest validation F1 score
max_f1 = 0.0
max_thresh = 0.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    curr_f1 = metrics.f1_score(val_Y, (para_pred_val_Y>thresh).astype(int))
    # the comparison was inverted before, so the best score was never recorded
    if curr_f1>max_f1:
        max_f1 = curr_f1
        max_thresh = thresh
    print("Threshold:%1.2f F1 Score:%5.5f"%(thresh, curr_f1))
print("Best threshold:%1.2f F1 Score:%5.5f"%(max_thresh, max_f1))

## Run on Test

In [None]:
test= pd.read_csv("../input/test.csv")
print("Test shape:", test.shape)

In [None]:
# Preprocess the test questions exactly like the training questions of this model:
# same padding length and same text clean-up steps
para_pred_X, _, _ = preprocess_input_sets(test, keras_tokenizer, maxlen=maxlen, test=True,\
                                          lower=True, conv_height=True, contractions=True)

In [None]:
# Predict in large batches; one question per batch makes the test pass needlessly slow
para_pred_Y= model.predict([para_pred_X], batch_size=1024, verbose=1)

In [None]:
len(para_pred_Y[para_pred_Y<0.5])

In [None]:
len(para_pred_Y[para_pred_Y>=0.5])

In [None]:
tmp_pred = (para_pred_Y>(0.5)).astype(int)

In [None]:
len(tmp_pred[tmp_pred==0])

### Save Test

In [None]:
len(para_pred_X)

In [None]:
len(para_pred_Y)

In [None]:
len(test["qid"])

In [None]:
para_test_res= pd.DataFrame({"qid":test["qid"].values})
para_test_pred = (para_pred_Y>0.5).astype(int)
para_test_res['prediction'] = para_test_pred
para_test_res.head()


In [None]:
#test_res.to_csv("para_submission.csv", index=False)

## LSTM with Wiki Embeddings

### Get Wiki Embeddings Index

In [None]:
embedding_name = 'wiki'
embedding_matrix,emb_mean,emb_std, num_embs, emb_size = \
choose_emb_and_train(embedding_name,balanced_train, maxlen=100)
import gc; gc.collect()

In [None]:
# Get embedding stats
print("mean: %5.5f\nstd: %5.5f\nnumber of embeddings: %d\nembedding vector size:%d" \
      %(emb_mean,emb_std, num_embs, emb_size))

In [None]:
embedding_matrix.shape

#### Create Training and  Validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: the split is redone for every embedding, and the ensemble section combines
# the validation predictions of all models, so each split must select the same rows.
training_set, val_set = train_test_split(balanced_train, test_size=0.1, random_state=42)


### Get training, test, and val set stats

In [None]:
maxlen_train, minlen_train, mean_len_train, std_len_train, median_train = get_set_stats(training_set["question_text"])
print("Question Length Stats in Training set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_train, minlen_train, mean_len_train, median_train, std_len_train))

In [None]:
maxlen_val, minlen_val, mean_len_val, std_len_val, median_val = get_set_stats(val_set["question_text"])
print("Question Length Stats in Validation set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_val, minlen_val, mean_len_val, median_val, std_len_val))

In [None]:
maxlen_test, minlen_test, mean_len_test, std_len_test, median_test = get_set_stats(test["question_text"])
print("Question Length Stats in Test set:\n")
print("\tmaximum:%d\n\tminimum:%d\n\tmean:%d\n\tmedian:%d\n\tstd:%d"% \
      (maxlen_test, minlen_test, mean_len_test, median_test, std_len_test))

### Tokenize, preprocess, and pad training, val, and test sets

In [None]:
# Convert token lists into sequences
# Pad to maxlen: the model below is built with max_len=maxlen, while the function's
# default padding length is 300
train_X, train_Y, keras_tokenizer = preprocess_input_sets(training_set, None, num_words, \
                                                         maxlen=maxlen, \
                                                         lower=True,\
                                                         conv_height=True, \
                                                         contractions=True)
val_X, val_Y, _ = preprocess_input_sets(val_set, keras_tokenizer, maxlen=maxlen, \
                                        lower=True,\
                                        conv_height=True, contractions=True)


### Build the Network

In [None]:
from keras.models import load_model

model_name='./wiki_model.model'

if (os.path.isfile(model_name)):
    # Reuse the checkpoint written by an earlier run (`load` was never defined)
    model = load_model(model_name)
else:
    # Assign to `model`: the new network used to be stored in `model_p`, so the
    # previous section's model was compiled and trained again instead.
    model = build_model(embedding_matrix, emb_size = embedding_matrix.shape[1],\
                    max_len=maxlen, voc_size=embedding_matrix.shape[0])
    model.compile(loss='binary_crossentropy',   
                  optimizer='adam', #rmsprop',            
                  metrics=['accuracy'])
# summarize the model (summary() prints by itself; wrapping it in print() adds "None")
model.summary()

### Train the Network

In [None]:
hist = train_network(model_name, model, train_X, train_Y,\
                  val_X, val_Y, \
                  batch_size = 1500, epochs = 10,\
                  monitor='val_loss', mode='min')

In [None]:
# count_nonzero(val_Y) counts the positive labels (target == 1); the rest are negative
print("total preds:%d negative:%d pos:%d\n"% (len(val_Y), len(val_Y)-np.count_nonzero(val_Y), np.count_nonzero(val_Y)))

### Find best threshold cutoff

In [None]:
from sklearn import metrics

wiki_pred_val_Y= model.predict([val_X], batch_size=batch_size, verbose=1)

# Scan the thresholds and remember the one with the highest validation F1 score
max_f1 = 0.0
max_thresh = 0.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    curr_f1 = metrics.f1_score(val_Y, (wiki_pred_val_Y>thresh).astype(int))
    # the comparison was inverted before, so the best score was never recorded
    if curr_f1>max_f1:
        max_f1 = curr_f1
        max_thresh = thresh
    print("Threshold:%1.2f F1 Score:%5.5f"%(thresh, curr_f1))
print("Best threshold:%1.2f F1 Score:%5.5f"%(max_thresh, max_f1))

## Run on Test

In [None]:
test= pd.read_csv("../input/test.csv")
print("Test shape:", test.shape)

In [None]:
# Preprocess the test questions exactly like the training questions of this model:
# same padding length and same text clean-up steps
wiki_pred_X, _, _ = preprocess_input_sets(test, keras_tokenizer, maxlen=maxlen, test=True,\
                                          lower=True, conv_height=True, contractions=True)

In [None]:
# Predict in large batches; one question per batch makes the test pass needlessly slow
wiki_pred_Y= model.predict([wiki_pred_X], batch_size=1024, verbose=1)

In [None]:
len(wiki_pred_Y[wiki_pred_Y<0.5])

In [None]:
len(wiki_pred_Y[wiki_pred_Y>=0.5])

In [None]:
tmp_pred = (wiki_pred_Y>(0.5)).astype(int)

In [None]:
len(tmp_pred[tmp_pred==0])

### Save Test

In [None]:
# Number of test inputs of this section (pred_X belongs to the GloVe section)
len(wiki_pred_X)

In [None]:
len(wiki_pred_Y)

In [None]:
len(test["qid"])

In [None]:
test_res= pd.DataFrame({"qid":test["qid"].values})
test_pred = (wiki_pred_Y>0.5).astype(int)
test_res['prediction'] = test_pred
test_res.head()


In [None]:
#test_res.to_csv("wiki_submission.csv", index=False)

## Combine Results

In [None]:
# Blend the validation probabilities of the three models with equal weights
ens_val_pred = (0.33 * wiki_pred_val_Y) + (0.33 * para_pred_val_Y) + (0.33 * pred_val_Y)

# Scan the thresholds and remember the one with the highest validation F1 score
max_f1 = 0.0
max_thresh = 0.0
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    curr_f1 = metrics.f1_score(val_Y, (ens_val_pred>thresh).astype(int))
    # the comparison was inverted before, so the best score was never recorded
    if curr_f1>max_f1:
        max_f1 = curr_f1
        max_thresh = thresh
    print("Threshold:%1.2f F1 Score:%5.5f"%(thresh, curr_f1))
print("Best threshold:%1.2f F1 Score:%5.5f"%(max_thresh, max_f1))

In [None]:
# Blend the test probabilities of the three models with equal weights of 0.33
# (the weights sum to 0.99), then turn them into 0/1 predictions.
# NOTE(review): the 0.35 cut-off is hard-coded; it is not taken from the threshold
# search on the validation set.
ens_test_pred = (0.33 * wiki_pred_Y) + (0.33 * para_pred_Y) + (0.33 * pred_Y)
ens_test_pred = (ens_test_pred>0.35).astype(int)
ens_test_res= pd.DataFrame({"qid":test["qid"].values})
ens_test_res['prediction'] = ens_test_pred
ens_test_res.head()

In [None]:
len(ens_test_pred[ens_test_pred==1])

In [None]:
ens_test_res.to_csv("submission.csv", index=False)

### Acknowledgments

* [http://www.kaggle.com/nikhilroxtomar/playing-with-embeddings-using-lstm-and-cnn](http://www.kaggle.com/nikhilroxtomar/playing-with-embeddings-using-lstm-and-cnn)
* [https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings)
* [https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings)