In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# Any results you write to the current directory are saved as output.

In [None]:
# for colab
# !pip install tensorflow-gpu==2.1.0-rc0

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import tensorflow_hub as hub
import tensorflow as tf
# bert_tokenization is the BERT tokenizer script from TensorFlow
import bert_tokenization as tokenization
import tensorflow.keras.backend as K
import gc
import os
from scipy.stats import spearmanr
from math import floor, ceil
from tensorflow.keras.models import load_model

# Print NumPy floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)


In [3]:
# Directory holding the competition CSV files.
PATH = '../input/google-quest-challenge/'
# Directory holding the pretrained BERT model (also loaded by the hub layer).
BERT_PATH = '../input/bert-uncased/'
# Derive the vocabulary path from BERT_PATH so the tokenizer and the model
# cannot silently point at two different directories.
# NOTE(review): the second positional argument is presumably `do_lower_case`
# (uncased model) -- confirm against bert_tokenization.FullTokenizer.
tokenizer = tokenization.FullTokenizer(BERT_PATH + 'assets/vocab.txt', True)
# Fixed length (in tokens) of every padded sequence fed to the model.
MAX_SEQUENCE_LENGTH = 512

In [4]:
def _read_competition_csv(file_name):
    """Load one CSV file from the competition directory given by PATH."""
    return pd.read_csv(PATH + file_name)


df_train = _read_competition_csv('train.csv')
df_test = _read_competition_csv('test.csv')
# The sample submission is read twice so that df_sub and df_dummy are two
# independent frames that can be modified separately.
df_sub = _read_competition_csv('sample_submission.csv')
df_dummy = _read_competition_csv('sample_submission.csv')

In [5]:
_train_columns = df_train.columns
# Target columns: everything from column position 11 to the end of train.csv.
output_categories = _train_columns[11:].tolist()
# Text input columns, selected by position (1, 2 and 5).
input_categories = _train_columns[[1, 2, 5]].tolist()

# Bert Inputs

In [None]:
## get mask for sequence
def _get_masks(tokens, max_seq_length):
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return [1]*len(tokens) + [0] * (max_seq_length - len(tokens))

## get segments to seperate question and answer
def _get_segments(tokens, max_seq_length):
    if len(tokens)>max_seq_length:
        raise IndexError("Token length more than max seq length!")
    segments = []
    first_sep = True
    current_segment_id = 0
    for token in tokens:
        segments.append(current_segment_id)
        ## we have 2 [SEP] tokens before answer so the first one we ignored
        if token == "[SEP]":
            if first_sep:
                first_sep = False 
            else:
                current_segment_id = 1
    return segments + [0] * (max_seq_length - len(tokens))

## get id from tokenizer for each tokens
def _get_ids(tokens, tokenizer, max_seq_length):
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_ids = token_ids + [0] * (max_seq_length-len(token_ids))
    return input_ids

# Preprocessing

In [6]:
title_len = 30
question_len = 239
answer_len = 239


def _trim_input(title, question, answer, max_sequence_length, 
                t_max_len=title_len, q_max_len=question_len, a_max_len=answer_len):

    t = tokenizer.tokenize(title)
    q = tokenizer.tokenize(question)
    a = tokenizer.tokenize(answer)
    
    t_len = len(t)
    q_len = len(q)
    a_len = len(a)

    if (t_len+q_len+a_len+4) > max_sequence_length:
        
        if t_max_len > t_len:
            t_new_len = t_len
            a_max_len = a_max_len + floor((t_max_len - t_len)/2)
            q_max_len = q_max_len + ceil((t_max_len - t_len)/2)
        else:
            t_new_len = t_max_len
      
        if a_max_len > a_len:
            a_new_len = a_len 
            q_new_len = q_max_len + (a_max_len - a_len)
        elif q_max_len > q_len:
            a_new_len = a_max_len + (q_max_len - q_len)
            q_new_len = q_len
        else:
            a_new_len = a_max_len
            q_new_len = q_max_len
            
            
        if t_new_len+a_new_len+q_new_len+4 != max_sequence_length:
            raise ValueError("New sequence length should be %d, but is %d" 
                             % (max_sequence_length, (t_new_len+a_new_len+q_new_len+4)))
        
        t = t[:t_new_len]
        q = q[:q_new_len]
        a = a[:a_new_len]
    
    return t, q, a

def _convert_to_bert_inputs(title, question, answer, tokenizer, max_sequence_length):
    ## we combine all title, question and answer into 1 sequence to fit into the model
    stoken = ["[CLS]"] + title + ["[SEP]"] + question + ["[SEP]"] + answer + ["[SEP]"]

    input_ids = _get_ids(stoken, tokenizer, max_sequence_length)
    input_masks = _get_masks(stoken, max_sequence_length)
    input_segments = _get_segments(stoken, max_sequence_length)

    return [input_ids, input_masks, input_segments]

def compute_input_arays(df, columns, tokenizer, max_sequence_length):
    """Encode every row of ``df`` into the three model input arrays.

    Each row's ``question_title``, ``question_body`` and ``answer`` values are
    trimmed with ``_trim_input`` and encoded with ``_convert_to_bert_inputs``.

    Args:
        df: DataFrame holding the text columns.
        columns: Columns to select before iterating; must include
            ``question_title``, ``question_body`` and ``answer``.
        tokenizer: Tokenizer passed to ``_convert_to_bert_inputs``. Note that
            ``_trim_input`` is called without it and uses the module-level
            ``tokenizer`` instead.
        max_sequence_length: Padded length of every encoded sequence.

    Returns:
        ``[input_ids, input_masks, input_segments]``, each an ``int32`` array
        with one row per row of ``df``.
    """
    input_ids, input_masks, input_segments = [], [], []
    # iterrows() is a generator with no length, so pass the row count
    # explicitly; otherwise the progress bar cannot show a total or an ETA.
    for _, instance in tqdm(df[columns].iterrows(), total=len(df)):
        t, q, a = instance.question_title, instance.question_body, instance.answer

        t, q, a = _trim_input(t, q, a, max_sequence_length)

        ids, masks, segments = _convert_to_bert_inputs(t, q, a, tokenizer, max_sequence_length)
        input_ids.append(ids)
        input_masks.append(masks)
        input_segments.append(segments)

    return [np.asarray(input_ids, dtype=np.int32),
            np.asarray(input_masks, dtype=np.int32),
            np.asarray(input_segments, dtype=np.int32)]


def compute_output_arrays(df, columns):
    """Return the selected ``columns`` of ``df`` as a NumPy array."""
    selected = df[columns]
    return np.asarray(selected)

In [7]:
def get_spearman(y, y_pred, numical):
    """Mean column-wise Spearman correlation between targets and predictions.

    Args:
        y: 2-D array of true values, one column per target.
        y_pred: 2-D array of predictions with the same column layout.
        numical: Number of decimals both arrays are rounded to before the
            correlation is computed. A value of 0 means *no rounding at all*
            (not rounding to integers).

    Returns:
        The average Spearman correlation over all columns whose correlation
        is defined. Columns with a NaN correlation are skipped. If every
        column is skipped, NaN is returned instead of dividing by zero.
    """
    if numical != 0:
        y = np.round(y, numical)
        y_pred = np.round(y_pred, numical)

    rhos = []
    for col in range(y_pred.shape[1]):
        rho = spearmanr(y_pred[:, col], y[:, col]).correlation
        if np.isnan(rho):
            # Correlation is undefined for this column; leave it out of the mean.
            continue
        rhos.append(rho)

    if not rhos:
        # No column had a defined correlation (e.g. all predictions became
        # constant after rounding); report NaN rather than crash.
        return float('nan')
    return sum(rhos) / len(rhos)


class CustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that reports validation Spearman rho after every epoch.

    After each epoch the model predicts on the validation inputs, the
    prediction is added to a running list, and the mean of all predictions
    collected so far in this training run is scored with ``get_spearman`` for
    ``numical`` values 0 to 3.
    """

    def __init__(self, valid_data, batch_size=16, fold=None):
        """
        Args:
            valid_data: Pair ``(valid_inputs, valid_outputs)``.
            batch_size: Batch size used for validation prediction.
            fold: Fold identifier; stored on the instance but not otherwise used.
        """
        super().__init__()

        self.valid_inputs = valid_data[0]
        self.valid_outputs = valid_data[1]

        self.batch_size = batch_size
        self.fold = fold

    def on_train_begin(self, logs=None):
        # Start every training run with an empty prediction history.
        self.valid_predictions = []

    def on_epoch_end(self, epoch, logs=None):
        self.valid_predictions.append(
            self.model.predict(self.valid_inputs, batch_size=self.batch_size))

        # Average over all epochs so far; computed once and reused for every
        # rounding level below.
        averaged_predictions = np.average(self.valid_predictions, axis=0)

        # get_spearman treats 0 as "no rounding"; 1-3 are decimal places.
        for numical in range(4):
            rho_val = get_spearman(self.valid_outputs, averaged_predictions, numical)
            print("\n %d validation rho: %.4f" % (numical, rho_val))
        

def bert_model():
    """Build the BERT + stacked BiLSTM network (not compiled).

    Three int32 inputs of length ``MAX_SEQUENCE_LENGTH`` (word ids, masks,
    segment ids) go through the trainable hub layer loaded from ``BERT_PATH``.
    Its second output feeds two bidirectional LSTMs (768 units each), a
    dropout of 0.2 and a 30-unit sigmoid output layer.

    Returns:
        A ``tf.keras.models.Model``.
    """
    keras_layers = tf.keras.layers

    def _token_input(name):
        # One int32 input tensor of fixed sequence length.
        return keras_layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name=name)

    # Creation order and names are kept stable so saved weights still line up.
    model_inputs = [
        _token_input('input_word_ids'),
        _token_input('input_masks'),
        _token_input('input_segments'),
    ]

    bert_layer = hub.KerasLayer(BERT_PATH, trainable=True)

    # Only the second output of the hub layer is used; the first is discarded.
    _, sequence_output = bert_layer(model_inputs)

    features = keras_layers.Bidirectional(
        keras_layers.LSTM(768, return_sequences=True))(sequence_output)
    features = keras_layers.Bidirectional(keras_layers.LSTM(768))(features)
    features = keras_layers.Dropout(0.2)(features)
    predictions = keras_layers.Dense(
        30, activation="sigmoid", name="dense_output")(features)

    return tf.keras.models.Model(inputs=model_inputs, outputs=predictions)
        
def train_and_predict(model, train_data, valid_data,
                      learning_rate, epochs, batch_size, loss_function, fold):
    """Compile and fit ``model``, scoring the validation data after every epoch.

    Args:
        model: Keras model to train (compiled here).
        train_data: Pair ``(train_inputs, train_outputs)``.
        valid_data: Pair ``(valid_inputs, valid_outputs)``.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of training epochs.
        batch_size: Batch size for both training and validation prediction.
        loss_function: Loss passed to ``model.compile``.
        fold: Fold identifier, forwarded to the callback.

    Returns:
        The ``CustomCallback`` instance used during training (not a Keras
        ``History``); its ``valid_predictions`` holds one prediction array
        per epoch.
    """
    custom_callback = CustomCallback(
        valid_data=(valid_data[0], valid_data[1]),
        batch_size=batch_size,
        # Forward the caller's fold instead of a hard-coded None.
        fold=fold)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss=loss_function, optimizer=optimizer)
    model.fit(train_data[0], train_data[1], epochs=epochs,
              batch_size=batch_size, callbacks=[custom_callback])

    return custom_callback

# GroupKFold
A 10-fold GroupKFold split grouped by `question_body`, so that rows sharing the same `question_body` never end up in both the training and the validation set of a fold. This prevents information leakage.

In [8]:
# Encode the targets and the BERT inputs for train and test once, up front.
outputs = compute_output_arrays(df_train, output_categories)
inputs = compute_input_arays(df_train, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)
test_inputs = compute_input_arays(df_test, input_categories, tokenizer, MAX_SEQUENCE_LENGTH)

# `split` returns a one-shot generator of (train_idx, valid_idx) pairs, so
# `gkf` can only be iterated once. The question body is both the data and the
# grouping key.
question_bodies = df_train.question_body
gkf = GroupKFold(n_splits=10).split(X=question_bodies, groups=question_bodies)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




# Training

In [9]:
for fold, (train_idx, valid_idx) in enumerate(gkf):

    # Reset Keras state left over from the previous fold, then build a fresh model.
    K.clear_session()
    model = bert_model()

    # `inputs` holds three arrays (ids, masks, segments); slice each per fold.
    train_inputs = [inputs[i][train_idx] for i in range(3)]
    train_outputs = outputs[train_idx]

    valid_inputs = [inputs[i][valid_idx] for i in range(3)]
    valid_outputs = outputs[valid_idx]

    # Despite the name, `history` is the CustomCallback returned by train_and_predict.
    history = train_and_predict(model,
                      train_data=(train_inputs, train_outputs),
                      valid_data=(valid_inputs, valid_outputs),
                      learning_rate=3e-5, epochs=5, batch_size=8,
                      loss_function='binary_crossentropy', fold=fold)
    # Must be an f-string keyed on `fold`: the plain literal wrote every fold
    # to the same file named 'bert_model_bilstm_fold_{i}.h5', so each fold
    # overwrote the previous one.
    model.save_weights(f'bert_model_bilstm_fold_{fold}.h5')

# Inference

In [10]:
def _load_fold_model(i):
    """Build a fresh network and load the saved weights of fold ``i`` into it."""
    fold_model = bert_model()
    fold_model.load_weights(f'../input/bert-10-fold/bert_model_bilstm_fold_{i}.h5')
    return fold_model


# One model per fold, for folds 0 to 9.
models = [_load_fold_model(i) for i in range(0, 10)]

In [None]:
# Collect one test prediction array per fold model, in the order of `models`.
test_predictions = []
for model in models:
    # NOTE(review): the Keras session is cleared while the already-loaded
    # models are still in use -- confirm this does not invalidate them.
    K.clear_session()
    test_predictions.append(model.predict(test_inputs, batch_size=8, verbose=0))

In [None]:
# Run a garbage-collection pass (presumably to release memory after inference).
gc.collect()

In [None]:
# `final_predictions` was not defined anywhere in the notebook, so this cell
# failed on a fresh kernel. Build it by averaging the per-fold test
# predictions (assumed to be the intended ensemble -- confirm).
final_predictions = np.mean(test_predictions, axis=0)

# Fill every column after the first in both copies of the sample submission.
df_dummy.iloc[:, 1:] = final_predictions
df_sub.iloc[:, 1:] = final_predictions

# PostProcessing

Any column with fewer than 5 unique values (counted after rounding to 1 decimal place) is rounded to 2 decimal places; all other columns are rounded to 1 decimal place.

In [13]:
# Round the scratch copy of the predictions to one decimal place.
df_dummy.iloc[:, 1:] = df_dummy.iloc[:, 1:].apply(lambda column: np.round(column, 1))

# Target columns left with at most 4 distinct values after that rounding.
single_values_col = [
    cols for cols in output_categories
    if len(df_dummy[cols].unique()) <= 4
]

In [15]:
# Columns flagged in single_values_col keep 2 decimals; all others keep 1.
for cols in output_categories:
    decimals = 2 if cols in single_values_col else 1
    df_sub[cols] = df_sub[cols].apply(lambda x, d=decimals: np.round(x, d))

In [None]:
# Count the target columns of the rounded submission that hold a single
# distinct value (presumably a sanity check -- a constant column carries no
# ranking information).
single_values_col = [
    cols for cols in output_categories
    if len(df_sub[cols].unique()) == 1
]
len(single_values_col)

In [None]:
# Write the submission file without the index column, then preview the first rows.
df_sub.to_csv('submission.csv', index=False)
df_sub.head()

Public Score 
0.41789

Private Score
0.38495