In [1]:
# https://www.kaggle.com/xhlulu/tf2-qa-lstm-for-long-answers-predictions

In [2]:
import os
import json
import gc
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Masking
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tqdm import tqdm_notebook as tqdm
import fasttext
from tensorflow.keras.models import load_model

# Functions

In [3]:
def build_train(train_path, n_rows=200000, sampling_rate=15):
    with open(train_path) as f:
        processed_rows = []

        for i in tqdm(range(n_rows)):
            line = f.readline()
            if not line:
                break

            line = json.loads(line)

            text = line['document_text'].split(' ')
            question = line['question_text']
            annotations = line['annotations'][0]

            for i, candidate in enumerate(line['long_answer_candidates']):
                label = i == annotations['long_answer']['candidate_index']

                start = candidate['start_token']
                end = candidate['end_token']

                if label or (i % sampling_rate == 0):
                    processed_rows.append({
                        'text': " ".join(text[start:end]),
                        'is_long_answer': label,
                        'question': question,
                        'annotation_id': annotations['annotation_id']
                    })

        train = pd.DataFrame(processed_rows)
        
        return train

In [4]:
def build_test(test_path):
    with open(test_path) as f:
        processed_rows = []

        for line in tqdm(f):
            line = json.loads(line)

            text = line['document_text'].split(' ')
            question = line['question_text']
            example_id = line['example_id']

            for candidate in line['long_answer_candidates']:
                start = candidate['start_token']
                end = candidate['end_token']

                processed_rows.append({
                    'text': " ".join(text[start:end]),
                    'question': question,
                    'example_id': example_id,
                    'sequence': f'{start}:{end}'

                })

        test = pd.DataFrame(processed_rows)
    
    return test

In [5]:
def compute_text_and_questions(train, test, tokenizer):
    train_text = tokenizer.texts_to_sequences(train.text.values)
    train_questions = tokenizer.texts_to_sequences(train.question.values)
    test_text = tokenizer.texts_to_sequences(test.text.values)
    test_questions = tokenizer.texts_to_sequences(test.question.values)
    
    train_text = sequence.pad_sequences(train_text, maxlen=300)
    train_questions = sequence.pad_sequences(train_questions)
    test_text = sequence.pad_sequences(test_text, maxlen=300)
    test_questions = sequence.pad_sequences(test_questions)
    
    return train_text, train_questions, test_text, test_questions

In [6]:
def build_embedding_matrix(tokenizer, path):
    embedding_matrix = np.zeros((tokenizer.num_words + 1, 300))
    ft_model = fasttext.load_model(path)

    for word, i in tokenizer.word_index.items():
        if i >= tokenizer.num_words - 1:
            break
        embedding_matrix[i] = ft_model.get_word_vector(word)
    
    return embedding_matrix

In [7]:
def build_model(embedding_matrix):
    embedding = Embedding(
        *embedding_matrix.shape, 
        weights=[embedding_matrix], 
        trainable=False, 
        mask_zero=True
    )
    
    q_in = Input(shape=(None,))
    q = embedding(q_in)
    q = SpatialDropout1D(0.2)(q)
    q = Bidirectional(LSTM(100, return_sequences=True))(q)
    q = GlobalMaxPooling1D()(q)
    
    
    t_in = Input(shape=(None,))
    t = embedding(t_in)
    t = SpatialDropout1D(0.2)(t)
    t = Bidirectional(LSTM(150, return_sequences=True))(t)
    t = GlobalMaxPooling1D()(t)
    
    hidden = concatenate([q, t])
    hidden = Dense(300, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(300, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)
    
    out1 = Dense(1, activation='sigmoid')(hidden)
    
    model = Model(inputs=[t_in, q_in], outputs=out1)
    model.compile(loss='binary_crossentropy', optimizer='adam')

    return model

In [8]:
train = build_train('../input/tensorflow2-question-answering/simplified-nq-train.jsonl')
test = build_test('../input/tensorflow2-question-answering/simplified-nq-test.jsonl')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=0.0, max=200000.0), HTML(value='')))




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




# Preprocessing

In [9]:
tokenizer = text.Tokenizer(lower=False, num_words=80000)

for text in tqdm([train.text, test.text, train.question, test.question]):
    tokenizer.fit_on_texts(text.values)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [10]:
train_target = train.is_long_answer.astype(int).values

In [11]:
train_text, train_questions, test_text, test_questions = compute_text_and_questions(train, test, tokenizer)
del train

# Modelling

In [12]:
path = '/kaggle/input/fasttext-crawl-300d-2m-with-subword/crawl-300d-2m-subword/crawl-300d-2M-subword.bin'
embedding_matrix = build_embedding_matrix(tokenizer, path)



In [13]:
model = build_model(embedding_matrix)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    24000300    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, None, 300)    0           embedding[0][0]       

In [14]:
# train_history = model.fit(
#     [train_text, train_questions], 
#     train_target,
#     epochs=2,
#     validation_split=0.2,
#     batch_size=1024
# )

# Save Model

In [15]:
# # saving
# with open('tokenizer.pickle', 'wb') as handle:
#     pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# model.save('model.h5')

# Testing

In [16]:
directory = '/kaggle/input/tensorflow2-question-answering/'
test_path = directory + 'simplified-nq-test.jsonl'
test = build_test(test_path)
submission = pd.read_csv("../input/tensorflow2-question-answering/sample_submission.csv")

test.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




Unnamed: 0,text,question,example_id,sequence
0,"<Table> <Tr> <Th_colspan=""2""> High Commission ...",who is the south african high commissioner in ...,-1220107454853145579,18:136
1,"<Tr> <Th_colspan=""2""> High Commission of South...",who is the south african high commissioner in ...,-1220107454853145579,19:30
2,<Tr> <Th> Location </Th> <Td> Trafalgar Square...,who is the south african high commissioner in ...,-1220107454853145579,34:45
3,<Tr> <Th> Address </Th> <Td> Trafalgar Square ...,who is the south african high commissioner in ...,-1220107454853145579,45:59
4,<Tr> <Th> Coordinates </Th> <Td> 51 ° 30 ′ 30 ...,who is the south african high commissioner in ...,-1220107454853145579,59:126


In [17]:
# Load text and tokenizer

model = load_model('../input/temporarydata/model.h5')
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 300)    24000300    input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d (SpatialDropo (None, None, 300)    0           embedding[0][0]       

In [18]:
with open('../input/temporarydata/tokenizer.pickle', 'rb') as f:
    tokenizer = pickle.load(f)

In [19]:
def compute_text_and_questions2(test, tokenizer):
    test_text = tokenizer.texts_to_sequences(test.text.values)
    test_questions = tokenizer.texts_to_sequences(test.question.values)
    
    test_text = sequence.pad_sequences(test_text, maxlen=300)
    test_questions = sequence.pad_sequences(test_questions)
    
    return test_text, test_questions

In [20]:
test_text, test_questions = compute_text_and_questions2(test, tokenizer)

In [21]:
del test['text']
del test['question']


In [22]:
%%time
test_target = model.predict([test_text, test_questions], batch_size=512)

CPU times: user 26.8 s, sys: 3.62 s, total: 30.4 s
Wall time: 22.8 s


In [23]:
test['target'] = test_target

result = (
    test.query('target > 0.3')
    .groupby('example_id')
    .max()
    .reset_index()
#    .loc[:, ['example_id', 'PredictionString']]
)

result.head()


Unnamed: 0,example_id,sequence,target
0,-1074129516932871805,2604:2734,0.384987
1,-1114334749483663139,744:3809,0.637711
2,-1152268629614456016,317:367,0.32382
3,-1220107454853145579,141:211,0.666362
4,-1316307078555615068,236:320,0.434887


In [24]:
result = pd.concat([
    result.assign(example_id=lambda example_id: example_id + '_long'),
    result.assign(example_id=lambda example_id: example_id + '_short')
])

UFuncTypeError: ufunc 'add' did not contain a loop with signature matching types (dtype('<U32'), dtype('<U32')) -> dtype('<U32')

In [25]:
result

Unnamed: 0,example_id,sequence,target
0,-1074129516932871805,2604:2734,0.384987
1,-1114334749483663139,744:3809,0.637711
2,-1152268629614456016,317:367,0.323820
3,-1220107454853145579,141:211,0.666362
4,-1316307078555615068,236:320,0.434887
...,...,...,...
217,9204032098950736962,936:1551,0.533207
218,9212083134098244596,84:138,0.365736
219,930196817123445627,3063:6474,0.519008
220,934950704129184964,6746:7197,0.666879
