In [1]:
import os
import json
import gc
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm


In [2]:
import os
import json
import gc
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm

def build_train(train_path, n_rows=200000, sampling_rate=15):
    """Build a down-sampled long-answer training frame from a JSON-lines file.

    Reads at most ``n_rows`` lines of ``train_path``. For every long-answer
    candidate of an example, one row is emitted when the candidate is the
    annotated long answer, or when its position in the candidate list is a
    multiple of ``sampling_rate``.

    Args:
        train_path: Path to the JSON-lines training file.
        n_rows: Maximum number of lines (examples) to read.
        sampling_rate: Keep every ``sampling_rate``-th candidate in addition
            to the annotated one.

    Returns:
        pandas.DataFrame with the columns ``text``, ``is_long_answer``,
        ``question`` and ``annotation_id``; one row per kept candidate.
        The columns are present even when no row was produced.
    """
    columns = ['text', 'is_long_answer', 'question', 'annotation_id']
    processed_rows = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(train_path, encoding='utf-8') as f:
        for _ in tqdm(range(n_rows)):
            raw_line = f.readline()
            if not raw_line:
                # End of file reached before n_rows lines were read.
                break

            example = json.loads(raw_line)

            tokens = example['document_text'].split(' ')
            question = example['question_text']
            # Only the first annotation of each example is used.
            annotations = example['annotations'][0]

            # The candidate position gets its own name so that it does not
            # shadow the variable of the enclosing row loop.
            for cand_idx, candidate in enumerate(example['long_answer_candidates']):
                label = cand_idx == annotations['long_answer']['candidate_index']

                start = candidate['start_token']
                end = candidate['end_token']

                if label or (cand_idx % sampling_rate == 0):
                    processed_rows.append({
                        'text': " ".join(tokens[start:end]),
                        'is_long_answer': label,
                        'question': question,
                        'annotation_id': annotations['annotation_id']
                    })

    return pd.DataFrame(processed_rows, columns=columns)
    
def build_test(test_path):
    """Build the long-answer candidate frame for every example of a test file.

    Unlike the training loader, nothing is sampled: every long-answer
    candidate of every line in ``test_path`` becomes one row.

    Args:
        test_path: Path to the JSON-lines test file.

    Returns:
        pandas.DataFrame with the columns ``text``, ``question``,
        ``example_id`` and ``sequence`` (the candidate span formatted as
        ``"start_token:end_token"``). The columns are present even when the
        file yields no row.
    """
    columns = ['text', 'question', 'example_id', 'sequence']
    processed_rows = []

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(test_path, encoding='utf-8') as f:
        for raw_line in tqdm(f):
            example = json.loads(raw_line)

            tokens = example['document_text'].split(' ')
            question = example['question_text']
            example_id = example['example_id']

            for candidate in example['long_answer_candidates']:
                start = candidate['start_token']
                end = candidate['end_token']

                processed_rows.append({
                    'text': " ".join(tokens[start:end]),
                    'question': question,
                    'example_id': example_id,
                    'sequence': f'{start}:{end}'
                })

    return pd.DataFrame(processed_rows, columns=columns)


# Input locations: both JSON-lines files live directly under `directory`.
directory = '../input/'
train_path = f'{directory}simplified-nq-train.jsonl'
test_path = f'{directory}simplified-nq-test.jsonl'

# Parse the files into one-row-per-candidate DataFrames.
train = build_train(train_path)
test = build_test(test_path)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=200000), HTML(value='')))




Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [3]:
train.head(10)

Unnamed: 0,text,is_long_answer,question,annotation_id
0,<Table> <Tr> <Td> </Td> <Td> ( hide ) This art...,False,which is the most common use of opt-in e-mail ...,593165450220027640
1,<Tr> <Td> <Ul> <Li> Pay - per - click </Li> <L...,False,which is the most common use of opt-in e-mail ...,593165450220027640
2,<P> Email marketing has evolved rapidly alongs...,False,which is the most common use of opt-in e-mail ...,593165450220027640
3,<Li> Advertisers can reach substantial numbers...,False,which is the most common use of opt-in e-mail ...,593165450220027640
4,<P> A common example of permission marketing i...,True,which is the most common use of opt-in e-mail ...,593165450220027640
5,<P> The CAN - SPAM Act of 2003 was passed by C...,False,which is the most common use of opt-in e-mail ...,593165450220027640
6,"<Table> <Tr> <Th_colspan=""2""> Tracy McConnell ...",False,how i.met your mother who is the mother,12034874153783787365
7,"<P> Tracy McConnell , better known as `` The M...",True,how i.met your mother who is the mother,12034874153783787365
8,"<P> In `` Bass Player Wanted '' , the Mother p...",False,how i.met your mother who is the mother,12034874153783787365
9,<Table> <Tr> <Td> Part of a series on </Td> </...,False,what type of fertilisation takes place in humans,10527123009892725162


In [4]:
from tensorflow.keras.backend import clear_session
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate, Masking
from tensorflow.keras.layers import LSTM, Bidirectional, GlobalMaxPooling1D, Dropout
from tensorflow.keras.preprocessing import text, sequence
from tqdm import tqdm_notebook as tqdm
import fasttext

# 数据处理

Tokenize

In [5]:
def compute_text_and_questions(train, test, tokenizer, text_maxlen=500, question_maxlen=None):
    """Convert the text and question columns into padded integer sequences.

    Args:
        train: Object exposing ``text.values`` and ``question.values``
            (the training DataFrame in this notebook).
        test: Same as ``train`` for the test set.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        text_maxlen: ``maxlen`` passed to ``pad_sequences`` for the candidate
            texts. Defaults to 500, the value previously hard-coded here.
        question_maxlen: ``maxlen`` passed to ``pad_sequences`` for the
            questions. ``None`` (the default) leaves the length to
            ``pad_sequences``, as before.

    Returns:
        Tuple ``(train_text, train_questions, test_text, test_questions)``.
    """
    def _encode(texts, maxlen):
        # Tokenize, then pad with the requested maximum length.
        return sequence.pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=maxlen)

    train_text = _encode(train.text.values, text_maxlen)
    train_questions = _encode(train.question.values, question_maxlen)
    test_text = _encode(test.text.values, text_maxlen)
    test_questions = _encode(test.question.values, question_maxlen)

    return train_text, train_questions, test_text, test_questions

## 训练出一个Tokenizer

In [6]:
# Case-sensitive tokenizer whose vocabulary is capped via num_words=80000.
tokenizer = text.Tokenizer(lower=False, num_words=80000)

# Fit on every text column of both splits so they share one word index.
# The loop variable must not be named `text`: that would rebind the imported
# keras `text` module and make this cell fail when it is executed again.
for corpus in tqdm([train.text, test.text, train.question, test.question]):
    tokenizer.fit_on_texts(corpus.values)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [7]:
# Binary training target: the boolean is_long_answer column cast to 0/1 integers.
train_target = train.is_long_answer.astype(int).values

In [8]:
# Turn the text and question columns of both splits into padded integer arrays.
train_text, train_questions, test_text, test_questions = compute_text_and_questions(train, test, tokenizer)
# Drop the training DataFrame once its arrays and targets have been extracted.
# NOTE(review): any cell that reads `train` cannot be re-run after this
# without executing the loading cell again.
del train

# 词向量

这里使用的是fasttext做权重，其他的也可以
https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip

In [9]:
# Location of the fastText binary model handed to build_embedding_matrix.
# NOTE(review): the download link in the preceding markdown is a .zip archive;
# presumably it has to be extracted to this .bin file first — confirm.
path = directory+'crawl-300d-2M-subword.bin'


In [10]:
def build_embedding_matrix(tokenizer, path):
    """Create the embedding weight matrix for the tokenizer's vocabulary.

    Row ``i`` holds the fastText vector of the word whose tokenizer index is
    ``i``. Row 0 (the padding index) and rows of indices that the tokenizer
    never emits stay all-zero.

    Args:
        tokenizer: Fitted tokenizer exposing ``num_words`` and ``word_index``.
        path: Path of the fastText binary model to load.

    Returns:
        numpy.ndarray of shape ``(num_words + 1, 300)``.
    """
    embedding_dim = 300

    num_words = tokenizer.num_words
    if num_words is None:
        # No vocabulary cap configured: every indexed word can be emitted.
        num_words = len(tokenizer.word_index) + 1

    embedding_matrix = np.zeros((num_words + 1, embedding_dim))
    ft_model = fasttext.load_model(path)

    for word, i in tokenizer.word_index.items():
        # The tokenizer emits indices 1 .. num_words - 1, so only indices
        # >= num_words are skipped. The previous `num_words - 1` bound left
        # the last emitted word with an all-zero vector.
        # `continue` instead of `break` avoids relying on word_index order.
        if i >= num_words:
            continue
        embedding_matrix[i] = ft_model.get_word_vector(word)

    return embedding_matrix

In [11]:
embedding_matrix = build_embedding_matrix(tokenizer, path)




In [12]:
def build_model(embedding_matrix):
    """Assemble and compile the two-branch question/text classifier.

    Both inputs go through one shared, frozen embedding layer initialised
    from ``embedding_matrix``. Each branch applies spatial dropout, a
    bidirectional LSTM and global max pooling. The pooled vectors are
    concatenated and passed through two dense+dropout stages to a single
    sigmoid output.

    Args:
        embedding_matrix: 2-D weight array; its shape supplies the first two
            positional arguments of ``Embedding``.

    Returns:
        Compiled Keras ``Model`` taking ``[text, question]`` inputs, in that
        order.
    """
    # The shape tuple is unpacked into Embedding's first two positional
    # arguments (vocabulary size, vector size).
    vocab_size, vector_size = embedding_matrix.shape
    embedding = Embedding(
        vocab_size,
        vector_size,
        weights=[embedding_matrix],
        trainable=False,
        mask_zero=True
    )

    def _pooled_branch(lstm_units):
        # One variable-length input encoded down to a fixed-size vector.
        tokens_in = Input(shape=(None,))
        encoded = embedding(tokens_in)
        encoded = SpatialDropout1D(0.2)(encoded)
        encoded = Bidirectional(LSTM(lstm_units, return_sequences=True))(encoded)
        return tokens_in, GlobalMaxPooling1D()(encoded)

    # The question branch is built first, then the text branch.
    q_in, q = _pooled_branch(100)
    t_in, t = _pooled_branch(150)

    hidden = concatenate([q, t])
    for _ in range(2):
        hidden = Dense(300, activation='relu')(hidden)
        hidden = Dropout(0.5)(hidden)

    out1 = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[t_in, q_in], outputs=out1)
    model.compile(loss='binary_crossentropy', optimizer='adam',metrics=['accuracy'])

    return model

In [23]:
!pip install keras
!pip install scikit-learn
!pip install pydot

Collecting pydot
  Using cached https://files.pythonhosted.org/packages/33/d1/b1479a770f66d962f545c2101630ce1d5592d90cb4f083d38862e93d16d2/pydot-1.4.1-py2.py3-none-any.whl
Collecting pyparsing>=2.1.4
[?25l  Downloading https://files.pythonhosted.org/packages/c0/0c/fc2e007d9a992d997f04a80125b0f183da7fb554f1de701bbb70a8e7d479/pyparsing-2.4.5-py2.py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 7.5MB/s  eta 0:00:01
[?25hInstalling collected packages: pyparsing, pydot
Successfully installed pydot-1.4.1 pyparsing-2.4.5


In [25]:
import keras
import pydot as pyd
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
keras.utils.vis_utils.pydot = pyd
def visualize_model(model):
    """Return an IPython ``SVG`` of the graph that ``model_to_dot`` builds for ``model``."""
    graph = model_to_dot(model)
    svg_data = graph.create(prog='dot', format='svg')
    return SVG(svg_data)

In [31]:
def get_model():
    """Build a new model from the global ``embedding_matrix``, print its summary and return it."""
    fresh_model = build_model(embedding_matrix)
    fresh_model.summary()
    return fresh_model

In [32]:
# train_history = model.fit(
#     [train_text, train_questions], 
#     train_target,
#     epochs=2,
#     validation_split=0.02,
#     batch_size=1024
# )

In [33]:
from sklearn.model_selection import StratifiedKFold

In [34]:
# Number of cross-validation folds.
n_splites=5
# Stratified splitter; shuffling with a fixed random_state keeps the folds reproducible.
skf = StratifiedKFold(n_splits=n_splites,random_state=666,shuffle=True)

In [None]:
# Accumulates the SUM of the per-fold test predictions, one value per test row.
# Divide by n_splites to obtain the mean prediction.
test_result=np.zeros(len(test))
for train_index, _ in skf.split(train_text, train_target):
    # Only the training part of each split is used; Keras holds out its own
    # 2% validation slice through validation_split below.
    ans_,qus_=train_text[train_index],train_questions[train_index]
    label = train_target[train_index]
    model=get_model()
    train_history = model.fit(
        [ans_, qus_], 
        label,
        epochs=2,
        validation_split=0.02,
        batch_size=1024
    )
    # The model ends in Dense(1), so predict yields one column per row.
    # Flatten it: adding an (n, 1) array in place to the (n,) accumulator
    # raises a broadcasting error.
    fold_pred = model.predict([test_text,test_questions]).reshape(-1)
    test_result += fold_pred
    # Reset Keras state before the next fold builds a new model
    # (this call was misspelled `cleal_session`, a NameError).
    clear_session()
    

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, None, 300)    24000300    input_3[0][0]                    
                                                                 input_4[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_2 (SpatialDro (None, None, 300)    0           embedding_1[0][0]          