In [1]:
import sys
sys.path.insert(0, '..')

In [2]:
import json
import numpy as np
import os
import preprocessing.constants as constants
import re
import spacy
import time

from preprocessing.dataset_files_saver import *
from preprocessing.dataset_files_wrapper import *
from preprocessing.file_util import *
from preprocessing.raw_training_data import *
from preprocessing.spacy_util import create_tokenizer
from preprocessing.string_category import *
from preprocessing.vocab import get_vocab
from util.string_util import *

In [3]:
_BOS = "bos"
_EOS = "eos"

_DEBUG_USE_ONLY_FIRST_ARTICLE = False


In [4]:
# Note: Some of the training/dev data seems to be inaccurate. This code
# tries to make sure that at least one of the "qa" options in the acceptable
# answers list is accurate and includes it in the data set.

In [5]:
class TextPosition:
    def __init__(self, start_idx, end_idx):
        self.start_idx = start_idx
        self.end_idx = end_idx

In [6]:
class PassageContext:
    '''Class used to save the tokenization positions in a given passage
       so that the original strings can be used for constructing answer
       spans rather than joining tokenized strings, which isn't 100% correct.
    '''
    def __init__(self, passage_str, word_id_to_text_positions,
        acceptable_gnd_truths):
        self.passage_str = passage_str
        self.word_id_to_text_positions = word_id_to_text_positions
        self.acceptable_gnd_truths = acceptable_gnd_truths

## Train Data, Dev Data 생성의 결과물을 저장할 파일들 생성

In [7]:
data_dir = "../data"
download_dir = "../downloads"
value_idx = 0
question_id = 0
ner_categories = StringCategory()
pos_categories = StringCategory()

In [8]:
constants.TRAIN_FOLDER_NAME

'train'

In [9]:
constants.DEV_FOLDER_NAME

'dev'

In [26]:
train_folder = os.path.join(data_dir, constants.TRAIN_FOLDER_NAME)
dev_folder = os.path.join(data_dir, constants.DEV_FOLDER_NAME)

In [12]:
constants.QUESTION_FILE_PATTERN

'question.%d.npy'

In [13]:
constants.CONTEXT_FILE_PATTERN

'context.%d.npy'

In [14]:
constants.SPAN_FILE_PATTERN

'span.%d.npy'

In [15]:
constants.WORD_IN_QUESTION_FILE_PATTERN

'word_in_question.%d.npy'

In [16]:
constants.WORD_IN_CONTEXT_FILE_PATTERN

'word_in_context.%d.npy'

In [17]:
constants.QUESTION_IDS_FILE_PATTERN

'question_ids.%d.npy'

In [18]:
constants.QUESTION_IDS_TO_GND_TRUTHS_FILE_PATTERN

'question_ids_to_gnd_truths.%d'

In [19]:
constants.CONTEXT_POS_FILE_PATTERN

'context.pos.%d.npy'

In [20]:
constants.QUESTION_POS_FILE_PATTERN

'question.pos.%d.npy'

In [21]:
constants.CONTEXT_NER_FILE_PATTERN

'context.ner.%d.npy'

In [22]:
constants.QUESTION_NER_FILE_PATTERN

'question.ner.%d.npy'

In [23]:
constants.QUESTION_IDS_TO_SQUAD_QUESTION_ID_FILE_PATTERN

'question_ids_to_squad_question_id.%d'

In [24]:
constants.QUESTION_IDS_TO_PASSAGE_CONTEXT_FILE_PATTERN

'passage_context.%d'

In [27]:
train_files_wrapper = DatasetFilesWrapper(train_folder)
dev_files_wrapper = DatasetFilesWrapper(dev_folder)

if all([len(os.listdir(f)) > 0 for f in [train_folder, dev_folder]]):
    print("Train & dev data already exist.")
    #return

Train & dev data already exist.


## Vocabulary 생성

In [30]:
print("Getting vocabulary")
vocab = get_vocab(data_dir)
print("Finished getting vocabulary")


Getting vocabulary
Vocab size: 2196016
Finished getting vocabulary


## Tokenizer 생성

In [31]:
nlp = spacy.load("en")

spacy 는 NLTK 와 같은 토크나이저 라이브러리... 참고 http://yujuwon.tistory.com/entry/spaCy-%EC%82%AC%EC%9A%A9%ED%95%98%EA%B8%B0

spacy 가 가진 장점이 있지만...커버링하지 못하는 부분이 여전히 존재하여...커스터마이징이 필요한 부분이 있음.
spacy_util.py 주석 참고...

    # The following way of definining unicode characters that should be
    # tokenized is super ugly and I would hope that it can be improved.
    # But it is better than not doing it because spacy's tokenizer won't break
    # on these  weird characters when it should.
    # To get this list, I scraped the train dataset for all unicode-looking
    # things.

In [32]:
tokenizer = create_tokenizer(nlp)
nlp.tokenizer = tokenizer


In [33]:
constants.DEV_SQUAD_FILE

'dev-v1.1.json'

## Train Data 생성

In [35]:
"""Returns (contexts, word_in_question, questions, word_in_context, spans)
    contexts: list of lists of integer word ids
    word_in_question: list of lists of booleans indicating whether each
        word in the context is present in the question
    questions: list of lists of integer word ids
    word_in_context: list of lists of booleans indicating whether each
        word in the question is present in the context
    spans: numpy array of shape (num_samples, 2)
    question_ids: a list of ints that indicates which question the
        given sample is part of. this has the same length as
        |contexts| and |questions|. multiple samples may come from
        the same question because there are potentially multiple valid
        answers for the same question
"""

'Returns (contexts, word_in_question, questions, word_in_context, spans)\n    contexts: list of lists of integer word ids\n    word_in_question: list of lists of booleans indicating whether each\n        word in the context is present in the question\n    questions: list of lists of integer word ids\n    word_in_context: list of lists of booleans indicating whether each\n        word in the question is present in the context\n    spans: numpy array of shape (num_samples, 2)\n    question_ids: a list of ints that indicates which question the\n        given sample is part of. this has the same length as\n        |contexts| and |questions|. multiple samples may come from\n        the same question because there are potentially multiple valid\n        answers for the same question\n'

In [81]:
def _parse_data_from_tokens_list(tokens_list, tokens_ner_dict):
    """Input: A spaCy doc.

       Ouptut: (vocab_ids_list, vocab_ids_set, pos_list, ner_list)
    """
    vocab_ids_list = []
    vocab_ids_set = set()
    pos_list = []
    ner_list = []
    for zz in range(len(tokens_list)):
        token = tokens_list[zz]
        vocab_id = None
        token_pos = None
        token_ner = None
        if not isinstance(token, spacy.tokens.token.Token) and token == _BOS:
            vocab_id = vocab.BOS_ID
            token_pos = "bos"
            token_ner = "bos"
        elif not isinstance(token, spacy.tokens.token.Token) and token == _EOS:
            vocab_id = vocab.EOS_ID
            token_pos = "eos"
            token_ner = "eos"
        else:
            word = token.text
            vocab_id = vocab.get_id_for_word(word)
            token_pos = token.pos_
            token_ner = tokens_ner_dict[token.idx].label_ \
                if token.idx in tokens_ner_dict else "none"
            vocab_ids_set.add(vocab_id)
        vocab_ids_list.append(vocab_id)
        pos_list.append(pos_categories.get_id_for_word(token_pos))
        ner_list.append(ner_categories.get_id_for_word(token_ner))
    return vocab_ids_list, vocab_ids_set, pos_list, ner_list

In [82]:
def _maybe_add_samples(value_idx, tok_context=None, tok_question=None, qa=None,
    ctx_offset_dict=None, ctx_end_offset_dict=None, list_contexts=None,
    list_word_in_question=None, list_questions=None,
    list_word_in_context=None, spans=None, num_values=None,
    question_ids=None,
    context_pos=None,
    question_pos=None, context_ner=None, question_ner=None,
    is_dev=None, ctx_ner_dict=None, qst_ner_dict=None,
    psg_ctx=None):
    first_answer = True
    for answer in qa["answers"]:
        answer_start = answer["answer_start"]
        text = answer["text"]
        answer_end = answer_start + len(text)
        tok_start = None
        tok_end = None
        exact_match = answer_start in ctx_offset_dict and answer_end in ctx_end_offset_dict
        if not exact_match:
            # Sometimes, the given answer isn't actually in the context.
            # If so, find the smallest surrounding text instead.
            for z in range(len(tok_context)):
                tok = tok_context[z]
                if not isinstance(tok, spacy.tokens.token.Token):
                    continue
                st = tok.idx
                end = st + len(tok.text)
                if st <= answer_start and answer_start <= end:
                    tok_start = tok
                    if z == len(tok_context) - 2:
                        tok_end = tok
                elif tok_start is not None:
                    tok_end = tok
                    if end >= answer_end:
                        break
        tok_start = tok_start if tok_start is not None else ctx_offset_dict[answer_start]
        tok_end = tok_end if tok_end is not None else ctx_end_offset_dict[answer_end]
        tok_start_idx, tok_end_idx = None, None
        for z in range(len(tok_context)):
            tok = tok_context[z]
            if not isinstance(tok, spacy.tokens.token.Token): # BOS, EOS
                continue
            if tok == tok_start:
                tok_start_idx = z
            if tok == tok_end:
                tok_end_idx = z
            if tok_start_idx is not None and tok_end_idx is not None:
                break
        assert(tok_start_idx is not None)
        assert(tok_end_idx is not None)
        # For dev, only keep one exmaple per question, and the set of all
        # acceptable answers. This reduces the required memory for storing
        # data.
        if is_dev and not first_answer:
            continue
        first_answer = False

        spans.append([tok_start_idx, tok_end_idx])
        question_ids.append(question_id)

        ctx_vocab_ids_list, ctx_vocab_ids_set, \
            ctx_pos_list, ctx_ner_list = \
            _parse_data_from_tokens_list(tok_context, ctx_ner_dict)
        list_contexts.append(ctx_vocab_ids_list)
        context_pos.append(ctx_pos_list)
        context_ner.append(ctx_ner_list)

        qst_vocab_ids_list, qst_vocab_ids_set, \
            qst_pos_list, qst_ner_list = \
            _parse_data_from_tokens_list(tok_question, qst_ner_dict)
        list_questions.append(qst_vocab_ids_list)
        question_pos.append(qst_pos_list)
        question_ner.append(qst_ner_list)

        word_in_question_list = [1 if word_id in qst_vocab_ids_set else 0 for word_id in ctx_vocab_ids_list]
        word_in_context_list = [1 if word_id in ctx_vocab_ids_set else 0 for word_id in qst_vocab_ids_list]
        list_word_in_question.append(word_in_question_list)
        list_word_in_context.append(word_in_context_list)
        print("Value", value_idx, "of", num_values, "percent done",
              100 * float(value_idx) / float(num_values), end="\r")
        value_idx += 1

In [83]:
def _get_num_data_values(dataset):
    numb_values = 0
    for article in dataset:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                numb_values += 1
    return numb_values

In [84]:
def _get_ner_dict(doc):
    d = {}
    for e in doc.ents:
        d[e.start_char] = e
    return d

In [85]:
data_file = constants.DEV_SQUAD_FILE
is_dev = True

In [86]:
filename = os.path.join(download_dir, data_file)
print("Reading data from file", filename)
with open(filename) as data_file: 
    data = json.load(data_file)
    dataset = data["data"]
    print("length of data : " , str(len(dataset)))
    num_values = _get_num_data_values(dataset)
    print("Squad DEV DataSet file()" , filename + ") " , "num_values :" , num_values)
    spans = []
    list_contexts = []
    list_word_in_question = []
    list_questions = []
    list_word_in_context = []
    question_ids = []
    context_pos = []
    question_pos = []
    context_ner = []
    question_ner = []
    question_ids_to_squad_question_id = {}
    question_ids_to_passage_context = {}
    value_idx = 0
    for dataset_id in range(len(dataset)):
        if dataset_id > 0 and _DEBUG_USE_ONLY_FIRST_ARTICLE:
            break
        article = dataset[dataset_id]
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            tok_context = nlp(context)
            tok_contexts_with_bos_and_eos = []
            ctx_ner_dict = _get_ner_dict(tok_context)
            assert tok_context is not None
            ctx_offset_dict = {}
            ctx_end_offset_dict = {}
            word_idx_to_text_position = {}

            word_idx = 0
            for sentence in tok_context.sents:
                tok_contexts_with_bos_and_eos.append(_BOS)
                word_idx_to_text_position[word_idx] = \
                    TextPosition(0, 0)
                word_idx += 1
                for token in sentence:
                    tok_contexts_with_bos_and_eos.append(token)
                    st = token.idx
                    end = token.idx + len(token.text)
                    ctx_offset_dict[st] = token
                    ctx_end_offset_dict[end] = token
                    word_idx_to_text_position[word_idx] = \
                        TextPosition(st, end)
                    word_idx += 1
                tok_contexts_with_bos_and_eos.append(_EOS)
                word_idx_to_text_position[word_idx] = \
                    TextPosition(0, 0)
                word_idx += 1

#                    word_idx = 0
#                    tok_contexts_with_bos_and_eos.append(_BOS)
#                    word_idx_to_text_position[word_idx] = \
#                        TextPosition(0, 0)
#                    word_idx += 1
#                    for token in tok_context:
#                        tok_contexts_with_bos_and_eos.append(token)
#                        st = token.idx
#                        end = token.idx + len(token.text)
#                        ctx_offset_dict[st] = token
#                        ctx_end_offset_dict[end] = token
#                        word_idx_to_text_position[word_idx] = \
#                            TextPosition(st, end)
#                        word_idx += 1
#                    tok_contexts_with_bos_and_eos.append(_EOS)
#                    word_idx_to_text_position[word_idx] = \
#                        TextPosition(0, 0)

            for qa in paragraph["qas"]:
                question_id += 1
                acceptable_gnd_truths = []
                for answer in qa["answers"]:
                    acceptable_gnd_truths.append(answer["text"])
                question_ids_to_passage_context[question_id] = \
                    PassageContext(context, word_idx_to_text_position,
                        acceptable_gnd_truths)
                question = qa["question"]
                squad_question_id = qa["id"]
                assert squad_question_id is not None
                question_ids_to_squad_question_id[question_id] = \
                    squad_question_id
                tok_question = nlp(question)
                tok_question_with_bos_and_eos = []

                for sentence in tok_question.sents:
                    tok_question_with_bos_and_eos.append(_BOS)
                    for token in sentence:
                        tok_question_with_bos_and_eos.append(token)
                    tok_question_with_bos_and_eos.append(_EOS)

#                        tok_question_with_bos_and_eos.append(_BOS)
#                        for token in tok_question:
#                            tok_question_with_bos_and_eos.append(token)
#                        tok_question_with_bos_and_eos.append(_EOS)

                qst_ner_dict = _get_ner_dict(tok_question)
                assert tok_question is not None
                found_answer_in_context = False
                found_answer_in_context = _maybe_add_samples(
                    value_idx,
                    tok_context=tok_contexts_with_bos_and_eos,
                    tok_question=tok_question_with_bos_and_eos, qa=qa,
                    ctx_offset_dict=ctx_offset_dict,
                    ctx_end_offset_dict=ctx_end_offset_dict,
                    list_contexts=list_contexts,
                    list_word_in_question=list_word_in_question,
                    list_questions=list_questions,
                    list_word_in_context=list_word_in_context,
                    spans=spans, num_values=num_values,
                    question_ids=question_ids,
                    context_pos=context_pos, question_pos=question_pos,
                    context_ner=context_ner, question_ner=question_ner,
                    is_dev=is_dev,
                    ctx_ner_dict=ctx_ner_dict,
                    qst_ner_dict=qst_ner_dict,
                    psg_ctx=question_ids_to_passage_context[question_id])
    print("")
    spans = np.array(spans[:value_idx], dtype=np.int32)
""" 
    return RawTrainingData(
        list_contexts = list_contexts,
        list_word_in_question = list_word_in_question,
        list_questions = list_questions,
        list_word_in_context = list_word_in_context,
        spans = spans,
        question_ids = question_ids,
        context_pos = context_pos,
        question_pos = question_pos,
        context_ner = context_ner,
        question_ner = question_ner,
        question_ids_to_squad_question_id = question_ids_to_squad_question_id,
        question_ids_to_passage_context = question_ids_to_passage_context)
"""

Reading data from file ../downloads/dev-v1.1.json
length of data :  48
Squad DEV DataSet file() ../downloads/dev-v1.1.json)  num_values : 10570
Value 0 of 10570 percent done 0.0 of 10570 percent done 0.0 of 10570 percent done 0.0of 10570 percent done 0.00.0


' \n    return RawTrainingData(\n        list_contexts = list_contexts,\n        list_word_in_question = list_word_in_question,\n        list_questions = list_questions,\n        list_word_in_context = list_word_in_context,\n        spans = spans,\n        question_ids = question_ids,\n        context_pos = context_pos,\n        question_pos = question_pos,\n        context_ner = context_ner,\n        question_ner = question_ner,\n        question_ids_to_squad_question_id = question_ids_to_squad_question_id,\n        question_ids_to_passage_context = question_ids_to_passage_context)\n'

In [87]:
print("Getting DEV dataset")
dev_raw_data = self._create_train_data_internal(
    constants.DEV_SQUAD_FILE, is_dev=True)


Getting DEV dataset


NameError: name 'self' is not defined

In [None]:
print("Getting TRAIN dataset")
train_raw_data = self._create_train_data_internal(
    constants.TRAIN_SQUAD_FILE, is_dev=False)
print("Num NER categories", self.ner_categories.get_num_categories())
print("Num POS categories", self.pos_categories.get_num_categories())

max_context_length = max(
        max([len(x) for x in train_raw_data.list_contexts]),
        max([len(x) for x in dev_raw_data.list_contexts]))

max_question_length = max(
        max([len(x) for x in train_raw_data.list_questions]),
        max([len(x) for x in dev_raw_data.list_questions]))

print("Saving TRAIN data")
train_file_saver = DatasetFilesSaver(
        train_files_wrapper,
        max_context_length,
        max_question_length,
        self.vocab,
        train_raw_data)
train_file_saver.save()

print("Saving DEV data")
dev_file_saver = DatasetFilesSaver(
        dev_files_wrapper,
        max_context_length,
        max_question_length,
        self.vocab,
        dev_raw_data)
dev_file_saver.save()

print("Finished creating training data!")