In [1]:
import sys
import utils
import dataset_utils
import os
from tqdm import tqdm_notebook
import random
import nltk
import argparse
import pdb
from glob import glob
import json

In [2]:
qrels_path = glob('/home/exo/triviaqa/qa_m/*dev*')

In [3]:
import gzip
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from sklearn.metrics import pairwise_distances

In [4]:
def get_text(qads):
    results = [] 
    for qad in qads:
        #print(qad['doc_id'])
        local_file = os.path.join("/home/exo/triviaqa/regularized_evidence_NER/", qad['doc_id']+ '.json.gz')
        
        with gzip.open(local_file, 'r') as f:
            text_file = json.load(f)
            
        results.append(text_file)
        
    return results

def text_to_corpus_format(question, texts):
    p_ids = ['query']
    corpus = [question]
    for text in texts:
        did = text['did']
        #print(did)
        for parag in text['document']:
            p_ids.append(did + '-' + parag['pid'])
            tmp = ''
            for sentence in parag['paragrph'][0]:
                #print(sentence['sentence'])
                tmp += sentence['sentence'] + ' '
            #print(parag['pid'],tmp, '\n')
            corpus.append(tmp)
    return p_ids, corpus

def sort_by_csim(X, p_ids):
    csim = 1 - pairwise_distances(X[0], X[1:], 'cosine')[0]
    tmp = []
    for index, pid in enumerate(p_ids[1:]):
        tmp.append((pid, csim[index]))
    return sorted(tmp, key = lambda x:x[1], reverse=True)

# Test on single example

In [5]:
with open(qrels_path[0], 'r') as f:
    qrel_test = json.load(f)

In [6]:
for single_qrel in qrel_test:
    question = single_qrel['Question_m']
    texts = get_text(single_qrel['EntityPages'])
    p_ids, corpus = text_to_corpus_format(question, texts)
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    result = sort_by_csim(X, p_ids)
        
    
    max_token = 350
    tmp = []
    key_for_chunck = ''
    new_text =  ''
    for key, sim in result:
        if len(new_text.split()) > max_token:
            tmp.append((new_text.lower(), key_for_chunck))
            key_for_chunck = ''
            new_text =  ''
        if sim == 0:
            break
        new_text += corpus[p_ids.index(key)]
        key_for_chunck += key
    
    break
single_qrel, tmp

({'Answer': {'Aliases': ['The Swiss Miss',
    'Martina hingis',
    'Martina Hingisová',
    'Martina Hingis',
    'MartinaHingis',
    'Martina Hingisova',
    'Hingis'],
   'MatchedWikiEntityName': 'Martina Hingis',
   'NormalizedAliases': ['hingis',
    'swiss miss',
    'martina hingis',
    'martina hingisova',
    'martinahingis',
    'martina hingisová'],
   'NormalizedMatchedWikiEntityName': 'martina hingis',
   'NormalizedValue': 'martina hingis',
   'Type': 'WikipediaEntity',
   'Value': 'Martina Hingis'},
  'EntityPages': [{'DocSource': 'TagMe',
    'Filename': 'Tennis.txt',
    'Title': 'Tennis',
    'doc_id': 'dwk_042022'}],
  'Question': 'Melanie Molitor is the mom of which tennis world NO 1?',
  'QuestionId': 'tc_1250',
  'QuestionPartOfVerifiedEval': False,
  'QuestionSource': 'http://www.triviacountry.com/',
  'QuestionVerifiedEvalAttempt': True,
  'Question_m': 'Melanie_Molitor is the mom of which tennis world NO 1?'},
 [('* no ad roger_federer is now considered by m

In [7]:
result

[('dwk_042022-p_063', 0.24254413525586016),
 ('dwk_042022-p_203', 0.18017522450793544),
 ('dwk_042022-p_120', 0.15797679875167803),
 ('dwk_042022-p_041', 0.15611124654674002),
 ('dwk_042022-p_045', 0.15231708889924),
 ('dwk_042022-p_011', 0.14999119960962382),
 ('dwk_042022-p_064', 0.14673357504890938),
 ('dwk_042022-p_084', 0.1410195568187932),
 ('dwk_042022-p_002', 0.1399329168993153),
 ('dwk_042022-p_006', 0.13969979324403714),
 ('dwk_042022-p_123', 0.13694982602248196),
 ('dwk_042022-p_044', 0.13570654811464877),
 ('dwk_042022-p_017', 0.13499803228195284),
 ('dwk_042022-p_099', 0.13441470052160898),
 ('dwk_042022-p_007', 0.13006466647254666),
 ('dwk_042022-p_051', 0.12788385112074985),
 ('dwk_042022-p_000', 0.1268334335133645),
 ('dwk_042022-p_001', 0.12570879681432945),
 ('dwk_042022-p_115', 0.12470717801563114),
 ('dwk_042022-p_101', 0.12265876997165615),
 ('dwk_042022-p_104', 0.12175004370120457),
 ('dwk_042022-p_066', 0.12099539946057047),
 ('dwk_042022-p_208', 0.11972113328253

In [8]:
def add_triple_data(datum, page, domain):
    qad = {'Source': domain}
    for key in ['QuestionId', 'Question', 'Question_m', 'Answer']:
        qad[key] = datum[key]
    for key in page:
        qad[key] = page[key]
    return qad


def get_qad_triples(data):
    qad_triples = []
    for datum in data:
        for key in ['EntityPages', 'SearchResults']:
            for page in datum.get(key, []):
                qad = add_triple_data(datum, page, key)
                qad_triples.append(qad)
    return qad_triples


def convert_to_squad_format(qa_json_file, squad_file):
    with open(qa_json_file, 'r') as f:
        qrel_test = json.load(f)
    
    qad_triples = get_qad_triples(qa_json)

    random.seed(args.seed)
    random.shuffle(qad_triples)

    data = []
    for qad in tqdm(qad_triples):
        qid = qad['QuestionId']

        text = get_text(qad, qad['Source'])
        selected_text = select_relevant_portion(text)

        question = qad['Question']
        para = {'context': selected_text, 'qas': [{'question': question, 'answers': []}]}
        data.append({'paragraphs': [para]})
        qa = para['qas'][0]
        qa['id'] = dataset_utils.get_question_doc_string(qid, qad['Filename'])
        qa['qid'] = qid

        ans_string, index = dataset_utils.answer_index_in_document(qad['Answer'], selected_text)
        if index == -1:
            if qa_json['Split'] == 'train':
                continue
        else:
            qa['answers'].append({'text': ans_string, 'answer_start': index})

        if qa_json['Split'] == 'train' and len(data) >= args.sample_size and qa_json['Domain'] == 'Web':
            break

    squad = {'data': data, 'version': qa_json['Version']}
    utils.write_json_to_file(squad, squad_file)
    print ('Added', len(data))

In [9]:
def answer_index_in_document(answer, document):
    answer_list = answer['NormalizedAliases']
    for answer_string_in_doc in answer_list:
        index = document.lower().find(answer_string_in_doc)
        if index != -1:
            return answer_string_in_doc, index
    return answer['NormalizedValue'], -1

In [22]:
def write_json_to_file(json_object, json_file, mode='w', encoding='utf-8'):
    with open(json_file, mode, encoding=encoding) as outfile:
        json.dump(json_object, outfile, indent=4, sort_keys=True, ensure_ascii=False)
        
def convert_form(qrel, save_name):
    data = []
    contain_ans = set()
    squad_with_ans = 0
    for qad in tqdm_notebook(qrel):
        qid = qad['QuestionId']
        #print(qad)

        question = qad['Question_m'].replace('_', ' ')
        
        texts = get_text(qad['EntityPages'])

        p_ids, corpus = text_to_corpus_format(question, texts)

        #print(p_ids)
        vectorizer = TfidfVectorizer()
        X = vectorizer.fit_transform(corpus)
        result = sort_by_csim(X, p_ids)


        max_token = 350
        tmp = []
        key_for_chunck = ''
        new_text =  ''
        for key, sim in result:

            if len(tmp) == 4:
                break

            if len(new_text.split()) > max_token:
                tmp.append((new_text.lower(), key_for_chunck))
                key_for_chunck = ''
                new_text =  ''
    #         if sim == 0:
    #             break

            cur_text = corpus[p_ids.index(key)].replace('_', ' ')
            new_text += cur_text
            key_for_chunck += (key + '/' + str(len(cur_text)) + '/')


        for selected_text, key in tmp:

            para = {'context': selected_text, 'qas': [{'question': question, 'answers': []}]}

            qa = para['qas'][0]
            qa['id'] = key
            qa['qid'] = qid


            ans_string, index = answer_index_in_document(qad['Answer'], selected_text)
            #print(qad['Answer'])
            if index != -1:
                is_impossible = False
                contain_ans.add(qid)
                squad_with_ans += 1 
                qa['answers'].append({'text': ans_string, 'answer_start': index})
            else:
                is_impossible = True

            qa['is_impossible'] = is_impossible

            if len(qa['answers']) > 1:
                print(para)
                raise ValueError("For training, each question should have exactly 1 answer.")

            data.append({'paragraphs': [para]})

        print(len(contain_ans), squad_with_ans, end='\r')

    #     if idx > 10:
    #         break

    squad = {'data': data}
    write_json_to_file(squad, save_name)
    print ('Added', len(data), 'number_of_qid_w_ans', len(contain_ans), "total", len(qrel))
    data[0]

In [11]:
with open('qa_m/wikipedia-train.json', 'r') as f:
    qrel = json.load(f)

In [23]:
convert_form(qrel, 'train-wiki-triviaqa.json')

HBox(children=(IntProgress(value=0, max=61888), HTML(value='')))

53692 129792027948
Added 224962 number_of_qid_w_ans 53692 total 61888


In [12]:
from tqdm import tqdm_notebook

In [18]:
with open('qa_m/wikipedia-dev.json', 'r') as f:
    qrel_dev = json.load(f)

In [21]:
convert_form(qrel_dev, 'dev-wiki-triviaqa.json')

HBox(children=(IntProgress(value=0, max=7993), HTML(value='')))

6908 16815
Added 28980 number_of_qid_w_ans 6908 total 7993


# 기본적으로 triviaQA -> squad formatting 으로 바꾸는 코드

1. Paragraph Selection에서 적용된 paragph란? 실제 문서 내 문단들의 묶음 (maximum 350단어)
2. Paragraph Selection Criteria? DocQA 처럼 질의와의 TF-IDF(질의 + 전체 문서를 global corpus 라고 가정하고 idf 계산) vector를 기준으로 cosine similarity가 가장 큰 4개의 paragph를 사용 (DocQA는 이렇게 4개를 sampling해서 각 epoch마다 랜덤하게 2개의 paragraph를 합쳐서 학습, 우린 장비상의 한게로 개별 학습)
3. Answer가 없는 경우는? 이경우도 training set에 포함했음 (DocQA에 따르면 정답이 없는 경우가 training에 도움이 된다고 함)

# 현재 필터링 방식 기준 
* 전체 중 정답이 있는 qid 비율 : 86.8% (53692/61888)
* 전체 데이터 중 정답이 있는 비율: 57.7% (129792/247552)

In [14]:
a = 'qa_m/wikipedia-dev.json'

In [15]:
os.path.dirname(a)

'qa_m'