In [6]:
import json
import requests
from tqdm import tqdm_notebook as tqdm
from scipy.special import softmax
import re
import time
import sys
import math
sys.path.append("..")
from scripts import evaluate_answer

In [7]:
# HTTP headers for the prediction request: JSON body in, JSON response expected.
headers = {"Content-Type": "application/json", "accept": "application/json"}

In [8]:
def json_load(name):
    """Parse the UTF-8 encoded JSON file at ``name`` and return the resulting object."""
    path = f'{name}'
    with open(path, mode='r', encoding='utf-8') as handle:
        content = handle.read()
    return json.loads(content)
    
def json_save(name, item):
    """Write ``item`` to the file ``name`` as UTF-8 JSON, indented by 4 spaces.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.
    An existing file is overwritten.
    """
    destination = f'{name}'
    with open(destination, mode='w', encoding='utf-8') as out_file:
        json.dump(item, out_file, indent=4, ensure_ascii=False)
        
def read_vanilla(name):
    """Read a JSON Lines file (one JSON document per line) into a list.

    The file is decoded as UTF-8, matching ``json_load``/``json_save``, so the
    result does not depend on the platform's default encoding. Blank lines are
    skipped instead of being passed to the JSON parser.

    Parameters:
        name: path of the file to read.

    Returns:
        list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    data = list()
    with open(name, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:  # tolerate empty lines, e.g. at the end of the file
                continue
            data.append(json.loads(line))

    return data

In [26]:
# VANILLA test split (one JSON document per line)
test = read_vanilla("../data/VANILLA/Extended_Dataset_Test.json")
# QAnswer responses for the test dataset (the order of the SPARQL candidates is unchanged)
responses = json_load("../processed_data/VANILLA/qanswer_test_responses_extended-0-7000.json") # NOTE(review): file name says 0-7000, not just the first thousand -- confirm
# labels obtained for each QAnswer response (see the file above)
labels = json_load("../processed_data/VANILLA/qanswer_test_responses_labels-tmp.json")
# for each SPARQL candidate: whether it is True (correct) or not
is_true = json_load("../processed_data/VANILLA/is_true-tmp.json")
# data built based on AlGA
alga_responses = json_load("../processed_data/VANILLA/vanilla_5000_evaluate_labels-int.json")
# full test set provided by AlGA; this is the list the metric and filtering cells below work on
full_test = json_load("../processed_data/VANILLA/full_test_evaluate_labels-int.json")

# ===========

In [372]:
# Evaluation data for the 5000-question subset.
# NOTE(review): another cell loads `eval_5k` from vanilla-5000-eval.json instead;
# whichever cell ran last determines the value -- confirm which file is intended.
eval_5k = json_load("../processed_data/VANILLA/vanilla_5000_evaluate.json")
# eval_5k = json_load("../processed_data/VANILLA/vanilla-5000-eval.json")

In [19]:
"""test_full = list()
for d in all_test:
    response = list()
    for candidate in d['SPARQL list']:
        candidate_precision = 0 if not candidate['response'] or len(candidate['response']) == 0 else int(candidate['evaluate'][0]/candidate['evaluate'][1])
        is_true = False if candidate_precision < 0.5 else True
        response.append({'is_true': is_true})
        
    item = {
        'question': d['question'],
        'quesiton_id': d['question_id'],
        'response': response
    }
    test_full.append(item)"""

In [369]:
"""alga_responses = list()
for d in eval_5k:
    item = {
        'quesiton_id': d['question_id'],
        'response': [{'query': i['SPARQL'], 'is_true': i['eval']} for i in d['SPARQLs']]
    }
    alga_responses.append(item)"""

In [211]:
# NOTE(review): rebinds `eval_5k`, which an earlier cell loads from
# vanilla_5000_evaluate.json -- confirm which of the two files is the intended source.
eval_5k = json_load("../processed_data/VANILLA/vanilla-5000-eval.json")

In [5]:
# Sanity check: number of entries loaded from each file.
# NOTE(review): the recorded output shows three different lengths, so these lists are
# presumably not aligned by index -- confirm before pairing them positionally.
len(responses), len(labels), len(is_true)

(7000, 5516, 6897)

# =============

## The **files above** were prepared as follows:

* `test` -- default test split of VANILLA
* `responses` -- first run `scripts/vanilla_qanswer.py`, then run `scripts/vanilla_run_sparql_candidates_on_wikidata.py`
* `labels` -- run `scripts/get_vanilla_labels_wikidata.py`
* `is_true` -- run `scripts/vanilla_get_right_answers.py`

In [28]:
def precision_at_k(data, k=1):
    """
    Mean precision@k: the share of relevant items among the top-k candidates,
    averaged over all questions.

    Each element of ``data`` is a question whose ``'response'`` entry is a ranked
    list of candidates; a candidate is relevant when its ``'is_true'`` value is
    truthy. The per-question count is always divided by ``k``, even when the
    question has fewer than ``k`` candidates.
    """
    assert k > 0

    per_question = []
    for question in data:
        top_k = question['response'][:k]
        hits = sum(1 for candidate in top_k if candidate['is_true'])
        per_question.append(hits / k)

    # average over questions
    return sum(per_question) / len(per_question)

In [29]:
def ndcg_at_k(data, k=1):
    """
    Mean Normalized Discounted Cumulative Gain at cut-off k.

    Each element of ``data`` is a question whose ``'response'`` entry is a ranked
    list of candidates with an ``'is_true'`` value. A candidate at 1-based rank r
    contributes ``(2**int(is_true) - 1) / log2(r + 1)``. The ideal DCG places the
    relevant candidates (at most ``k`` of them) at the top ranks. A question
    without any relevant candidate scores 0.
    """
    assert k > 0

    per_question = []
    for question in data:
        candidates = question['response']

        # DCG of the ranking as returned; enumerate is 0-based, hence rank + 2
        actual = [
            (2**int(candidate['is_true']) - 1) / math.log2(rank + 2)
            for rank, candidate in enumerate(candidates[:k])
        ]

        # ideal DCG: only relevant candidates, limited @ k, packed at the top
        relevant = [candidate for candidate in candidates if candidate['is_true']]
        ideal = [
            (2**int(candidate['is_true']) - 1) / math.log2(rank + 2)
            for rank, candidate in enumerate(relevant[:k])
        ]

        if ideal:
            per_question.append(sum(actual) / sum(ideal))
        else:
            per_question.append(0)

    return sum(per_question) / len(per_question)

In [23]:
"""def find_by_id(id_, data):
    for d in data:
        if d['question_id'] == id_:
            return d
        
    assert False"""

In [309]:
"""# unify data
for i in range(len(responses[:5000])):
    is_true_i = find_by_id(responses[i]['question_id'], is_true)
    
    assert responses[i]['question_id'] == is_true_i['question_id']
    
    for j in range(len(responses[i]['response'])):
        try:
            responses[i]['response'][j]['is_true'] = is_true_i['answer_list'][j]
        except:
            responses[i]['response'][j]['is_true'] = False"""

In [41]:
# precision@1 and precision@5 on the first 50 questions of full_test (before any filtering)
precision_at_k(full_test[:50], 1), precision_at_k(full_test[:50], 5)

(0.22, 0.11200000000000004)

In [42]:
# NDCG@1 and NDCG@5 on the first 50 questions of full_test (before any filtering)
ndcg_at_k(full_test[:50], 1), ndcg_at_k(full_test[:50], 5)

(0.22, 0.3026036483450379)

In [36]:
# Filter answer candidates with the answer-validation model and build a new dataset
# for evaluation.
#
# The cell is resumable: results accumulate in `qanswer_results_filtered`, and each run
# continues with the first question of `full_test` that has no result yet.
PREDICT_URL = 'http://10.144.43.10:41003/predict'
REQUEST_TIMEOUT_S = 300   # fail instead of hanging forever if the service stops answering
REJECT_CONFIDENCE = 0.99  # a candidate is dropped only if the model rejects it with more than this confidence

try:
    qanswer_results_filtered
except NameError:
    # Fresh kernel (the list does not exist yet): start from scratch instead of crashing.
    qanswer_results_filtered = list()

for q in tqdm(full_test[len(qanswer_results_filtered):]): # iterate over questions
    question_text = q['question']
    # START: here goes the code used to predict is answer valid or not
    batch = [[question_text, candidate['label']] for candidate in q['response']]

    data = json.dumps(batch, ensure_ascii=False)
    # NOTE(review): errors='ignore' silently drops every non-ASCII character from the
    # question and answer texts before they reach the model. Kept as-is because changing
    # it would change the model input -- confirm that this is intended.
    data = data.encode('ascii', 'ignore').strip()

    json_response = requests.post(PREDICT_URL,
                                  data=data,
                                  headers=headers,
                                  timeout=REQUEST_TIMEOUT_S)
    # Surface HTTP errors directly rather than as a confusing KeyError/JSON error below.
    json_response.raise_for_status()

    result = json_response.json() # parse the body once
    preds = result['predictions'] # get predictions for the q-a tuples set
    outputs = result['outputs']
    # END: here goes the code used to predict is answer valid or not

    answers = list()
    for j in range(len(preds)): # iterate over predictions for each candidate
        # if model says NOT TRUE we check how confident it is
        rejected = not preds[j] and max(softmax(outputs[j])) > REJECT_CONFIDENCE
        if not rejected:
            answers.append({'is_true': q['response'][j]['is_true']})
    qanswer_results_filtered.append({'response': answers})

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=8803.0), HTML(value='')))




In [38]:
# precision@1 and precision@5 on the filtered candidate lists
precision_at_k(qanswer_results_filtered, 1), precision_at_k(qanswer_results_filtered, 5)

(0.2948073701842546, 0.11830262423227819)

In [39]:
# NDCG@1 and NDCG@5 on the filtered candidate lists
ndcg_at_k(qanswer_results_filtered, 1), ndcg_at_k(qanswer_results_filtered, 5)

(0.2948073701842546, 0.3787585257378004)

In [None]:
# 0.3028, 0.4234566666666666
# 0.3028, 0.45180666666666663

In [37]:
# Persist the filtered candidate lists to disk.
json_save("../processed_data/VANILLA/qanswer_results_full_filtered-int.json", qanswer_results_filtered)
# NOTE(review): the disabled reload below reads qanswer_results_filtered.json, which is
# not the file written above -- confirm which file is intended before re-enabling it.
# qanswer_results_filtered = json_load("../processed_data/VANILLA/qanswer_results_filtered.json")

In [395]:
# Average number of candidates kept per question after filtering, relative to a
# baseline of 60 candidates.
# NOTE(review): 60 is presumably the number of candidates per question before
# filtering -- confirm.
BASELINE_CANDIDATES = 60

# A descriptive name is used instead of `_`, which IPython reserves for the last cell output.
kept_per_question = [len(q['response']) for q in qanswer_results_filtered]

print(sum(kept_per_question)/len(kept_per_question) - BASELINE_CANDIDATES)

-43.909800000000004
