In [1]:
import json
import re
import time
from SPARQLWrapper import SPARQLWrapper, JSON
import sqlite3
from tqdm import tqdm_notebook as tqdm
# Open the SQLite database that get_labels_from_db queries for cached labels.
cache3 = sqlite3.connect('../processed_data/VANILLA/cache.db')

In [2]:
def json_load(name):
    """Parse the UTF-8 encoded JSON file at ``name`` and return its content."""
    path = f"{name}"
    with open(path, mode="r", encoding="utf-8") as handle:
        content = json.load(handle)
    return content
    
def json_save(name, item):
    """Write ``item`` to the file at ``name`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (not escaped) and the output is
    indented by four spaces. An existing file is overwritten.
    """
    path = f"{name}"
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(item, handle, indent=4, ensure_ascii=False)

In [3]:
def get_labels_from_db(URI):
    """Return the cached value(s) stored for ``URI`` as a list.

    Queries the ``main`` table of the ``cache3`` connection for rows whose
    ``key`` equals ``URI``. Because of ``LIMIT 1`` the list holds at most one
    element; it is empty when no row matches.
    """
    rows = cache3.execute(
        'SELECT value FROM main WHERE key=(?) LIMIT 1', (URI, )
    ).fetchall()
    # Each row is a 1-tuple; unwrap it to the bare value.
    return [row[0] for row in rows]

In [4]:
def get_uris_from_response(response):
    """Collect every value in ``response`` that contains ``'wikidata'``.

    ``response`` is iterated as a sequence of mappings; values are returned in
    row order, then key order within each row.
    """
    return [
        row[key]
        for row in response
        for key in list(row.keys())
        if 'wikidata' in row[key]
    ]

In [5]:
def transform_prefix_uri_to_no_prefix(URIs):
    """Expand ``wdt:`` / ``wd:`` prefixed names into full Wikidata IRIs.

    Args:
        URIs: iterable of strings, e.g. ``'wd:Q42'``, ``'wdt:P21'`` or
            ``'<http://...>'``.

    Returns:
        A new list with one string per input, in the same order. ``wdt:`` is
        replaced by ``http://www.wikidata.org/prop/direct/`` and ``wd:`` by
        ``http://www.wikidata.org/entity/``; strings containing both ``<`` and
        ``>`` have those characters removed. Other strings pass through
        unchanged.
    """
    # Checked in order; only the first prefix found in a string is expanded.
    # The entity namespace must end with '/', otherwise 'wd:Q42' would become
    # '.../entityQ42', which is not a valid entity IRI.
    namespaces = (
        ('wdt:', 'http://www.wikidata.org/prop/direct/'),
        ('wd:', 'http://www.wikidata.org/entity/'),
    )

    transformed = []
    for URI in URIs:
        for prefix, namespace in namespaces:
            if prefix in URI:
                URI = URI.replace(prefix, namespace)
                break
        # Strip the angle brackets of a bracketed IRI.
        if '<' in URI and '>' in URI:
            URI = URI.replace('<', '').replace('>', '')
        transformed.append(URI)
    return transformed

In [23]:
# vanilla_5000 = json_load("../processed_data/VANILLA/vanilla_5000_evaluate.json")
# The file is read as JSON Lines: every line is parsed as its own JSON document.
all_test = []
with open("../processed_data/VANILLA/vanilla-all.json", mode='r', encoding='utf-8') as lines:
    all_test.extend(json.loads(line) for line in lines)

In [24]:
# Sample: SPARQL text of the first candidate of the first question.
query = all_test[0]['SPARQL list'][0]['SPARQL']

In [25]:
# Sample: stored response of the first candidate of the first question.
response = all_test[0]['SPARQL list'][0]['response']

In [26]:
# Whitespace-delimited tokens that start with "wd" and contain a colon
# (prefixed names such as wd:... / wdt:...).
prefix_ents = re.findall(r"(?<!\S)wd\S*:\S+", query)
# Contents of every <...> span in the query (non-greedy match).
no_prefix_ents = re.findall(r"<(.*?)>", query)

In [27]:
# Print the cached labels found for each bracketed URI of the sample query.
for ent in no_prefix_ents:
    print(get_labels_from_db(ent))

['Claude Nicolas Emmery']
['sex or gender']


In [32]:
# For every question, build one record that holds, per candidate query, the
# space-joined labels of its URIs and an integer correctness flag.
# NOTE(review): the recorded run of this cell ended with KeyError: 'evaluate',
# so at least one candidate in the loaded data lacks that key; confirm that the
# input file carries evaluation results before relying on this cell.
test_full_labels = []
for question in tqdm(all_test):
    candidates_label_list = []
    for candidate in question['SPARQL list']:
        query = candidate['SPARQL']
        response = candidate['response']

        # Gather URIs: bracketed IRIs from the query text, wikidata values
        # from the response (when there is one), then the expanded prefixed names.
        prefix_ents = re.findall(r"(?<!\S)wd\S*:\S+", query)
        no_prefix_ents = re.findall(r"<(.*?)>", query)
        if response:
            no_prefix_ents.extend(get_uris_from_response(response))
        no_prefix_ents.extend(transform_prefix_uri_to_no_prefix(prefix_ents))
        del prefix_ents

        # Look up the cached labels of every URI, keeping URI order.
        labels = [label for ent in no_prefix_ents for label in get_labels_from_db(ent)]

        # Flag = ratio of the two 'evaluate' entries truncated to an int;
        # 0 when the second entry is falsy (avoids dividing by zero).
        evaluate = candidate['evaluate']
        if evaluate[1]:
            candidate_precision = int(evaluate[0] / evaluate[1])
        else:
            candidate_precision = 0
        is_true = candidate_precision

        candidates_label_list.append({
            'label': ' '.join(filter(None, labels)),
            'is_true': is_true,
        })

    record = {
        'question': question['question'],
        'question_id': question['question_id'],
        'response': candidates_label_list,
    }
    test_full_labels.append(record)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=8955.0), HTML(value='')))

KeyError: 'evaluate'

In [30]:
# Write the per-question label records to disk as JSON.
json_save("../processed_data/VANILLA/full_test_evaluate_labels-int.json", test_full_labels)

In [31]:
# Display the first record as a quick sanity check.
test_full_labels[0]

{'question': 'Which sex does Claude Nicolas Emmery possess ?',
 'question_id': 58504,
 'response': [{'label': 'Claude Nicolas Emmery sex or gender male',
   'is_true': True},
  {'label': 'Claude Lorrain sex or gender male', 'is_true': False},
  {'label': 'Claude Nicolas Emmery', 'is_true': False},
  {'label': 'Sextans', 'is_true': False},
  {'label': 'sexual intercourse', 'is_true': False},
  {'label': 'sex', 'is_true': False},
  {'label': 'Claude sex or gender male organism', 'is_true': False},
  {'label': 'Claude Lorrain', 'is_true': False},
  {'label': 'Claude Nicolas', 'is_true': False},
  {'label': 'Claude Nicolas sex or gender male', 'is_true': False},
  {'label': 'Claude Nicolas Emmery instance of human', 'is_true': False},
  {'label': 'Claude-Nicolas', 'is_true': False},
  {'label': 'Claude sex or gender male', 'is_true': False},
  {'label': 'Nicholas of Lyra sex or gender male', 'is_true': False},
  {'label': 'Nicolas sex or gender male', 'is_true': False},
  {'label': 'Claude