In [2]:
import json
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from time import sleep
import ast
from SPARQLWrapper import SPARQLWrapper, JSON

In [3]:
# Maps the XML Schema datatype IRIs found on literal answers to the coarse
# labels 'date' and 'number'. Grouped by target label; insertion order is
# date, dateTime, decimal, integer.
literal_dict = {
    **dict.fromkeys(
        (
            'http://www.w3.org/2001/XMLSchema#date',
            'http://www.w3.org/2001/XMLSchema#dateTime',
        ),
        'date',
    ),
    **dict.fromkeys(
        (
            'http://www.w3.org/2001/XMLSchema#decimal',
            'http://www.w3.org/2001/XMLSchema#integer',
        ),
        'number',
    ),
}

In [4]:
def read_json(filename):
    """Load and return the JSON document stored in ``filename``.

    The file is decoded as UTF-8 explicitly instead of relying on the
    platform's default encoding, so non-ASCII question text is read the same
    way on every machine.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever ``json.load`` produces).
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)
    
def write_json(data, filename):
    """Serialise ``data`` as pretty-printed JSON into ``filename``.

    Because ``ensure_ascii=False`` emits non-ASCII characters verbatim, the
    file is opened as UTF-8 explicitly; with a locale-dependent default
    encoding the write could fail or produce a file that is not valid UTF-8.

    Args:
        data: JSON-serialisable value to write.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
        
def find_questions(dataset, source_data, id_key, question_key):
    """Attach question text to answer-type records and normalise their types.

    A record from ``dataset`` is kept only when all of the following hold:
    a non-empty question with the same id exists in ``source_data``, its
    category is 'literal', 'resource' or 'boolean', its type list is
    non-empty, and its first type does not contain 'MathML' or 'wktLiteral'.

    For kept records the type list of a 'literal' record is replaced by the
    single label ``literal_dict`` gives for its first type, and the DBpedia
    ontology prefix in every type is shortened to ``dbo:``.

    The input records are not modified: each kept record is returned as a
    shallow copy, so calling this function again on the same input gives the
    same result.

    Args:
        dataset: Iterable of dicts with 'uid', 'category' and 'type' keys.
        source_data: Iterable of dicts holding the question text.
        id_key: Key in ``source_data`` records that is matched against 'uid'.
            Its values must be hashable.
        question_key: Key in ``source_data`` records holding the question.

    Returns:
        A new list of dicts, each with an added 'question' entry.

    Raises:
        KeyError: If a kept 'literal' record has a first type that is not a
            key of ``literal_dict``.
    """
    kept_categories = ('literal', 'resource', 'boolean')
    excluded_markers = ('MathML', 'wktLiteral')

    # Index the questions once instead of scanning source_data per record.
    # setdefault keeps the first occurrence of a duplicated id, which is what
    # a front-to-back linear search would return.
    question_by_id = {}
    for source in source_data:
        question_by_id.setdefault(source[id_key], source[question_key])

    new_dataset = list()

    for record in dataset:
        question = question_by_id.get(record['uid'])
        if not question or record['category'] not in kept_categories or len(record['type']) == 0:
            continue

        first_type = record['type'][0]
        if any(marker in first_type for marker in excluded_markers):
            continue

        types = record['type']
        if record['category'] == 'literal':
            types = [literal_dict[first_type]]

        # Work on a copy so the caller's records stay untouched.
        enriched = dict(record)
        enriched['question'] = question
        enriched['type'] = [t.replace("http://dbpedia.org/ontology/", "dbo:") for t in types]
        new_dataset.append(enriched)

    return new_dataset

In [91]:
# Question files, read in the order: LC-QuAD 2.0 train/test, LC-QuAD train/test.
lcquad2_train, lcquad2_test = map(read_json, (
    "../../data/lcquad/train.json",
    "../../data/lcquad/test.json",
))

lcquad_train, lcquad_test = map(read_json, (
    "../../data/lcquad/train-data.json",
    "../../data/lcquad/test-data.json",
))

# Answer-type annotation files for the same four splits.
lcquad2_train_types, lcquad2_test_types = map(read_json, (
    "../../data/at/wikidata/external_data/lcquad-2-train-types.json",
    "../../data/at/wikidata/external_data/lcquad-2-test-types.json",
))

lcquad_train_types, lcquad_test_types = map(read_json, (
    "../../data/at/dbpedia/lcquad-train-types.json",
    "../../data/at/dbpedia/lcquad-test-types.json",
))

In [92]:
# Attach question text and normalised types to the LC-QuAD annotation records
# (ids are matched on '_id', question text comes from 'corrected_question').
lcquad_train_types = find_questions(lcquad_train_types, lcquad_train, '_id', 'corrected_question')
lcquad_test_types = find_questions(lcquad_test_types, lcquad_test, '_id', 'corrected_question')

# Same for LC-QuAD 2.0. The test split has to start from lcquad2_test_types;
# passing lcquad_test_types here would pair the already processed LC-QuAD
# records with LC-QuAD 2.0 questions and discard the loaded 2.0 test types.
lcquad2_train_types = find_questions(lcquad2_train_types, lcquad2_train, 'uid', 'question')
lcquad2_test_types = find_questions(lcquad2_test_types, lcquad2_test, 'uid', 'question')

In [93]:
# Spot-check one processed LC-QuAD 2.0 training record (index 8).
lcquad2_train_types[8]

{'uid': 18165,
 'category': 'boolean',
 'type': ['boolean'],
 'question': 'Is it true that the carbon footprint of the iPhone X Max is 106?'}

In [94]:
# Gather every type label carried by 'literal' records across the LC-QuAD
# train/test splits and the LC-QuAD 2.0 train split.
# NOTE(review): lcquad2_test_types is not part of this check - confirm that is intended.
literal_types = [
    label
    for record in lcquad_train_types + lcquad_test_types + lcquad2_train_types
    if record['category'] == 'literal'
    for label in record['type']
]

In [95]:
# Distinct literal labels collected above.
set(literal_types)

{'date', 'number'}

In [96]:
def json_to_csv(json_data):
    """Build a cleaned question / answer-type table from processed records.

    Despite the name, this returns a DataFrame; callers serialise it
    themselves with ``DataFrame.to_csv``.

    Cleaning steps, in order:
      1. drop rows whose category, type or question is missing;
      2. convert the question to ``str`` and strip '{' and '}' from it;
      3. drop rows whose question is exactly 'n/a' after stripping.

    Args:
        json_data: Iterable of dicts with 'uid', 'question', 'category' and
            'type' keys.

    Returns:
        A DataFrame with the columns ``id``, ``question``, ``category`` and
        ``type``.
    """
    # Materialise once so that one-shot iterables can be traversed per column.
    records = list(json_data)

    df = pd.DataFrame.from_dict({
        'id': [q['uid'] for q in records],
        'question': [q['question'] for q in records],
        'category': [q['category'] for q in records],
        'type': [q['type'] for q in records],
    })

    complete = df['category'].notna() & df['type'].notna() & df['question'].notna()
    # Explicit copy: the question column is rewritten below and must not be
    # assigned onto a filtered view of the original frame.
    df = df.loc[complete].copy()

    df['question'] = (
        df['question']
        .astype(str)
        .str.replace('{', '', regex=False)
        .str.replace('}', '', regex=False)
    )

    return df[df['question'] != 'n/a']

In [98]:
# Write each processed split to its own '$'-separated CSV file (no index column).
for split_records, csv_path in (
    (lcquad_train_types, "../../data/at/common/lcquad_train_types.csv"),
    (lcquad_test_types, "../../data/at/common/lcquad_test_types.csv"),
    (lcquad2_train_types, "../../data/at/common/lcquad2_train_types.csv"),
    (lcquad2_test_types, "../../data/at/common/lcquad2_test_types.csv"),
):
    json_to_csv(split_records).to_csv(csv_path, sep='$', index=False)

In [5]:
# SPARQL template consumed through str.format: {0} is replaced by an entity
# IRI, and the doubled braces become single literal braces after formatting.
# The query selects the English rdfs:label of that entity as ?name via the
# wikibase:label service.
query = """
            PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX bd: <http://www.bigdata.com/rdf#>
            PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>

            SELECT ?name WHERE {{
              SERVICE wikibase:label {{
                bd:serviceParam wikibase:language "en" .
                <{0}> rdfs:label ?name .
              }}
            }}
"""
    
def execute(query: str, endpoint_url: str):
    """Run a SPARQL query against ``endpoint_url`` with a bounded retry.

    Every attempt is preceded by a pause that starts at one second and grows
    by one second after each failure. Attempts are made while the pause is
    below five seconds, i.e. at most four times.

    Args:
        query: SPARQL query text.
        endpoint_url: URL of the SPARQL endpoint.

    Returns:
        The converted JSON response on success. On failure a dict
        ``{'error': <message>}`` holding the text of the last exception.
        Errors whose text contains 'MalformedQueryException' or 'bad formed'
        are not retried; the offending query is printed and the error dict
        is returned immediately.
    """
    max_wait = 5
    wait = 1
    # Kept outside the handler: the name bound by ``except ... as`` is
    # removed when the handler finishes, so it cannot carry the message out.
    last_error = ''
    while wait < max_wait:
        try:
            sleep(wait)
            sparql = SPARQLWrapper(endpoint_url)
            sparql.setQuery(query)
            sparql.setReturnFormat(JSON)
            return sparql.query().convert()
        except Exception as exc:
            last_error = str(exc)
            if 'MalformedQueryException' in last_error or 'bad formed' in last_error:
                print(query)
                return {'error': last_error}
            wait += 1

    return {'error': last_error}

def get_label_wdt(array):
    """Replace Wikidata entity IRIs in a type list by their English labels.

    Args:
        array: The type list, either as a list or as its string
            representation (as read back from a CSV cell).

    Returns:
        The parsed list unchanged when it is empty or when its first element
        is one of 'boolean', 'number', 'string', 'date'. Otherwise a list
        with one label per entry that contains
        'http://www.wikidata.org/entity' and for which a label could be
        fetched; other entries and failed lookups are skipped. The label
        list is also printed.
    """
    if isinstance(array, str):
        array = ast.literal_eval(array)

    # Nothing to look up for an empty list or for literal/boolean answers.
    if not array or array[0] in ['boolean', 'number', 'string', 'date']:
        return array

    label_list = list()
    for entity in array:
        if 'http://www.wikidata.org/entity' not in entity:
            continue
        result = execute(query.format(entity), 'https://query.wikidata.org/bigdata/namespace/wdq/sparql')
        try:
            label = list(result['results']['bindings'][0].values())[0]['value']
        except (KeyError, IndexError, TypeError):
            # Error dict from execute, or no binding for this entity: skip it.
            # Only these lookup failures are ignored, so an interrupt of the
            # long-running loop is no longer swallowed.
            continue
        label_list.append(label)

    print(label_list)
    return label_list

In [8]:
# Reload the exported LC-QuAD 2.0 training table and keep a 10% sample of the
# 'resource' rows for label lookup.
lcquad2_df = pd.read_csv("../../data/at/common/lcquad2_train_types.csv", sep='$')
# Fixed seed so that the same rows are drawn on every run of the notebook.
lcquad2_df = lcquad2_df[lcquad2_df.category == 'resource'].sample(frac=0.1, random_state=42)

In [9]:
# Size of the sampled frame: (rows, columns).
lcquad2_df.shape

(1235, 4)

In [10]:
# Swap each row's type list for the labels returned by get_label_wdt
# (one call per row; the function prints every label list it resolves).
lcquad2_df['type'] = lcquad2_df['type'].apply(get_label_wdt)

['natural person', 'omnivore', 'person']
['non-governmental organization', 'juridical person', 'political organization']
['ideology', 'world view', 'tradition', 'personal data', 'belief system', 'academic discipline', 'religion or world view', 'pattern of behaviour', 'human behavior', 'religion']
['deity', 'deity', 'Norse mythical character', 'deity']
['film award', 'film award category', 'award for best album']
['declaration', 'document', 'intellectual work', 'document', 'motion', 'document', 'historical source']
['natural person', 'omnivore', 'person']
['natural person', 'omnivore', 'person']
['business', 'enterprise', 'joint-stock company', 'company', 'enterprise', 'organization', 'economic unit', 'juridical person', 'operation', 'business', 'business']
['role', 'artificial entity']
['computer language', 'programming language', 'programming language', 'programming language']
['notation', 'writing system', 'group', 'writing system', 'alphabet', 'writing system']
['taxon', 'extinct ta

In [12]:
# Save the labelled sample with the same '$' separator used when it was read,
# without the index column.
lcquad2_df.to_csv("../../data/at/common/lcquad2_train_types-1.csv", sep='$', index=False)

In [11]:
# Display the sampled frame after its type column was replaced by labels.
lcquad2_df

Unnamed: 0,id,question,category,type
5045,11985,What was named for Herschel Space Observatory ...,resource,"[natural person, omnivore, person]"
12810,13806,"""Which party membership from Thomas Nast, name...",resource,"[non-governmental organization, juridical pers..."
16719,17208,What denomination of culture was Claude McKay ...,resource,"[ideology, world view, tradition, personal dat..."
2563,25495,'o',resource,"[deity, deity, Norse mythical character, deity]"
540,4232,What award was John Williams nominated for in ...,resource,"[film award, film award category, award for be..."
...,...,...,...,...
12659,24475,WHICH ARE THE AUDIO AND VIDEO INTERFACES AND C...,resource,"[technical standard, electrical connector, opt..."
7732,9599,Who are the parents of Aretha Franklin?,resource,"[natural person, omnivore, person]"
7390,16910,Name the tributary that flows form the harbour...,resource,[natural watercourse]
9156,5893,For what type of work Peter O'Toole was nomina...,resource,"[film award, class of award, film award catego..."
