In [1]:
import json
import pandas as pd
from sklearn.model_selection import StratifiedKFold

In [72]:
# Lookup table from XML Schema datatype IRIs to the coarse literal type
# labels ('date' / 'number') that find_questions writes into each record.
literal_dict = dict([
    ('http://www.w3.org/2001/XMLSchema#date', 'date'),
    ('http://www.w3.org/2001/XMLSchema#dateTime', 'date'),
    ('http://www.w3.org/2001/XMLSchema#decimal', 'number'),
    ('http://www.w3.org/2001/XMLSchema#integer', 'number'),
])

In [90]:
def read_json(filename):
    """Load and return the JSON document stored at ``filename``.

    The file is always decoded as UTF-8 (the JSON interchange encoding)
    instead of the platform default, so non-ASCII question text is read the
    same way on every machine.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(filename, encoding='utf-8') as f:
        return json.load(f)
    
def write_json(data, filename):
    """Serialise ``data`` as pretty-printed JSON (4-space indent) to ``filename``.

    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the
    file handle is opened explicitly as UTF-8; relying on the platform default
    encoding could raise ``UnicodeEncodeError`` or produce a file that is not
    valid UTF-8 JSON.

    Raises:
        OSError: if the file cannot be opened for writing.
        TypeError: if ``data`` contains objects that are not JSON serialisable.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
        
def find_questions(dataset, source_data, id_key, question_key):
    """Join answer-type records with their question text and normalise types.

    Args:
        dataset: iterable of dicts with at least 'uid', 'category' and 'type'
            (a list of type strings).
        source_data: iterable of dicts holding the questions; each must have
            ``id_key``, and matched ones must have ``question_key``.
        id_key: key in ``source_data`` records that is compared to ``uid``.
        question_key: key in ``source_data`` records holding the question text.

    Returns:
        A new list of new dicts (the input records are NOT modified), one per
        kept record, each with 'question' added and 'type' rewritten so that:
          * for category 'literal', the list is replaced by the single label
            ``literal_dict[type[0]]``;
          * the prefix "http://dbpedia.org/ontology/" is shortened to "dbo:".

        A record is dropped when no (truthy) question is found for its uid,
        when its category is not 'literal' / 'resource' / 'boolean', when its
        type list is empty, or when its first type contains 'MathML' or
        'wktLiteral'.

    Raises:
        KeyError: if a kept literal record's first type is not a key of
            ``literal_dict``, or if a required key is missing from a record.
    """
    supported_categories = ('literal', 'resource', 'boolean')
    skipped_type_markers = ('MathML', 'wktLiteral')
    dbo_namespace = "http://dbpedia.org/ontology/"

    # Index the source once; setdefault keeps the FIRST record per id, which
    # is what a front-to-back linear search would have returned.
    records_by_id = {}
    for record in source_data:
        records_by_id.setdefault(record[id_key], record)

    new_dataset = list()

    for item in dataset:
        match = records_by_id.get(item['uid'])
        question = match[question_key] if match is not None else None

        if not question or item['category'] not in supported_categories or len(item['type']) == 0:
            continue

        first_type = item['type'][0]
        if any(marker in first_type for marker in skipped_type_markers):
            continue

        types = item['type']
        if item['category'] == 'literal':
            types = [literal_dict[first_type]]

        # Work on a shallow copy so the caller's records stay untouched and
        # the function can safely be applied to the same input again.
        enriched = dict(item)
        enriched['question'] = question
        enriched['type'] = [t.replace(dbo_namespace, "dbo:") for t in types]
        new_dataset.append(enriched)

    return new_dataset

In [91]:
# Question files stored under data/lcquad (bound to the lcquad2_* names).
lcquad2_train, lcquad2_test = (
    read_json(path)
    for path in ("../../data/lcquad/train.json", "../../data/lcquad/test.json")
)

# Question files stored under data/lcquad (bound to the lcquad_* names).
lcquad_train, lcquad_test = (
    read_json(path)
    for path in ("../../data/lcquad/train-data.json", "../../data/lcquad/test-data.json")
)

# Answer-type annotations read from the wikidata external_data folder.
lcquad2_train_types, lcquad2_test_types = (
    read_json(path)
    for path in (
        "../../data/at/wikidata/external_data/lcquad-2-train-types.json",
        "../../data/at/wikidata/external_data/lcquad-2-test-types.json",
    )
)

# Answer-type annotations read from the dbpedia folder.
lcquad_train_types, lcquad_test_types = (
    read_json(path)
    for path in (
        "../../data/at/dbpedia/lcquad-train-types.json",
        "../../data/at/dbpedia/lcquad-test-types.json",
    )
)

In [92]:
# Attach question text to the type annotations; these source records are
# matched on '_id' and carry the text under 'corrected_question'.
lcquad_train_types = find_questions(lcquad_train_types, lcquad_train, '_id', 'corrected_question')
lcquad_test_types = find_questions(lcquad_test_types, lcquad_test, '_id', 'corrected_question')

# Same join for the lcquad2 files, matched on 'uid' with the text under 'question'.
lcquad2_train_types = find_questions(lcquad2_train_types, lcquad2_train, 'uid', 'question')
# Fixed: this previously passed lcquad_test_types (the already-processed
# annotations of the other dataset), so the lcquad2 test split was never
# built from its own type file.
lcquad2_test_types = find_questions(lcquad2_test_types, lcquad2_test, 'uid', 'question')

In [93]:
# Spot-check one processed record (index 8) to eyeball the resulting fields.
lcquad2_train_types[8]

{'uid': 18165,
 'category': 'boolean',
 'type': ['boolean'],
 'question': 'Is it true that the carbon footprint of the iPhone X Max is 106?'}

In [94]:
# Flatten the 'type' lists of every literal-category record from the three
# processed datasets into one list (duplicates kept).
literal_types = [
    type_label
    for record in lcquad_train_types + lcquad_test_types + lcquad2_train_types
    if record['category'] == 'literal'
    for type_label in record['type']
]

In [95]:
# Distinct type labels found among the literal-category records.
set(literal_types)

{'date', 'number'}

In [96]:
def json_to_csv(json_data):
    """Turn processed question records into a DataFrame ready for CSV export.

    Args:
        json_data: iterable of dicts, each with the keys 'uid', 'question',
            'category' and 'type'.

    Returns:
        A DataFrame with columns 'id', 'question', 'category', 'type'. Rows
        with a missing category, type or question are dropped, curly braces
        are stripped from the question text, and rows whose cleaned question
        equals 'n/a' are removed. The original row index is kept (not reset).

    Raises:
        KeyError: if a record lacks one of the four required keys.
    """
    df = pd.DataFrame({
        'id': [q['uid'] for q in json_data],
        'question': [q['question'] for q in json_data],
        'category': [q['category'] for q in json_data],
        'type': [q['type'] for q in json_data],
    })

    # Bracket access throughout: 'type' is easy to confuse with an attribute.
    complete = df['category'].notna() & df['type'].notna() & df['question'].notna()

    # Explicit copy so the column assignment below writes to an independent
    # frame rather than to a filtered slice of the original.
    df = df[complete].copy()

    # Cast to str BEFORE stripping braces so a non-string question cannot
    # crash the cleanup; regex=False treats the braces as plain characters.
    df['question'] = (
        df['question']
        .astype(str)
        .str.replace('{', '', regex=False)
        .str.replace('}', '', regex=False)
    )

    return df[df['question'] != 'n/a']

In [98]:
# Write each processed dataset to its own '$'-separated CSV (no index column).
for records, csv_path in (
    (lcquad_train_types, "../../data/at/common/lcquad_train_types.csv"),
    (lcquad_test_types, "../../data/at/common/lcquad_test_types.csv"),
    (lcquad2_train_types, "../../data/at/common/lcquad2_train_types.csv"),
    (lcquad2_test_types, "../../data/at/common/lcquad2_test_types.csv"),
):
    json_to_csv(records).to_csv(csv_path, sep='$', index=False)