In [11]:
import pandas as pd
import json
import gc

#### If the category is "resource", answer types are ontology classes from either the DBpedia ontology or the Wikidata ontology. If the category is "literal", answer types are either "number", "date", or "string". If the category is "boolean", the answer type is always "boolean".

In [7]:
# Reset both names first so that a failed load cannot leave data from an
# earlier run of this cell bound to them.
dbpedia_json = None
wikidata_json = None

# The encoding is passed explicitly: without it open() decodes with the
# platform's locale encoding, which is not UTF-8 on every system.
# NOTE(review): assumes the dataset files are UTF-8 encoded JSON - confirm.
with open('../datasets/DBpedia/smarttask_dbpedia_train.json', encoding='utf-8') as json_file:
    dbpedia_json = json.load(json_file)

with open('../datasets/Wikidata/lcquad2_anstype_wikidata_train.json', encoding='utf-8') as json_file:
    wikidata_json = json.load(json_file)

In [8]:
# Show the first DBpedia record to see which fields each entry carries.
dbpedia_json[0]

{'id': 'dbpedia_1177',
 'question': 'Was Jacqueline Kennedy Onassis a follower of Melkite Greek Catholic Church?',
 'category': 'boolean',
 'type': ['boolean']}

In [9]:
# Show the first Wikidata record to see which fields each entry carries.
wikidata_json[0]

{'id': 19719,
 'question': 'What periodical literature does Delta Air Lines use as a moutpiece?',
 'category': 'resource',
 'type': ['publication',
  'recurring',
  'intellectual work',
  'text',
  'communication medium',
  'serial']}

In [24]:
def make_dataframe(ds_json, max_types=5, fill_value="n/a"):
    """Flatten a list of question records into a DataFrame with fixed type columns.

    Each record's ``type`` list is spread over the columns ``type_1`` ..
    ``type_<max_types>``: entries beyond ``max_types`` are dropped, and
    missing positions are filled with ``fill_value``.

    Parameters
    ----------
    ds_json : iterable of dict
        Records that each provide the keys ``'id'``, ``'question'``,
        ``'category'`` and ``'type'``. ``'type'`` is a sequence of answer
        types; ``None`` is treated like an empty sequence.
    max_types : int, default 5
        Number of ``type_N`` columns to create.
    fill_value : object, default "n/a"
        Placeholder stored where a record has fewer than ``max_types`` types.

    Returns
    -------
    pandas.DataFrame
        One row per record with the columns ``id``, ``question``,
        ``category``, ``type_1`` .. ``type_<max_types>``. The columns are
        present even when ``ds_json`` is empty.

    Raises
    ------
    KeyError
        If a record lacks one of the four required keys.
    """
    type_columns = ["type_{}".format(i) for i in range(1, max_types + 1)]
    ds_dict = {name: [] for name in ["id", "question", "category"] + type_columns}

    for item in ds_json:
        ds_dict["id"].append(item["id"])
        ds_dict["question"].append(item["question"])
        ds_dict["category"].append(item["category"])

        # Keep at most max_types entries, then pad to exactly max_types so
        # every column receives one value per record.
        types = list((item["type"] or [])[:max_types])
        types.extend([fill_value] * (max_types - len(types)))

        for column, value in zip(type_columns, types):
            ds_dict[column].append(value)

    return pd.DataFrame.from_dict(ds_dict)

# Flatten both training sets into DataFrames; the two calls do not depend
# on each other.
wikidata_df = make_dataframe(wikidata_json)
dbpedia_df = make_dataframe(dbpedia_json)

# Run a full garbage collection. As the last expression of the cell, its
# return value (the number of unreachable objects found) is what is displayed.
gc.collect()

0

In [25]:
# Preview the first rows of the flattened DBpedia frame.
dbpedia_df.head()

Unnamed: 0,id,question,category,type_1,type_2,type_3,type_4,type_5
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,boolean,,,,
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,dbo:Opera,dbo:MusicalWork,dbo:Work,,
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,date,,,,
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,boolean,,,,
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,dbo:EducationalInstitution,dbo:Organisation,dbo:Agent,,


In [26]:
# Preview the first rows of the flattened Wikidata frame.
wikidata_df.head()

Unnamed: 0,id,question,category,type_1,type_2,type_3,type_4,type_5
0,19719,What periodical literature does Delta Air Line...,resource,publication,recurring,intellectual work,text,communication medium
1,15554,Who is the child of Ranavalona I's husband?,resource,person,omnivore,natural person,,
2,974,Is it true Jeff_Bridges occupation Lane Chandl...,boolean,boolean,,,,
3,27610,Which is the operating income for Qantas?,literal,number,,,,
4,24488,which cola starts with the letter p,resource,soft drink,trademark,carbonated beverage,non-alcoholic beverage,symbol


In [27]:
# Number of DBpedia questions per answer category.
dbpedia_df["category"].value_counts()

resource    9584
literal     5188
boolean     2799
Name: category, dtype: int64

In [28]:
# Number of Wikidata questions per answer category.
wikidata_df["category"].value_counts()

resource    11683
literal      4429
boolean      2139
Name: category, dtype: int64

In [30]:
# Frequency of each value in the type_1 column across all DBpedia questions.
dbpedia_df["type_1"].value_counts()

boolean                  2799
dbo:Person               2087
string                   2068
number                   1634
date                     1486
                         ... 
dbo:Mollusca                1
dbo:WorldHeritageSite       1
dbo:MountainPass            1
dbo:Gymnast                 1
dbo:Sea                     1
Name: type_1, Length: 271, dtype: int64

In [31]:
# Frequency of each value in the type_1 column across all Wikidata questions.
wikidata_df["type_1"].value_counts()

boolean                 2139
string                  2086
date                    1301
number                  1044
person                   731
                        ... 
room                       1
monumental sculpture       1
poetics                    1
error                      1
white dwarf                1
Name: type_1, Length: 1682, dtype: int64

In [32]:
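# Optional export, left disabled. Note: pandas.read_csv treats the string "n/a"
# as missing by default, so the "n/a" fillers are read back as NaN unless
# keep_default_na=False is passed when loading these files.
#dbpedia_df.to_csv("../datasets/dbpedia.csv", sep="|", index=False)
#wikidata_df.to_csv("../datasets/wikidata.csv", sep="|", index=False)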

In [35]:
# type_1 frequencies restricted to DBpedia questions in the "resource" category.
dbpedia_df.loc[dbpedia_df["category"] == "resource", "type_1"].value_counts()

dbo:Person               2087
dbo:Country               716
dbo:City                  672
dbo:Company               371
dbo:Award                 339
                         ... 
dbo:Holiday                 1
dbo:AdultActor              1
dbo:Monument                1
dbo:Anime                   1
dbo:WorldHeritageSite       1
Name: type_1, Length: 267, dtype: int64

In [36]:
# type_1 frequencies restricted to Wikidata questions in the "resource" category.
wikidata_df.loc[wikidata_df["category"] == "resource", "type_1"].value_counts()

person                     731
omnivore                   651
natural person             466
city/town                  263
state                      247
                          ... 
record label                 1
federal holiday              1
textile                      1
active galactic nucleus      1
white dwarf                  1
Name: type_1, Length: 1679, dtype: int64

In [42]:
# type_1 frequencies restricted to DBpedia questions in the "literal" category.
dbpedia_df.loc[dbpedia_df["category"] == "literal", "type_1"].value_counts()

string    2068
number    1634
date      1486
Name: type_1, dtype: int64

In [41]:
# type_1 frequencies restricted to Wikidata questions in the "literal" category.
wikidata_df.loc[wikidata_df["category"] == "literal", "type_1"].value_counts()

string    2086
date      1301
number    1042
Name: type_1, dtype: int64