In [4]:
import pandas as pd
import json

import uuid

In [2]:
with open("../../freebase_api/top_level_dbpedia_types.json", 'r', encoding='utf-8') as f:
    top_level_json = json.load(f)

In [3]:
top_level_types = list()

for type_ in top_level_json['results']['bindings']:
    top_level_types.append(type_['type']['value'])

## QALD

In [12]:
qald_train = pd.read_csv("../../data/QALD/rdf/QALD-train.csv")
qald_test = pd.read_csv("../../data/QALD/rdf/QALD-test.csv")
print(qald_train.shape, qald_test.shape)

(393, 3) (141, 3)


In [13]:
qald_train.head()

Unnamed: 0,question,questionText,type
0,urn:QALD:id:1,List all boardgames by GMT.,http://dbpedia.org/ontology/Activity
1,urn:QALD:id:290,List all games by GMT.,http://dbpedia.org/ontology/Activity
2,urn:QALD:id:4,In which U.S. state is Area 51 located?,http://dbpedia.org/ontology/Place
3,urn:QALD:id:6,Which countries have places with more than two...,http://dbpedia.org/ontology/Place
4,urn:QALD:id:7,Where did Abraham Lincoln die?,http://dbpedia.org/ontology/Place


In [14]:
def add_missing_questions(dataset):
    #determine missing types
    non_existing_types = []
    for type_ in top_level_types:
        if type_ not in dataset.type.unique():
            non_existing_types.append(type_)
    
    #filli dict with fantom questions of missing types
    fantom_questions = {'question': [], 'questionText': [], 'type': []}
    for type_ in non_existing_types:
        fantom_questions['question'].append('fantom')
        fantom_questions['questionText'].append("This is fantom {type_} question".format(type_=type_.replace("http://dbpedia.org/ontology/", "")))
        fantom_questions['type'].append(type_)

    #append missing questions to the original dataset
    dataset = dataset.append(pd.DataFrame.from_dict(fantom_questions))
    return dataset


ADD_MISSING_QUESTIONS = True

if ADD_MISSING_QUESTIONS:
    qald_train = add_missing_questions(qald_train)
    qald_test = add_missing_questions(qald_test)

In [18]:
for dataset in [qald_train, qald_test]:
    #renaming primitive types
    dataset.replace(['urn:QALD:answertype:number',
                        'urn:QALD:answertype:boolean', 
                        'urn:QALD:answertype:string', 
                        'urn:QALD:answertype:date', 
                        'urn:QALD:answertype:uri'],
                        ['Number', 'Boolean', 'String', 'DateTime', 'String'], inplace=True)
    
    #renaming dbpedia types
    dataset.type = dataset.type.apply(lambda x: str(x.replace("http://dbpedia.org/ontology/","")))

In [19]:
qald_train.head()

Unnamed: 0,question,questionText,type
0,urn:QALD:id:1,List all boardgames by GMT.,Activity
1,urn:QALD:id:290,List all games by GMT.,Activity
2,urn:QALD:id:4,In which U.S. state is Area 51 located?,Place
3,urn:QALD:id:6,Which countries have places with more than two...,Place
4,urn:QALD:id:7,Where did Abraham Lincoln die?,Place


In [20]:
qald_train.type.value_counts()

Agent                            125
Place                             87
Number                            52
Work                              38
Boolean                           37
DateTime                          15
String                            12
Language                           6
MeanOfTransportation               3
Disease                            3
Holiday                            3
Food                               2
Currency                           2
Activity                           2
Device                             2
Relationship                       1
Altitude                           1
GrossDomesticProduct               1
PersonFunction                     1
PublicService                      1
AnatomicalStructure                1
ElectionDiagram                    1
RouteStop                          1
Medicine                           1
Event                              1
Demographics                       1
Flag                               1
D

In [21]:
#saving final dataset to the files
qald_train.to_csv("../../data/QALD/prep_dbpedia/QALD-train.csv", index=False, sep=';')
qald_test.to_csv("../../data/QALD/prep_dbpedia/QALD-test.csv", index=False, sep=';')

## LC-QuAD

In [7]:
lcquad_train = pd.read_csv("../../data/LC-QuAD/RDF/LC-QuAD-train.csv")
lcquad_test = pd.read_csv("../../data/LC-QuAD/RDF/LC-QuAD-test.csv")

lcquad_train = lcquad_train[lcquad_train.type != 'urn:LCQuAD:answertype:remove']
lcquad_test = lcquad_test[lcquad_test.type != 'urn:LCQuAD:answertype:remove']

print(lcquad_train.shape, lcquad_test.shape)

(1868, 3) (462, 3)


In [8]:
lcquad_train.head()

Unnamed: 0,question,questionText,type
0,urn:LCQuAD:id:4635,What is the occupation of the Irving Chernev a...,http://dbpedia.org/ontology/Activity
1,urn:LCQuAD:id:2527,What sport amongst the one Lagos Preparatory S...,http://dbpedia.org/ontology/Activity
2,urn:LCQuAD:id:151,Which sports are played at institues in Taguig?,http://dbpedia.org/ontology/Activity
3,urn:LCQuAD:id:4863,List the common sports between Fr. Agnel Multi...,http://dbpedia.org/ontology/Activity
4,urn:LCQuAD:id:4058,What is the game whose genre is Mind sport?,http://dbpedia.org/ontology/Activity


In [9]:
for dataset in [lcquad_train, lcquad_test]:
    #renaming primitive types
    dataset.replace(['urn:LCQuAD:answertype:number',
                        'urn:LCQuAD:answertype:boolean', 
                        'urn:LCQuAD:answertype:string', 
                        'urn:LCQuAD:answertype:date', 
                        'urn:LCQuAD:answertype:uri'],
                        ['Number', 'Boolean', 'String', 'DateTime', 'String'], inplace=True)
    
    #renaming dbpedia types
    dataset.type = dataset.type.apply(lambda x: str(x.replace("http://dbpedia.org/ontology/","")))

In [10]:
lcquad_train.head()

Unnamed: 0,question,questionText,type
0,urn:LCQuAD:id:4635,What is the occupation of the Irving Chernev a...,Activity
1,urn:LCQuAD:id:2527,What sport amongst the one Lagos Preparatory S...,Activity
2,urn:LCQuAD:id:151,Which sports are played at institues in Taguig?,Activity
3,urn:LCQuAD:id:4863,List the common sports between Fr. Agnel Multi...,Activity
4,urn:LCQuAD:id:4058,What is the game whose genre is Mind sport?,Activity


In [12]:
lcquad_train.type.value_counts()

Number                  535
Agent                   439
Place                   311
Boolean                 285
Work                    137
Activity                 38
EthnicGroup              23
Award                    23
String                   19
Event                    14
Species                  13
Device                    9
Disease                   5
Food                      4
Currency                  3
Language                  3
AnatomicalStructure       3
MeanOfTransportation      2
PersonFunction            1
TopicalConcept            1
Name: type, dtype: int64

In [13]:
#saving final dataset to the files
lcquad_train.to_csv("../../data/LC-QuAD/prep_dbpedia/LC-QuAD-train.csv", index=False, sep=';')
lcquad_test.to_csv("../../data/LC-QuAD/prep_dbpedia/LC-QuAD-test.csv", index=False, sep=';')

## WebQuestions

In [100]:
webq_train = pd.read_csv("../../data/WebQuestions/Manually Labeled/train.csv")
webq_test = pd.read_csv("../../data/WebQuestions/Manually Labeled/test.csv")
webq_val = pd.read_csv("../../data/WebQuestions/Manually Labeled/val.csv")
webq_devtest = pd.read_csv("../../data/WebQuestions/Manually Labeled/devtest.csv")


webq_train = webq_train[webq_train['Manual Annotation'] != 'REMOVE']
webq_test = webq_test[webq_test['Manual Annotation'] != 'REMOVE']
webq_val = webq_val[webq_val['Manual Annotation'] != 'REMOVE']
webq_devtest = webq_devtest[webq_devtest['Manual Annotation'] != 'REMOVE']

print(webq_train.shape, webq_test.shape)

(2779, 8) (2007, 7)


In [101]:
webq_train.head()

Unnamed: 0,Number,question,Annotated Class,Manual Annotation,class,Manual Check,Remarks,Number of empty annotations: 1
0,1,during what war did abraham lincoln serve as p...,Event,"Event (Competition, LifeCycleEvent, NaturalEve...",DATE-TIME,,,
1,2,from which university did president obama rece...,Place,"Place (ArchitecturalStructure, CelestialBody, ...",LOC,,,
2,3,how long is queen victoria's reign?,Number,"Number (Integer, Float, Salary, Height)",['base.kwebbase.kwsentence'],,,
3,4,how many australian states and territories?,Number,"Number (Integer, Float, Salary, Height)",LOC,,,
4,5,how many languages are there in the philippines?,Number,"Number (Integer, Float, Salary, Height)",LANG,,,


In [102]:
i = 0
names = ['Train', 'Test', 'Val', 'Devtest']

for dataset in [webq_train, webq_test, webq_val, webq_devtest]:
    #renaming primitive types
    dataset.replace(['Text'], ['String'], inplace=True)
    
    dataset.rename(columns={'Number': 'question', 'question': 'questionText', 'Annotated Class': 'type'}, inplace=True)
    #renaming dbpedia types
    dataset.question = dataset.question.apply(lambda x: "urn:WebQuestions{0}:id:{1}".format(names[i], str(x).replace('.0','')))
    dataset.drop(dataset.columns.difference(['question','questionText', 'type']), axis=1, inplace=True)
    i = i + 1

In [103]:
webq_train.head()

Unnamed: 0,question,questionText,type
0,urn:WebQuestionsTrain:id:1,during what war did abraham lincoln serve as p...,Event
1,urn:WebQuestionsTrain:id:2,from which university did president obama rece...,Place
2,urn:WebQuestionsTrain:id:3,how long is queen victoria's reign?,Number
3,urn:WebQuestionsTrain:id:4,how many australian states and territories?,Number
4,urn:WebQuestionsTrain:id:5,how many languages are there in the philippines?,Number


In [104]:
webq_train = webq_train.append(webq_devtest)
webq_test = webq_test.append(webq_val)

In [105]:
webq_train[webq_train['type'].isna()]

Unnamed: 0,question,questionText,type


In [106]:
any(webq_train.type.isna())

False

In [107]:
#saving final dataset to the files
webq_train.to_csv("../../data/WebQuestions/prep_dbpedia/WebQuestions-train.csv", index=False, sep=';')
webq_test.to_csv("../../data/WebQuestions/prep_dbpedia/WebQuestions-test.csv", index=False, sep=';')

In [108]:
webq_train.shape

(2966, 3)

In [109]:
webq_test.shape

(2752, 3)

## CogComp

In [112]:
cogcomp = pd.read_csv("../../data/CogComp/train_5500_prep.label",sep="=")
cogcomp.rename(columns={'question': 'questionText', 'subclass': 'type'}, inplace=True)

In [113]:
cogcomp.head()

Unnamed: 0,class,type,questionText
0,DESC,manner,How did serfdom develop in and then leave Russ...
1,ENTY,cremat,What films featured the character Popeye Doyle ?
2,DESC,manner,How can I find a list of celebrities ' real na...
3,ENTY,animal,What fowl grabs the spotlight after the Chines...
4,ABBR,exp,What is the full form of .com ?


In [114]:
cogcomp_classes = pd.read_csv("../../data/CogComp/CogCompTypes.csv")

In [115]:
class_map = dict()
for i,row in cogcomp_classes.iterrows():
    class_map[row['CogComp Class']] = row['Class']

In [116]:
cogcomp.type = cogcomp.type.apply(lambda x: class_map[x])

In [117]:
cogcomp = cogcomp[cogcomp['type'] != 'REMOVE']
cogcomp = cogcomp[cogcomp['type'] != 'Boolean']
cogcomp.head()

Unnamed: 0,class,type,questionText
1,ENTY,Work,What films featured the character Popeye Doyle ?
3,ENTY,Species,What fowl grabs the spotlight after the Chines...
4,ABBR,Text,What is the full form of .com ?
5,HUM,Agent,What contemptible scoundrel stole the cork fro...
6,HUM,Agent,What team did baseball 's St. Louis Browns bec...


In [118]:
cogcomp['question'] = ["urn:CogComp:id:{0}".format(i) for i in range(cogcomp.shape[0])]
cogcomp.drop('class', axis=1, inplace=True)

In [119]:
cogcomp.head()

Unnamed: 0,type,questionText,question
1,Work,What films featured the character Popeye Doyle ?,urn:CogComp:id:0
3,Species,What fowl grabs the spotlight after the Chines...,urn:CogComp:id:1
4,Text,What is the full form of .com ?,urn:CogComp:id:2
5,Agent,What contemptible scoundrel stole the cork fro...,urn:CogComp:id:3
6,Agent,What team did baseball 's St. Louis Browns bec...,urn:CogComp:id:4


In [120]:
cogcomp.type.value_counts()

Number                  1359
Agent                   1151
Text                     898
Place                    371
DateTime                 218
Work                     207
TopicalConcept           131
Species                  125
Food                     103
Disease                  103
Activity                  62
Event                     56
ChemicalSubstance         41
Colour                    40
MeanOfTransportation      27
Language                  16
AnatomicalStructure       16
Device                    10
Currency                   4
Name: type, dtype: int64

In [121]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(cogcomp, shuffle=True, random_state=42,)

In [122]:
train.to_csv("../../data/CogComp/pred_dbpedia/CogComp-train.csv", index=False, sep=';')
test.to_csv("../../data/CogComp/pred_dbpedia/CogComp-test.csv", index=False, sep=';')

In [4]:
train = pd.read_csv("../../data/CogComp/pred_dbpedia/CogComp-train.csv", sep=';')
test = pd.read_csv("../../data/CogComp/pred_dbpedia/CogComp-test.csv", sep=';')

In [5]:
for dataset in [train, test]:
    #renaming primitive types
    dataset.replace(['Text'], ['String'], inplace=True)

In [7]:
train.to_csv("../../data/CogComp/pred_dbpedia/CogComp-train.csv", index=False, sep=';')
test.to_csv("../../data/CogComp/pred_dbpedia/CogComp-test.csv", index=False, sep=';')

## SimpleQuestions

In [124]:
sq_train = pd.read_csv("../../data/SimpleQuestions/RDF/SimpleQuestions-train.csv")
sq_test = pd.read_csv("../../data/SimpleQuestions/RDF/SimpleQuestions-test.csv")
sq_val = pd.read_csv("../../data/SimpleQuestions/RDF/SimpleQuestions-val.csv")
print(sq_train.shape, sq_test.shape, sq_val.shape)

(41346, 3) (11950, 3) (5842, 3)


In [125]:
sq_train.head()

Unnamed: 0,question,questionText,type
0,urn:SimpleQuestions:number:78,What sport does notre dame fighting irish men'...,http://dbpedia.org/ontology/Activity
1,urn:SimpleQuestions:number:401,What sport is crici��ma esporte clube a part of?,http://dbpedia.org/ontology/Activity
2,urn:SimpleQuestions:number:1084,What sport did ron blaylock play,http://dbpedia.org/ontology/Activity
3,urn:SimpleQuestions:number:1172,which game was published by hans im gl��ck,http://dbpedia.org/ontology/Activity
4,urn:SimpleQuestions:number:1891,what sport does michigan state spartans footba...,http://dbpedia.org/ontology/Activity


In [126]:
for dataset in [sq_train, sq_test, sq_val]:    
    #renaming dbpedia types
    dataset.type = dataset.type.apply(lambda x: str(x.replace("http://dbpedia.org/ontology/","")))

In [127]:
sq_train.type.value_counts()

Place                   14482
Agent                   12161
Work                     6192
TopicalConcept           2899
Language                 2262
Disease                   636
EthnicGroup               507
Event                     503
ChemicalSubstance         424
Colour                    343
MeanOfTransportation      335
Species                   197
Device                    140
Activity                  122
Food                       88
AnatomicalStructure        21
Award                      16
Holiday                    13
Currency                    4
Biomolecule                 1
Name: type, dtype: int64

In [128]:
#saving final dataset to the files
sq_train.to_csv("../../data/SimpleQuestions/prep_dbpedia/SimpleQuestions-train.csv", index=False, sep=';')
sq_test.append(sq_val).to_csv("../../data/SimpleQuestions/prep_dbpedia/SimpleQuestions-test.csv", index=False, sep=';')

## Combinations

In [11]:
import itertools

datasets = ['LC-QuAD', 'QALD', 'CogComp', 'WebQuestions', 'SimpleQuestions']

In [12]:
combs = list()
for i in range(2,6):
    combs.append(list(itertools.combinations(datasets, i)))

In [13]:
for comb in combs:
    for ds in comb:
        datasets = list()
        ds_name = str()
        
        for name in ds:
            datasets.append(pd.read_csv("../../data/UnifiedSubclassDBpedia/{0}-train.csv".format(name), sep=';'))
            
            if ds_name == "":
                ds_name=name
            else:
                ds_name+='+'+name
        
        dataset = datasets[0]
        for _ in datasets[1:]:
            dataset = dataset.append(_)
        
        dataset.to_csv("../../data/UnifiedSubclassDBpedia/{0}-train.csv".format(ds_name), index=False, sep=';')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [14]:
combs[0]

[('LC-QuAD', 'QALD'),
 ('LC-QuAD', 'CogComp'),
 ('LC-QuAD', 'WebQuestions'),
 ('LC-QuAD', 'SimpleQuestions'),
 ('QALD', 'CogComp'),
 ('QALD', 'WebQuestions'),
 ('QALD', 'SimpleQuestions'),
 ('CogComp', 'WebQuestions'),
 ('CogComp', 'SimpleQuestions'),
 ('WebQuestions', 'SimpleQuestions')]

In [15]:
names = list()

for comb in combs:
    for ds in comb:
        ds_name = str()
        
        for name in ds:
            if ds_name == "":
                ds_name=name
            else:
                ds_name+='+'+name
                
        names.append(ds_name)

In [16]:
names

['LC-QuAD+QALD',
 'LC-QuAD+CogComp',
 'LC-QuAD+WebQuestions',
 'LC-QuAD+SimpleQuestions',
 'QALD+CogComp',
 'QALD+WebQuestions',
 'QALD+SimpleQuestions',
 'CogComp+WebQuestions',
 'CogComp+SimpleQuestions',
 'WebQuestions+SimpleQuestions',
 'LC-QuAD+QALD+CogComp',
 'LC-QuAD+QALD+WebQuestions',
 'LC-QuAD+QALD+SimpleQuestions',
 'LC-QuAD+CogComp+WebQuestions',
 'LC-QuAD+CogComp+SimpleQuestions',
 'LC-QuAD+WebQuestions+SimpleQuestions',
 'QALD+CogComp+WebQuestions',
 'QALD+CogComp+SimpleQuestions',
 'QALD+WebQuestions+SimpleQuestions',
 'CogComp+WebQuestions+SimpleQuestions',
 'LC-QuAD+QALD+CogComp+WebQuestions',
 'LC-QuAD+QALD+CogComp+SimpleQuestions',
 'LC-QuAD+QALD+WebQuestions+SimpleQuestions',
 'LC-QuAD+CogComp+WebQuestions+SimpleQuestions',
 'QALD+CogComp+WebQuestions+SimpleQuestions',
 'LC-QuAD+QALD+CogComp+WebQuestions+SimpleQuestions']

In [17]:
len(names)

26