In [1]:
%run ../utils/dataset_building.py

In [2]:
import sparql
from tqdm.notebook import trange

# key: type subclass name, value: set of type superclass names
# the dict will be used to map subclasses to superclasses by going up in the hierarchy
# until each subclass is translated to a top-level superclass
class_hierarchy_dict={}

# only relations between DBPedia ontology classes are kept (see the filter below)
ontology_prefix="http://dbpedia.org/ontology/"

# 200 requests is only an upper bound: the loop stops at the first empty page,
# so the progress bar may stop short of 200
for offset in trange(200):
    # select all subclass-superclass relations in DBPedia
    # 10000 is the maximum allowed amount of results by query, therefore to get all the relations,
    # we apply offsets in the result indexes
    q = (f"""select ?subclass ?superclass {{
        ?subclass rdfs:subClassOf ?superclass
        }}
        limit 10000
        offset {offset*10000}""")
    result = sparql.query('http://dbpedia.org/sparql', q).fetchall()
    if not result:
        # an empty page means the offset is past the last relation: no need to keep
        # sending requests that can only return empty responses
        break
    for row in result:
        subclass = row[0].value
        superclass = row[1].value
        # keep the relation only if the subclass is a DBPedia ontology class and the superclass
        # is either a DBPedia ontology class or the root type (an IRI ending with "Thing")
        if subclass.startswith(ontology_prefix) and (superclass.startswith(ontology_prefix) or superclass.endswith("Thing")):
            # keep only the last path segment of each IRI (the bare class name)
            subclass=subclass.split("/")[-1]
            superclass=superclass.split("/")[-1]

            # the initial idea was that a subclass could have multiple superclasses therefore
            # the superclasses are stored in a set for each subclass
            class_hierarchy_dict.setdefault(subclass,set()).add(superclass)

  0%|          | 0/200 [00:00<?, ?it/s]

In [3]:
# largest number of superclasses recorded for any subclass;
# a result of 1 means that every subclass has exactly one superclass
max(len(parents) for parents in class_hierarchy_dict.values())

1

In [5]:
# each subclass is expected to have exactly one superclass, therefore the superclass sets
# are replaced in place by the single superclass name they contain
for subclass,superclasses in class_hierarchy_dict.items():
    if isinstance(superclasses,str):
        # already flattened by a previous run of this cell: nothing to do
        continue
    if len(superclasses)!=1:
        # joining several names would silently produce a non-existent class name
        # (e.g. "AB" from {"A","B"}), so fail here with an explicit message instead
        raise ValueError(f"expected exactly one superclass for {subclass!r}, got {sorted(superclasses)!r}")
    # single-element unpacking: assigning to an existing key does not resize the dict,
    # so it is safe while iterating over items()
    (class_hierarchy_dict[subclass],)=superclasses

In [7]:
# save the subclass -> superclass mapping to a JSON file
# NOTE(review): dump_json is not defined in this notebook; presumably it comes from the
# script loaded with %run in the first cell - confirm
dump_json(class_hierarchy_dict,"../data/type_hierarchy.json")

In [53]:
# top-level DBPedia types are the direct subclasses of "owl#Thing";
# "Agent" is too general to be used as type (as described in the original paper),
# therefore the direct subclasses of "Agent" are collected as top-level types as well
toplevel_types = {
    child
    for child, parent in class_hierarchy_dict.items()
    if parent in ('owl#Thing', 'Agent')
}
# "Agent" itself is replaced by its subclasses, so it is dropped from the set
toplevel_types.remove('Agent')

# there were some articles that had "TimeInterval" as type, and this type has no superclass
toplevel_types.add("TimeInterval")

In [51]:
# number of top-level types
len(toplevel_types)

60

In [64]:
# collect all top-level types that were used in the original paper
# and count the lines of the file (one per typed article in 2017)
epfl_types=set()
nb_articles_with_type_2017=0
with open("../data/epfl_paper/article_types_dbpedia.tsv") as types_file:
    # counting from 1 leaves the number of lines read in the counter after the loop;
    # it keeps its initial value of 0 if the file is empty
    for nb_articles_with_type_2017,record in enumerate(types_file,start=1):
        # the first whitespace-separated field of each line is the article type
        epfl_types.add(record.split()[0])

In [55]:
# types used in the original paper that are missing from our top-level types
epfl_types.difference(toplevel_types)

{'NaturalEvent'}

In [20]:
# top-level types that were not used in the original paper (i.e. new since 2017)
toplevel_types.difference(epfl_types)

{'Algorithm',
 'Altitude',
 'Area',
 'Award',
 'Blazon',
 'Browser',
 'ChartsPlacements',
 'Cipher',
 'Colour',
 'Currency',
 'Deity',
 'Demographics',
 'Depth',
 'Diploma',
 'Disease',
 'ElectionDiagram',
 'Employer',
 'EthnicGroup',
 'Family',
 'FictionalCharacter',
 'FileSystem',
 'Flag',
 'GeneLocation',
 'GrossDomesticProduct',
 'GrossDomesticProductPerCapita',
 'Identifier',
 'List',
 'Media',
 'MedicalSpecialty',
 'Medicine',
 'PersonFunction',
 'Population',
 'Protocol',
 'PublicService',
 'Relationship',
 'Spreadsheet',
 'StarCluster',
 'Statistic',
 'Tank',
 'TimeInterval',
 'Unknown'}

In [62]:
# (article id, article title) pairs read from the TSV file
article_ids_titles_tuple_list=read_tuple_list_from_file((int,str),"../data/article_ids_titles.tsv")

# key: article id, value: article title
article_id_title_dict={}
# key: article title, value: article id
article_title_id_dict={}

for current_id,current_title in article_ids_titles_tuple_list:
    article_id_title_dict[current_id]=current_title
    article_title_id_dict[current_title]=current_id

# titles of all the articles whose type has to be retrieved:
# exactly the keys of the title -> id mapping
article_titles_to_get=set(article_title_id_dict)

In [54]:
import urllib.parse

# key: article title, value: top-level DBPedia type of the article
article_title_type_dict={}

# every subject of interest starts with this prefix; the remainder (minus the closing ">")
# is the resource name
resource_prefix="<http://dbpedia.org/resource/"

# the file contains non-ASCII resource names, so the encoding is set explicitly
# instead of relying on the platform default
with open("../data/instance-types_lang=en_specific.ttl",encoding="utf-8") as file:
    # line example: <http://dbpedia.org/resource/Étude_Op._25,_No._2_(Chopin)> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/ClassicalMusicComposition> .
    for line in file:
        splited_line=line.split()
        # skip blank lines, comment lines and anything that is not a
        # "<resource> <predicate> <type> ." statement about a DBPedia resource
        if len(splited_line)<3 or not splited_line[0].startswith(resource_prefix):
            continue
        # resource name example in file: <http://dbpedia.org/resource/Étude_Op._25,_No._2_(Chopin)>
        # we need to extract Étude_Op._25,_No._2_(Chopin):
        # the prefix is stripped (instead of keeping the last "/"-separated segment) so that
        # names containing a slash, such as AC/DC, are kept whole
        resource_name=urllib.parse.unquote(splited_line[0][len(resource_prefix):-1])
        # dbpedia type example in file: <http://www.w3.org/2002/07/ClassicalMusicComposition>
        # we need to extract ClassicalMusicComposition:
        dbpedia_type=splited_line[2].split("/")[-1][:-1]
        if resource_name in article_titles_to_get and dbpedia_type!="owl#Thing" and dbpedia_type!="Agent":
            # we translate subclass types to superclass types until we reach the top of the class hierarchy
            # (a type that is neither top-level nor a key of class_hierarchy_dict raises KeyError)
            while dbpedia_type not in toplevel_types:
                dbpedia_type=class_hierarchy_dict[dbpedia_type]
            article_title_type_dict[resource_name]=dbpedia_type

In [44]:
# distinct top-level types that were assigned to at least one article
retrieved_types_set=set(article_title_type_dict.values())

In [46]:
# types used in the original paper that were not assigned to any of our articles
epfl_types.difference(retrieved_types_set)

{'NaturalEvent'}

In [45]:
# types assigned to our articles that were not used in the original paper (added since 2017)
retrieved_types_set.difference(epfl_types)

{'Award',
 'Colour',
 'Currency',
 'Disease',
 'EthnicGroup',
 'FictionalCharacter',
 'Identifier',
 'MedicalSpecialty',
 'TimeInterval'}

In [58]:
# input expected by the WCNPruning tool: a list of (article id, article type) tuples;
# each typed article title is translated back to its article id
article_ids_types=[
    (article_title_id_dict[typed_title],assigned_type)
    for typed_title,assigned_type in article_title_type_dict.items()
]

In [60]:
# save the (article id, article type) tuples
# NOTE(review): write_tuple_list_to_file is not defined in this notebook; presumably it
# comes from the script loaded with %run in the first cell - confirm
write_tuple_list_to_file(article_ids_types,"../data/article_types.tsv")

In [63]:
# check which proportion of articles have a defined type:
# number of articles with a type divided by the number of distinct article titles
len(article_ids_types)/len(article_titles_to_get)

0.5774875100007044

In [None]:
# distinct article ids of the 2017 dataset:
# the first whitespace-separated field of each line, parsed as an integer
with open("../data/epfl_paper/article_categories_sept17.tsv") as categories_file:
    article_ids_2017={int(record.split()[0]) for record in categories_file}

In [67]:
# check which proportion of articles had a defined type in 2017:
# number of lines of the 2017 types file divided by the number of distinct 2017 article ids
nb_articles_with_type_2017/len(article_ids_2017)

0.5259709994922241