In [1]:
import pandas as pd
import numpy as np
from io import StringIO

## Load data 

In [2]:
# Directory holding the three N-Triples dumps read by this notebook.
input_path = "/media/rivas/Data1/Data-mining/Kcap/datasets/"

# Full path of each dump file.
f_dbpedia = f"{input_path}dbpedia_2016-10.nt"
f_enriched = f"{input_path}enriched_dump2.nt"
f_drugbank = f"{input_path}drugbank_dump.nt"

# Read every dump without a header row, using tab as the separator.
# The recorded shapes below show a single column per frame, i.e. each raw
# line is kept whole and is split into terms later.
dbpedia_2016 = pd.read_csv(f_dbpedia, sep='\t', header=None)
dbpedia_People = pd.read_csv(f_enriched, sep='\t', header=None)
drugbank = pd.read_csv(f_drugbank, sep='\t', header=None)

# One (rows, columns) tuple per line, in load order.
print(dbpedia_2016.shape, dbpedia_People.shape, drugbank.shape, sep="\n")

(31050, 1)
(12511, 1)
(766920, 1)


## Split each triple into tab-separated subject, predicate and object

In [3]:
def transform_to_frame(file):
    """Turn a frame of raw triple lines into tab-separated text.

    Each value of the first column of ``file`` is converted to ``str`` and
    split on ``">"``. Depending on the number of pieces, the line is
    re-emitted with tab characters between the first terms:

    * 4 pieces whose last piece is ``" ."``: the first three pieces each get
      their ``">"`` back, are tab-joined, and ``" ."`` is appended.
    * exactly 3 pieces: all pieces get ``">"`` appended and are tab-joined,
      then any trailing ``">"`` is stripped.
    * more than 3 pieces otherwise: the first two pieces are tab-joined with
      ``">"`` restored; every remaining piece is concatenated with ``"^"``
      appended to it and used as the third field.
    * fewer than 3 pieces: the row index and the pieces are printed and the
      line is left out of the result.

    Parameters
    ----------
    file : pandas.DataFrame
        Frame whose first column holds one raw line per row.

    Returns
    -------
    str
        The accepted lines, each terminated by ``"\\n"``, concatenated in
        row order. Empty string when ``file`` has no rows.
    """
    # Nothing to convert; also avoids indexing a column that may not exist.
    if file.shape[0] == 0:
        return ""

    # Collect the output lines and join once at the end: repeated ``s += ...``
    # re-copies the growing string and gets slow on hundreds of thousands of
    # rows, as does a per-row ``file.iloc[i][0]`` lookup.
    lines = []
    for i, raw in enumerate(file.iloc[:, 0]):
        s_p_o = str(raw).split(">")
        n_parts = len(s_p_o)
        if n_parts == 4 and s_p_o[3] == " .":
            lines.append("\t".join(part + ">" for part in s_p_o[:3]) + " .\n")
        elif n_parts == 3:
            lines.append("\t".join(part + ">" for part in s_p_o).rstrip(">") + "\n")
        elif n_parts > 3:
            sub_pred = "\t".join(part + ">" for part in s_p_o[:2])
            # NOTE(review): every ">" inside the third field is replaced by
            # "^" and a trailing "^" is added. This is kept exactly as in the
            # original because the saved files depend on it -- confirm that
            # this substitution is intended.
            obj = "".join(part + "^" for part in s_p_o[2:])
            lines.append(sub_pred + "\t" + obj + "\n")
        else:
            print("Error Line ",i)
            print(s_p_o)
    return "".join(lines)

## Create dbpedia_2016 DataFrame

In [4]:
# Rewrite the raw dbpedia_2016 lines as tab-separated text, then parse that
# text from an in-memory buffer (the recorded output shows three columns).
string = transform_to_frame(dbpedia_2016)
str_io = StringIO(string)
frame_dbpedia_2016 = pd.read_csv(str_io, delimiter="\t", header=None)
print(frame_dbpedia_2016.shape)
# Preview the first five parsed rows.
frame_dbpedia_2016.head(5)

(31050, 3)


Unnamed: 0,0,1,2
0,<http://dbpedia.org/ontology/>,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/2002/07/owl#Ontology> .
1,<http://dbpedia.org/ontology/>,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://purl.org/vocommons/voaf#Vocabulary> .
2,<http://dbpedia.org/ontology/>,<http://purl.org/vocab/vann/preferredNamespac...,"""dbo"" ."
3,<http://dbpedia.org/ontology/>,<http://purl.org/vocab/vann/preferredNamespac...,"""http://dbpedia.org/ontology/"" ."
4,<http://dbpedia.org/ontology/>,<http://purl.org/dc/terms/title>,"""The DBpedia Ontology""@en ."


## Create dbpedia_People DataFrame

In [5]:
# Rewrite the raw dbpedia_People lines as tab-separated text, then parse that
# text from an in-memory buffer (the recorded output shows three columns).
string = transform_to_frame(dbpedia_People)
str_io = StringIO(string)
frame_dbpedia_People = pd.read_csv(str_io, delimiter="\t", header=None)
print(frame_dbpedia_People.shape)
# Preview the first five parsed rows.
frame_dbpedia_People.head(5)

(12511, 3)


Unnamed: 0,0,1,2
0,<http://dbpedia.org/resource/Dean_Whyte/dump2>,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/2002/07/owl#Thing> .
1,<http://dbpedia.org/resource/Dean_Whyte/dump2>,<http://dbpedia.org/ontology/abstract>,"""Dean Whyte (born 17 September 1988) is a wat..."
2,<http://dbpedia.org/resource/Dean_Whyte/dump2>,<http://xmlns.com/foaf/0.1/name>,"""Dean Whyte""@en ."
3,<http://dbpedia.org/resource/Dean_Whyte/dump2>,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.ontologydesignpatterns.org/ont/du...
4,<http://dbpedia.org/resource/Dean_Whyte/dump2>,<http://www.w3.org/ns/prov#wasDerivedFrom>,<http://en.wikipedia.org/wiki/Dean_Whyte?oldi...


## Create drugbank DataFrame

In [6]:
# Rewrite the raw drugbank lines as tab-separated text, then parse that text
# from an in-memory buffer (the recorded output shows three columns).
string = transform_to_frame(drugbank)
str_io = StringIO(string)
frame_drugbank = pd.read_csv(str_io, delimiter="\t", header=None)
print(frame_drugbank.shape)
# Preview the first five parsed rows.
frame_drugbank.head(5)

(766920, 3)


Unnamed: 0,0,1,2
0,<http://www4.wiwiss.fu-berlin.de/drugbank/reso...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#P...
1,<http://www4.wiwiss.fu-berlin.de/drugbank/reso...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#P...
2,<http://www4.wiwiss.fu-berlin.de/drugbank/reso...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#P...
3,<http://www4.wiwiss.fu-berlin.de/drugbank/reso...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#P...
4,<http://www4.wiwiss.fu-berlin.de/drugbank/reso...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#P...


## Concatenate and save DataFrame

In [7]:
# Stack the three triple frames row-wise; each frame keeps its own row labels,
# which does not matter for the export because the index is not written.
frames = [frame_dbpedia_2016, frame_dbpedia_People, frame_drugbank]
dataset_semantic = pd.concat(frames, axis=0)

# Write the combined triples as tab-separated text with no header and no index.
# to_csv returns None when it is given a path, so export_csv is always None.
# NOTE(review): default CSV quoting is in effect, so fields containing a
# double quote are written quoted/escaped -- confirm the consumer of the
# .ttl file expects that.
export_csv = dataset_semantic.to_csv(
    input_path+'dbpedia_semantic.ttl',
    sep='\t',
    header=False,
    index = None,
)

In [8]:
# Show the first five rows of the concatenated frame as a sanity check.
dataset_semantic.head(5)

Unnamed: 0,0,1,2
0,<http://dbpedia.org/ontology/>,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://www.w3.org/2002/07/owl#Ontology> .
1,<http://dbpedia.org/ontology/>,<http://www.w3.org/1999/02/22-rdf-syntax-ns#t...,<http://purl.org/vocommons/voaf#Vocabulary> .
2,<http://dbpedia.org/ontology/>,<http://purl.org/vocab/vann/preferredNamespac...,"""dbo"" ."
3,<http://dbpedia.org/ontology/>,<http://purl.org/vocab/vann/preferredNamespac...,"""http://dbpedia.org/ontology/"" ."
4,<http://dbpedia.org/ontology/>,<http://purl.org/dc/terms/title>,"""The DBpedia Ontology""@en ."


## dbpedia_2016 statistics 

In [9]:
# Count how often each distinct value occurs in columns 0, 1 and 2 of the
# dbpedia_2016 frame (subject, predicate and object respectively).
table_subj = frame_dbpedia_2016[0].value_counts()
table_pred = frame_dbpedia_2016[1].value_counts()
table_obj = frame_dbpedia_2016[2].value_counts()

# Save each count table as a headerless tab-separated file.
table_subj.to_csv(input_path+'subj_statistics_dbpedia_2016.csv', header=False, sep='\t')
table_pred.to_csv(input_path+'pred_statistics_dbpedia_2016.csv', header=False, sep='\t')
# Fix: the object file previously received table_pred, so the object counts
# were never saved and the file duplicated the predicate statistics.
table_obj.to_csv(input_path+'obj_statistics_dbpedia_2016.csv', header=False, sep='\t')

## dbpedia_People statistics

In [10]:
# Count how often each distinct value occurs in columns 0, 1 and 2 of the
# dbpedia_People frame (subject, predicate and object respectively).
table_subj = frame_dbpedia_People[0].value_counts()
table_pred = frame_dbpedia_People[1].value_counts()
table_obj = frame_dbpedia_People[2].value_counts()

# Save each count table as a headerless tab-separated file.
table_subj.to_csv(input_path+'subj_statistics_dbpedia_People.csv', header=False, sep='\t')
table_pred.to_csv(input_path+'pred_statistics_dbpedia_People.csv', header=False, sep='\t')
# Fix: the object file previously received table_pred, so the object counts
# were never saved and the file duplicated the predicate statistics.
table_obj.to_csv(input_path+'obj_statistics_dbpedia_People.csv', header=False, sep='\t')

## drugbank statistics

In [11]:
# Count how often each distinct value occurs in columns 0, 1 and 2 of the
# drugbank frame (subject, predicate and object respectively).
table_subj = frame_drugbank[0].value_counts()
table_pred = frame_drugbank[1].value_counts()
table_obj = frame_drugbank[2].value_counts()

# Save each count table as a headerless tab-separated file.
table_subj.to_csv(input_path+'subj_statistics_drugbank.csv', header=False, sep='\t')
table_pred.to_csv(input_path+'pred_statistics_drugbank.csv', header=False, sep='\t')
# Fix: the object file previously received table_pred, so the object counts
# were never saved and the file duplicated the predicate statistics.
table_obj.to_csv(input_path+'obj_statistics_drugbank.csv', header=False, sep='\t')