In [1]:
import sys
sys.path.append('../')
from setting import config_read

In [2]:
import rdflib
import pandas as pd
from owlready2 import get_ontology
from collections import defaultdict



In [3]:
# Read the OWL file path from the config object returned by config_read.
config = config_read('../')
data_path = config['owl']['path']

# Parse that file into an in-memory RDF graph.
g = rdflib.Graph()
g.parse(data_path)

# SPARQL query selecting every distinct (subject, predicate, object) triple.
knows_query = """
SELECT DISTINCT ?x ?y ?z
WHERE {
    ?x ?y ?z.
}"""

# Run the query and keep each result row as a list of three plain strings.
qres = g.query(knows_query)
triple_list = [[str(subj), str(pred), str(obj)] for subj, pred, obj in qres]
len(triple_list)

3825

In [4]:
# Namespace prefix -> namespace URI.
# The entry order is significant: uri2prefix inherits it, and code that
# iterates over uri2prefix sees the namespaces in exactly this order.
prefix2uri = {
    "schema": "http://schema.org/",
    "dcterms": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "time": "http://www.w3.org/2006/time#",
    "mo": "http://purl.org/ontology/mo/",
    "skmo": "http://www.sktelecom.com/skmo/",
    "bibo": "http://purl.org/ontology/bibo/",
    "owl": "http://www.w3.org/2002/07/owl#",
    "skpo": "http://www.sktelecom.com/skpo/",
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "j.0": "http://www.sktelecom.com/timeUnit#",
    "time-entry": "http://www.w3.org/2006/time-entry#",
    "event": "http://purl.org/NET/c4dm/event.owl#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "foaf1": "http://xmlns.com/foaf/0.1/foaf:foaf:",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "tl": "http://purl.org/NET/c4dm/timeline.owl#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "vs": "http://www.w3.org/2003/06/sw-vocab-status/ns#",
}

# Reverse lookup: namespace URI -> prefix.
uri2prefix = dict(zip(prefix2uri.values(), prefix2uri.keys()))

In [5]:
def _split_namespace(term, namespaces):
    """Split ``term`` into its namespace prefix and the remaining local part.

    Only a namespace URI at the very start of ``term`` is removed; later
    occurrences of the same text inside the term are left untouched. When
    several namespace URIs match, the longest one is used, so the result does
    not depend on the order of ``namespaces``.

    Args:
        term: String form of a subject, predicate or object.
        namespaces: Mapping of namespace URI -> prefix.

    Returns:
        ``(prefix, local)`` where ``local`` is ``term`` without its leading
        namespace URI, or ``(None, term)`` when no namespace URI matches.
    """
    best = max(
        (uri for uri in namespaces if term.startswith(uri)),
        key=len,
        default=None,
    )
    if best is None:
        return None, term
    return namespaces[best], term[len(best):]


triple_list2 = []      # nested list: terms written as "prefix:local"
triple_text_list = []  # nested list: terms with the namespace URI removed
for s, p, o in triple_list:
    compact_row, text_row = [], []
    for term in (s, p, o):
        prefix, local = _split_namespace(term, uri2prefix)
        # Terms outside every known namespace are kept unchanged in both lists.
        compact_row.append(term if prefix is None else prefix + ':' + local)
        text_row.append(local)

    triple_list2.append(compact_row)
    triple_text_list.append(text_row)
    # For a flat list of strings, append ', '.join(text_row) instead of text_row.

In [6]:
# Build the S/P/O frame from the namespace-stripped triples.
spo_cols = ['S', 'P', 'O']
triple_df = pd.DataFrame(triple_text_list, columns=spo_cols)

# Report how many rows repeat an earlier (S, P, O) combination.
dup_mask = triple_df.duplicated(subset=spo_cols)
print('겹치는 트리플 개수 : ', dup_mask.sum())

# Keep only the first occurrence of each triple and renumber the rows.
triple_df = triple_df[~dup_mask].reset_index(drop=True)
print('중복 제거 후 트리플 개수 : ', len(triple_df))

겹치는 트리플 개수 :  159
중복 제거 후 트리플 개수 :  3666


In [7]:
# Open the OWL file at data_path with owlready2, then load its contents.
onto = get_ontology(data_path)
onto = onto.load()

In [8]:
def _local_name(iri, namespaces):
    """Return ``iri`` with its leading namespace URI removed.

    The longest key of ``namespaces`` that ``iri`` starts with is cut from the
    front. When no key matches, ``iri`` is returned unchanged, so every entity
    always yields a name derived from its own IRI.

    Args:
        iri: Full IRI string of an ontology entity.
        namespaces: Mapping whose keys are namespace URIs.

    Returns:
        The local part of ``iri``, or ``iri`` itself if no namespace matches.
    """
    best = max(
        (uri for uri in namespaces if iri.startswith(uri)),
        key=len,
        default='',
    )
    return iri[len(best):]


# Local names of every object property, followed by every data property.
p_list = [_local_name(prop.iri, uri2prefix) for prop in onto.object_properties()]
p_list += [_local_name(prop.iri, uri2prefix) for prop in onto.data_properties()]

# Local name of each individual -> its ``is_a`` value.
# Names that were never stored resolve to '' through the defaultdict factory.
str2class = defaultdict(lambda: '')
for individual in onto.individuals():
    str2class[_local_name(individual.iri, uri2prefix)] = individual.is_a

In [9]:
# Keep only the triples whose predicate is one of the collected
# object/data property names, then renumber the rows.
is_known_property = triple_df['P'].isin(p_list)
only_dp_op_df = triple_df.loc[is_known_property].reset_index(drop=True)
only_dp_op_df

Unnamed: 0,S,P,O
0,신혁,hasMusicActivity,composer
1,Merry-go-round,playTime,193
2,Starlight,isSungBy,태연
3,찬미,realName,김찬미
4,원더걸스,groupType,걸그룹
...,...,...,...
1000,최성원,hasMusicActivity,base
1001,대성,hasMusicActivity,singer
1002,인형의_꿈,isSungBy,일기예보
1003,민아,hasMusicActivity,singer


In [10]:
# Resource mapping for domain instances.
# Each subject is looked up in str2class; subjects that are not stored there
# get ''. .get() is used so lookups never add new keys to str2class, and the
# result goes into a new frame so only_dp_op_df itself is left untouched.
subject_classes = only_dp_op_df['S'].map(lambda name: str2class.get(name, ''))
res_map_df = (
    only_dp_op_df
    .assign(domain=subject_classes)
    .explode('domain')          # one row per class of the subject
    .reset_index(drop=True)     # unique row labels for the masked assignment below
)

# Resource mapping for range instances.
# Objects not found in str2class are treated as literals and labelled
# 'Literal_<property>'.
# NOTE(review): an object whose text happens to equal an individual's name is
# mapped to that individual's classes even when P is a data property — confirm
# whether that is intended.
res_map_df['range'] = res_map_df['O'].map(lambda name: str2class.get(name, ''))
literal = res_map_df['range'] == ''
res_map_df.loc[literal, 'range'] = 'Literal_' + res_map_df.loc[literal, 'P']
res_map_df = res_map_df.explode('range')  # one row per class of the object

# NOTE(review): astype(str) stores the str() form of each class; the rendered
# output shows local file paths in some of these values — confirm whether a
# machine-independent class name is wanted here.
res_map_df = res_map_df.reset_index(drop=True).astype(str)
res_map_df

Unnamed: 0,S,P,O,domain,range
0,신혁,hasMusicActivity,composer,C:\Users\USER\Desktop\repository\text2sparql\d...,C:\Users\USER\Desktop\repository\text2sparql\d...
1,신혁,hasMusicActivity,composer,0.1.Person,C:\Users\USER\Desktop\repository\text2sparql\d...
2,Merry-go-round,playTime,193,mo.Track,Literal_playTime
3,Starlight,isSungBy,태연,mo.Track,C:\Users\USER\Desktop\repository\text2sparql\d...
4,Starlight,isSungBy,태연,mo.Track,0.1.Person
...,...,...,...,...,...
1488,대성,hasMusicActivity,singer,0.1.Person,C:\Users\USER\Desktop\repository\text2sparql\d...
1489,인형의_꿈,isSungBy,일기예보,mo.Track,mo.MusicGroup
1490,민아,hasMusicActivity,singer,C:\Users\USER\Desktop\repository\text2sparql\d...,C:\Users\USER\Desktop\repository\text2sparql\d...
1491,민아,hasMusicActivity,singer,0.1.Person,C:\Users\USER\Desktop\repository\text2sparql\d...


In [11]:
# Number of instance-level triples for each (domain, P, range) unit path.
unit_path_keys = ['domain', 'P', 'range']
dpr_df = res_map_df.groupby(unit_path_keys)['S'].count().reset_index()

# Total number of triples for each (domain, range) class pair.
class_pair_keys = ['domain', 'range']
dr_df = res_map_df.groupby(class_pair_keys)['S'].count().reset_index()

# Attach both counts to every row; the suffixes name them S_dpr and S_dr.
merged_df = res_map_df.merge(dpr_df, on=unit_path_keys, how='inner', suffixes=('', '_dpr'))
merged_df = merged_df.merge(dr_df, on=class_pair_keys, how='inner', suffixes=('', '_dr'))

# W = (triples on this unit path) / (all triples between the same class pair).
merged_df['W'] = merged_df['S_dpr'].div(merged_df['S_dr'])

# One row per distinct (domain, P, range, W) combination.
final_triple_df = (
    merged_df[['domain', 'P', 'range', 'W']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [12]:
# Display the unit-path table: one row per (domain, P, range) with its weight W.
final_triple_df

Unnamed: 0,domain,P,range,W
0,C:\Users\USER\Desktop\repository\text2sparql\d...,hasMusicActivity,C:\Users\USER\Desktop\repository\text2sparql\d...,1.000
1,0.1.Person,hasMusicActivity,C:\Users\USER\Desktop\repository\text2sparql\d...,1.000
2,mo.Track,playTime,Literal_playTime,1.000
3,mo.Track,isSungBy,C:\Users\USER\Desktop\repository\text2sparql\d...,0.384
4,mo.Track,isWrittenBy,C:\Users\USER\Desktop\repository\text2sparql\d...,0.256
...,...,...,...,...
129,schema.org.MusicAlbum,title,schema.org.MusicAlbum,1.000
130,schema.org.MusicAlbum,title,Literal_title,1.000
131,mo.Track,lyrics,Literal_lyrics,1.000
132,C:\Users\USER\Desktop\repository\text2sparql\d...,isSungBy,mo.MusicGroup,1.000


In [13]:
# Write the unit-path table to a CSV file one directory up, without the row index.
final_triple_df.to_csv(path_or_buf='../unit_path.csv', index=False)