In [1]:
import sys
# Put the parent directory on the import path before importing `setting`
# (presumably where the project's setting module lives - confirm).
sys.path.append('../')
from setting import config_read

In [2]:
import rdflib
import pandas as pd
from owlready2 import get_ontology
from collections import defaultdict



In [3]:
# Look up the ontology file location under the ['owl']['path'] config entry.
config = config_read('../')
data_path = config['owl']['path']

# Parse the ontology file into an in-memory RDF graph.
g = rdflib.Graph()
g.parse(data_path)

# Select every distinct (subject, predicate, object) triple in the graph.
knows_query = """
SELECT DISTINCT ?x ?y ?z
WHERE {
    ?x ?y ?z.
}"""

# Convert each result row into a plain [s, p, o] list of strings.
qres = g.query(knows_query)
triple_list = [[str(row[0]), str(row[1]), str(row[2])] for row in qres]
len(triple_list)

3825

In [4]:
# Namespace prefix -> namespace URI for the vocabularies handled by this notebook.
prefix2uri = {
    "schema":     "http://schema.org/",
    "dcterms":    "http://purl.org/dc/terms/",
    "foaf":       "http://xmlns.com/foaf/0.1/",
    "time":       "http://www.w3.org/2006/time#",
    "mo":         "http://purl.org/ontology/mo/",
    "skmo":       "http://www.sktelecom.com/skmo/",
    "bibo":       "http://purl.org/ontology/bibo/",
    "owl":        "http://www.w3.org/2002/07/owl#",
    "skpo":       "http://www.sktelecom.com/skpo/",
    "dc":         "http://purl.org/dc/elements/1.1/",
    "xsd":        "http://www.w3.org/2001/XMLSchema#",
    "j.0":        "http://www.sktelecom.com/timeUnit#",
    "time-entry": "http://www.w3.org/2006/time-entry#",
    "event":      "http://purl.org/NET/c4dm/event.owl#",
    "skos":       "http://www.w3.org/2004/02/skos/core#",
    "foaf1":      "http://xmlns.com/foaf/0.1/foaf:foaf:",
    "rdfs":       "http://www.w3.org/2000/01/rdf-schema#",
    "tl":         "http://purl.org/NET/c4dm/timeline.owl#",
    "rdf":        "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "vs":         "http://www.w3.org/2003/06/sw-vocab-status/ns#",
}

# Reverse lookup: namespace URI -> prefix, in the same order as prefix2uri.
uri2prefix = dict(zip(prefix2uri.values(), prefix2uri.keys()))

In [5]:
def _shorten(term, namespaces):
    """Split `term` on its longest known leading namespace.

    Parameters
    ----------
    term : str
        A subject, predicate or object rendered as a string.
    namespaces : list of str
        Namespace URIs (keys of `uri2prefix`), ordered longest first.

    Returns
    -------
    tuple of (str, str)
        `(prefixed, local)` where `prefixed` is "prefix:local" and `local`
        is the term with the namespace removed. A term that starts with no
        known namespace is returned unchanged in both positions.
    """
    for namespace in namespaces:
        if term.startswith(namespace):
            # Slice rather than str.replace so only the leading namespace is
            # removed, even if the same text occurs again later in the term.
            local = term[len(namespace):]
            return uri2prefix[namespace] + ':' + local, local
    return term, term


# Longest namespace first, so ".../foaf/0.1/foaf:foaf:" is preferred over its
# own prefix ".../foaf/0.1/" regardless of dictionary ordering.
_namespaces_longest_first = sorted(uri2prefix, key=len, reverse=True)

triple_list2 = []       # nested list of [s, p, o] in "prefix:local" form
triple_text_list = []   # nested list of [s, p, o] with the namespace removed
for s, p, o in triple_list:
    shortened = [_shorten(term, _namespaces_longest_first) for term in (s, p, o)]
    triple_list2.append([prefixed for prefixed, _ in shortened])
    triple_text_list.append([local for _, local in shortened])

In [6]:
# Tabulate the namespace-stripped triples as subject / predicate / object columns.
spo_columns = ['S', 'P', 'O']
triple_df = pd.DataFrame(triple_text_list, columns=spo_columns)

# Flag every repeat of an already-seen (S, P, O) row; the first occurrence is kept.
duplicate_mask = triple_df.duplicated(spo_columns)
print('겹치는 트리플 개수 : ', int(duplicate_mask.sum()))

# Drop the flagged repeats and renumber the rows.
triple_df = triple_df[~duplicate_mask].reset_index(drop=True)
print('중복 제거 후 트리플 개수 : ', len(triple_df))

겹치는 트리플 개수 :  159
중복 제거 후 트리플 개수 :  3666


In [7]:
# Load the OWL file at `data_path` (the same file parsed into the rdflib graph)
# with owlready2, giving access to its properties and individuals.
onto = get_ontology(data_path).load()

In [8]:
def _local_name(iri):
    """Return `iri` with its longest known leading namespace removed.

    Namespaces are the keys of `uri2prefix`. An IRI that starts with none of
    them is returned unchanged, so one unmatched item can never inherit the
    name computed for the previous item.
    """
    matching = [namespace for namespace in uri2prefix if iri.startswith(namespace)]
    if not matching:
        return iri
    # Slice off only the leading namespace; prefer the longest match so the
    # result does not depend on dictionary ordering.
    return iri[len(max(matching, key=len)):]


# Local names of every object property, followed by every data property.
p_list = [_local_name(op.iri) for op in onto.object_properties()]
p_list.extend(_local_name(dp.iri) for dp in onto.data_properties())

# Individual local name -> that individual's `is_a` value; unknown names map to ''.
str2class = defaultdict(lambda: '')
for individual in onto.individuals():
    str2class[_local_name(individual.iri)] = individual.is_a

In [10]:
# Keep only triples whose predicate is one of the ontology's declared properties.
domain_triple_df = triple_df[triple_df.P.isin(p_list)].reset_index(drop=True)

# Attach the subject's classes. `.get` is used instead of `str2class[x]` so that
# misses do not insert new keys into the defaultdict as a side effect.
domain_triple_df['domain'] = domain_triple_df.S.apply(lambda x: str2class.get(x, ''))
# One row per subject class; renumber right away so the label-aligned
# assignments below never operate on duplicate index labels.
domain_triple_df = domain_triple_df.explode('domain').reset_index(drop=True)

# Attach the object's classes; objects that are not known individuals get ''.
domain_triple_df['range'] = domain_triple_df.O.apply(lambda x: str2class.get(x, ''))

# Objects without a class are labelled 'Literal_<predicate>'.
literal = domain_triple_df['range'] == ''
domain_triple_df.loc[literal, 'range'] = 'Literal_' + domain_triple_df.loc[literal, 'P']

# One row per object class, then render every cell as a string.
# NOTE(review): the recorded output shows some class names rendered with a
# local filesystem path - confirm whether that is intended in the exported data.
range_triple_df = domain_triple_df.explode('range')
range_triple_df = range_triple_df.reset_index(drop=True).astype(str)
range_triple_df

Unnamed: 0,S,P,O,domain,range
0,민아,realName,권민아,C:\Users\USER\Desktop\repository\text2sparql\d...,Literal_realName
1,민아,realName,권민아,0.1.Person,Literal_realName
2,거리에서,hasGenre,ost,mo.Track,C:\Users\USER\Desktop\repository\text2sparql\d...
3,I_Feat_버벌진트,title,I,mo.Track,schema.org.MusicAlbum
4,20160601_가온차트_1위,rankDate,2016-07-06,C:\Users\USER\Desktop\repository\text2sparql\d...,Literal_rankDate
...,...,...,...,...,...
1488,time_is_running_out,isSungBy,뮤즈,mo.Track,mo.MusicGroup
1489,Rain,hasSinger,태연,schema.org.MusicAlbum,C:\Users\USER\Desktop\repository\text2sparql\d...
1490,Rain,hasSinger,태연,schema.org.MusicAlbum,0.1.Person
1491,YG_엔터테인먼트,name,YG,0.1.Organization,Literal_name


In [12]:
dpr_keys = ['domain', 'P', 'range']
dr_keys = ['domain', 'range']

# Number of triples per (domain, P, range) and per (domain, range).
dpr_df = range_triple_df.groupby(dpr_keys, as_index=False)['S'].count()
dr_df = range_triple_df.groupby(dr_keys, as_index=False)['S'].count()

# Attach both counts to every triple as columns S_dpr and S_dr.
merged_triple_df = range_triple_df.merge(
    dpr_df, on=dpr_keys, how='inner', suffixes=('', '_dpr')
)
merged_triple_df = merged_triple_df.merge(
    dr_df, on=dr_keys, how='inner', suffixes=('', '_dr')
)

# W = share of a (domain, range) pair's triples that use predicate P.
merged_triple_df['W'] = merged_triple_df['S_dpr'].div(merged_triple_df['S_dr'])

# Collapse to one row per distinct (domain, P, range, W).
final_triple_df = (
    merged_triple_df.loc[:, ['domain', 'P', 'range', 'W']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [13]:
# Display the deduplicated (domain, P, range, W) table.
final_triple_df

Unnamed: 0,domain,P,range,W
0,C:\Users\USER\Desktop\repository\text2sparql\d...,realName,Literal_realName,1.0
1,0.1.Person,realName,Literal_realName,1.0
2,mo.Track,hasGenre,C:\Users\USER\Desktop\repository\text2sparql\d...,1.0
3,mo.Track,title,schema.org.MusicAlbum,0.4
4,mo.Track,inAlbum,schema.org.MusicAlbum,0.6
...,...,...,...,...
129,C:\Users\USER\Desktop\repository\text2sparql\d...,title,Literal_title,1.0
130,mo.Track,isArrangedBy,0.1.Organization,0.5
131,mo.Track,isComposedBy,0.1.Organization,0.5
132,mo.Track,isSungBy,owl.Thing,1.0


In [14]:
# Write the (domain, P, range, W) table to ../unit_path.csv without the index column.
final_triple_df.to_csv('../unit_path.csv', index=False)