In [1]:
import sys
sys.path.append('../')
from setting import config_read

In [2]:
import rdflib
import pandas as pd
from owlready2 import get_ontology



In [3]:
# Load the project configuration via the local `setting.config_read` helper.
# NOTE(review): '../' is presumably the directory that holds the config file — confirm.
config = config_read('../')

In [4]:
# Location of the OWL file, taken from the [owl] section of the config.
data_path = config['owl']['path']

# Parse the ontology into an in-memory rdflib graph.
g = rdflib.Graph()
g.parse(data_path)

# SPARQL query selecting every distinct (subject, predicate, object) triple.
knows_query = """
SELECT DISTINCT ?x ?y ?z
WHERE {
    ?x ?y ?z.
}"""

# Run the query and convert each term of every result row to a plain string.
qres = g.query(knows_query)
triple_list = [[str(subj), str(pred), str(obj)] for subj, pred, obj in qres]

# Number of triples returned by the query.
len(triple_list)

3825

In [5]:
# Namespace prefix -> namespace URI.
# Entry order is significant: code that scans these namespaces keeps the LAST
# one that matches, so a longer URI must come after any shorter URI it extends
# (e.g. "foaf1" after "foaf").
prefix2uri = {
    "schema": "http://schema.org/",
    "dcterms": "http://purl.org/dc/terms/",
    "foaf": "http://xmlns.com/foaf/0.1/",
    "time": "http://www.w3.org/2006/time#",
    "mo": "http://purl.org/ontology/mo/",
    "skmo": "http://www.sktelecom.com/skmo/",
    "bibo": "http://purl.org/ontology/bibo/",
    "owl": "http://www.w3.org/2002/07/owl#",
    "skpo": "http://www.sktelecom.com/skpo/",
    "dc": "http://purl.org/dc/elements/1.1/",
    "xsd": "http://www.w3.org/2001/XMLSchema#",
    "j.0": "http://www.sktelecom.com/timeUnit#",
    "time-entry": "http://www.w3.org/2006/time-entry#",
    "event": "http://purl.org/NET/c4dm/event.owl#",
    "skos": "http://www.w3.org/2004/02/skos/core#",
    "foaf1": "http://xmlns.com/foaf/0.1/foaf:foaf:",
    "rdfs": "http://www.w3.org/2000/01/rdf-schema#",
    "tl": "http://purl.org/NET/c4dm/timeline.owl#",
    "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#",
    "vs": "http://www.w3.org/2003/06/sw-vocab-status/ns#",
}

# Inverse lookup: namespace URI -> prefix, in the same entry order.
uri2prefix = dict(zip(prefix2uri.values(), prefix2uri.keys()))

In [6]:
def _abbreviate(term, namespaces):
    """Return the (prefixed, local) short forms of an RDF term string.

    Parameters
    ----------
    term : str
        A URI or literal value as a plain string.
    namespaces : dict
        Mapping of namespace URI -> prefix.

    Returns
    -------
    tuple of (str, str)
        ``("prefix:local", "local")`` when ``term`` starts with a known
        namespace URI, otherwise ``(term, term)`` unchanged. When several
        namespaces match, the last one in iteration order wins.
    """
    prefixed, local = term, term
    for uri, prefix in namespaces.items():
        if term.startswith(uri):
            # Strip only the leading namespace. str.replace() would also
            # rewrite any later occurrence of the URI inside the term.
            local = term[len(uri):]
            prefixed = prefix + ':' + local
    return prefixed, local


triple_list2 = []      # triples in "prefix:local" form (list of lists)
triple_text_list = []  # triples reduced to bare local names (list of lists)
for s, p, o in triple_list:
    short_forms = [_abbreviate(term, uri2prefix) for term in (s, p, o)]
    triple_list2.append([prefixed for prefixed, _ in short_forms])
    triple_text_list.append([local for _, local in short_forms])

In [7]:
# Column names for subject, predicate and object.
spo_columns = ['S', 'P', 'O']
triple_df = pd.DataFrame(triple_text_list, columns=spo_columns)

# Report how many rows repeat an earlier (S, P, O) combination.
duplicate_count = int(triple_df.duplicated(spo_columns).sum())
print('겹치는 트리플 개수 : ', duplicate_count)

# Remove the duplicates, renumber the rows, and report the remaining count.
triple_df = triple_df.drop_duplicates(spo_columns).reset_index(drop=True)
print('중복 제거 후 트리플 개수 : ', len(triple_df))

겹치는 트리플 개수 :  159
중복 제거 후 트리플 개수 :  3666


In [8]:
# Load the OWL file at data_path with owlready2 (the same path is parsed with rdflib earlier in the notebook).
onto = get_ontology(data_path).load()

In [9]:
def _local_name(iri, namespaces):
    """Return ``iri`` with a known leading namespace URI removed.

    Parameters
    ----------
    iri : str
        Full IRI of an ontology entity.
    namespaces : iterable of str
        Known namespace URIs (iterating a dict yields its keys).

    Returns
    -------
    str
        The part of ``iri`` after the matching namespace. When several
        namespaces match, the last one in iteration order wins. When none
        match, ``iri`` is returned unchanged, so an unknown namespace can no
        longer raise NameError or reuse the previous entity's name.
    """
    name = iri
    for uri in namespaces:
        if iri.startswith(uri):
            # Strip only the leading namespace, not later occurrences.
            name = iri[len(uri):]
    return name


# Local names of every object property, followed by every data property.
p_list = [_local_name(prop.iri, uri2prefix) for prop in onto.object_properties()]
p_list += [_local_name(prop.iri, uri2prefix) for prop in onto.data_properties()]

# Individual's local name -> its `is_a` value as reported by owlready2.
str2class = {
    _local_name(individual.iri, uri2prefix): individual.is_a
    for individual in onto.individuals()
}

In [10]:
# Keep only the triples whose predicate name appears in p_list.
domain_triple_df = triple_df[triple_df.P.isin(p_list)]

domain_triple_df = domain_triple_df.reset_index(drop=True)
# Look up each subject in str2class; a subject missing from the mapping raises KeyError.
domain_triple_df.loc[:, 'domain'] = domain_triple_df.S.apply(lambda x:str2class[x])
# One row per element of each 'domain' entry. explode() repeats the original
# index label on the new rows, so the index is no longer unique from here on.
domain_triple_df = domain_triple_df.explode(['domain'])

domain_triple_df['range'] = ''
# True where the object is itself a key of str2class.
# NOTE(review): the name suggests every other object is treated as a literal — confirm.
no_literal = domain_triple_df['O'].isin(str2class.keys())

# Objects found in str2class get their str2class entry as 'range'.
domain_triple_df.loc[no_literal, 'range'] = [str2class[i] for i in domain_triple_df[no_literal]['O']]
# All other objects get a placeholder range built from the predicate name.
domain_triple_df.loc[~no_literal, 'range'] = 'Literal_' + domain_triple_df[~no_literal]['P']
# One row per element of each 'range' entry.
range_triple_df = domain_triple_df.explode(['range'])
# Renumber the rows 0..n-1 and cast every column to str.
range_triple_df = range_triple_df.reset_index(drop=True).astype(str)
range_triple_df

Unnamed: 0,S,P,O,domain,range
0,성미현,affiliation,YG_엔터테인먼트,0.1.Person,0.1.Organization
1,Teddy,name,테디,0.1.Person,Literal_name
2,Bigbang_vol.1,hasTrack,눈물뿐인_바보,schema.org.MusicAlbum,mo.Track
3,SQUARE_ONE,hasSinger,블랙핑크,schema.org.MusicAlbum,mo.MusicGroup
4,뱅뱅뱅_황치열,title,뱅뱅뱅,mo.Track,mo.Track
...,...,...,...,...,...
1488,레인,datePublished,2016-02-03,mo.Track,Literal_datePublished
1489,Kissing_You,isArrangedBy,이재명,mo.Track,C:\Users\USER\Desktop\repository\text2sqarql\d...
1490,Kissing_You,isArrangedBy,이재명,mo.Track,0.1.Person
1491,태연,birthday,1989-03-09,C:\Users\USER\Desktop\repository\text2sqarql\d...,Literal_birthday


In [11]:
# Row counts per (domain, P, range) combination and per (domain, range) pair.
# The count lands in column 'S' of each result.
dpr_df = range_triple_df.groupby(['domain', 'P', 'range'], as_index=False)['S'].count()
dr_df = range_triple_df.groupby(['domain', 'range'], as_index=False)['S'].count()

# Attach both counts to every triple row; the suffixes rename the count
# columns to 'S_dpr' and 'S_dr' so they do not clash with the subject column.
merged_triple_df = (
    range_triple_df
    .merge(dpr_df, on=['domain', 'P', 'range'], how='inner', suffixes=('', '_dpr'))
    .merge(dr_df, on=['domain', 'range'], how='inner', suffixes=('', '_dr'))
)

# W = share of the (domain, range) rows that use predicate P.
merged_triple_df['W'] = merged_triple_df['S_dpr'] / merged_triple_df['S_dr']

# One row per distinct (domain, P, range, W), renumbered from 0.
final_triple_df = (
    merged_triple_df[['domain', 'P', 'range', 'W']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [12]:
# Display the (domain, P, range, W) table.
final_triple_df

Unnamed: 0,domain,P,range,W
0,0.1.Person,affiliation,0.1.Organization,1.000000
1,0.1.Person,name,Literal_name,1.000000
2,schema.org.MusicAlbum,hasTrack,mo.Track,0.925926
3,schema.org.MusicAlbum,titleTrack,mo.Track,0.037037
4,schema.org.MusicAlbum,title,mo.Track,0.037037
...,...,...,...,...
129,C:\Users\USER\Desktop\repository\text2sqarql\d...,rank,Literal_rank,1.000000
130,C:\Users\USER\Desktop\repository\text2sqarql\d...,hasGenre,C:\Users\USER\Desktop\repository\text2sqarql\d...,1.000000
131,mo.Track,isArrangedBy,0.1.Organization,0.500000
132,mo.Track,isComposedBy,0.1.Organization,0.500000


In [13]:
# Write the table to ../unit_path.csv, omitting the index column.
final_triple_df.to_csv('../unit_path.csv', index=False)