In [1]:
import sys
sys.path.append('../')
from setting import config_read

In [2]:
import rdflib
import pandas as pd
from owlready2 import get_ontology



In [3]:
# Read the project configuration and pull the OWL file location out of it.
config = config_read('../')
data_path = config['owl']['path']

# The same file is loaded twice: once through owlready2 (entity lookups later
# in the notebook) and once through rdflib (raw triple enumeration below).
onto = get_ontology(data_path).load()

g = rdflib.Graph()
g.parse(data_path)

# SPARQL query that selects every (subject, predicate, object) in the graph.
knows_query = """
SELECT DISTINCT ?x ?y ?z
WHERE {
    ?x ?y ?z.
}"""

# Keep each result row as a list of three plain strings.
qres = g.query(knows_query)
triple_list = [[str(subj), str(pred), str(obj)] for subj, pred, obj in qres]
len(triple_list)

3825

In [4]:
# Memo of IRI string -> resolved value, so each distinct term is looked up in
# the ontology only once even though it appears in many triples.
_resolved_terms = {}


def _resolve_term(term):
    """Return the ontology entity whose IRI is `term`, or `term` itself if none is found.

    `term` is one of the strings stored in `triple_list`. When
    `onto.search_one(iri=term)` yields nothing (it returns None), the original
    string is kept unchanged.
    """
    if term not in _resolved_terms:
        match = onto.search_one(iri=term)
        # Identity check against None (PEP 8) instead of `== None`.
        _resolved_terms[term] = term if match is None else match
    return _resolved_terms[term]


# NOTE(review): literal object values are also passed to search_one(iri=...);
# presumably none of them collide with an IRI pattern — confirm.
triple_text_list = [
    [_resolve_term(s), _resolve_term(p), _resolve_term(o)]
    for s, p, o in triple_list
]

In [5]:
spo_columns = ['S', 'P', 'O']

# One row per resolved triple.
triple_df = pd.DataFrame(triple_text_list, columns=spo_columns)

# Report how many rows repeat an earlier (S, P, O) combination.
duplicate_count = triple_df.duplicated(spo_columns).sum()
print('겹치는 트리플 개수 : ', duplicate_count)

# Remove duplicates and renumber the index from zero.
triple_df = triple_df.drop_duplicates(spo_columns).reset_index(drop=True)
print('중복 제거 후 트리플 개수 : ', len(triple_df))

겹치는 트리플 개수 :  157
중복 제거 후 트리플 개수 :  3668


In [6]:
# Collect every property the ontology declares: object properties first,
# followed by data properties.
p_list = []
p_list.extend(onto.object_properties())
p_list.extend(onto.data_properties())

In [7]:
def literal(x):
    """Return the range label for the object of a triple.

    Args:
        x: Either a string (an object value that was not resolved to an
            ontology entity) or an object exposing an ``is_a`` sequence.

    Returns:
        The placeholder ``'Literal_'`` when ``x`` is a string; otherwise the
        first element of ``x.is_a``.

    Raises:
        IndexError: If ``x.is_a`` is empty.
        AttributeError: If ``x`` is neither a string nor has ``is_a``.
    """
    # isinstance (not an exact type comparison) so that str subclasses are
    # also treated as literals instead of failing on the `.is_a` access.
    if isinstance(x, str):
        return 'Literal_'
    return x.is_a[0]

In [8]:
# Keep only the triples whose predicate is one of the collected properties.
uses_declared_property = triple_df['P'].isin(p_list)
only_op_dp_df = triple_df.loc[uses_declared_property].reset_index(drop=True)
only_op_dp_df

Unnamed: 0,S,P,O
0,data.나_변했나봐,1.1.title,나 변했나봐
1,data.태연,schema.org.affiliation,data.SM_엔터테인먼트
2,data.유빈,data.realName,김유빈
3,data.이승철,data.hasMusicActivity,data.singer
4,data.너는_나의_봄이다,data.isSungBy,data.성시경
...,...,...,...
1000,data.소녀시대,data.debutNation,data.한국
1001,data.전인권,0.1.name,전인권
1002,data.Complete,data.isSungBy,data.소녀시대
1003,data.Bigbang_vol.1,data.label_of_album,data.YG_엔터테인먼트


In [9]:
# Domain: the first entry of the subject's is_a list.
only_op_dp_df['domain'] = only_op_dp_df['S'].apply(lambda subject: subject.is_a[0])
# Range: the first is_a entry of the object, or 'Literal_' for plain strings.
only_op_dp_df['range'] = only_op_dp_df['O'].apply(literal)

# String-typed copy, so every column can be compared and concatenated as text.
class_map_df = only_op_dp_df.astype(str)

# Make literal ranges property-specific by appending the property name.
cond = class_map_df['range'].eq('Literal_')
class_map_df.loc[cond, 'range'] = 'Literal_' + class_map_df.loc[cond, 'P']

class_map_df

Unnamed: 0,S,P,O,domain,range
0,data.나_변했나봐,1.1.title,나 변했나봐,mo.Track,Literal_1.1.title
1,data.태연,schema.org.affiliation,data.SM_엔터테인먼트,data.MusicArtist,0.1.Organization
2,data.유빈,data.realName,김유빈,0.1.Person,Literal_data.realName
3,data.이승철,data.hasMusicActivity,data.singer,data.MusicArtist,data.MusicActivity
4,data.너는_나의_봄이다,data.isSungBy,data.성시경,mo.Track,data.MusicArtist
...,...,...,...,...,...
1000,data.소녀시대,data.debutNation,data.한국,mo.MusicGroup,schema.org.Country
1001,data.전인권,0.1.name,전인권,0.1.Person,Literal_0.1.name
1002,data.Complete,data.isSungBy,data.소녀시대,mo.Track,mo.MusicGroup
1003,data.Bigbang_vol.1,data.label_of_album,data.YG_엔터테인먼트,schema.org.MusicAlbum,0.1.Organization


In [10]:
# Number of instance-level triples for each (domain, property, range) unit path.
dpr_df = class_map_df.groupby(['domain', 'P', 'range'], as_index=False)['S'].count()

# Total number of instance-level triples from each domain class to each range class.
dr_df = class_map_df.groupby(['domain', 'range'], as_index=False)['S'].count()

# Attach both counts to every row; they arrive as columns S_dpr and S_dr.
merged_df = (
    class_map_df
    .merge(dpr_df, on=['domain', 'P', 'range'], how='inner', suffixes=('', '_dpr'))
    .merge(dr_df, on=['domain', 'range'], how='inner', suffixes=('', '_dr'))
)

# Weight: share of the (domain, range) triples that use this property.
merged_df['W'] = merged_df['S_dpr'].div(merged_df['S_dr'])

# One row per distinct class-level unit path with its weight.
final_triple_df = (
    merged_df[['domain', 'P', 'range', 'W']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [11]:
# Display the de-duplicated (domain, P, range, W) table as the cell output.
final_triple_df

Unnamed: 0,domain,P,range,W
0,mo.Track,1.1.title,Literal_1.1.title,1.000000
1,data.MusicArtist,schema.org.affiliation,0.1.Organization,1.000000
2,0.1.Person,data.realName,Literal_data.realName,1.000000
3,data.MusicArtist,data.hasMusicActivity,data.MusicActivity,1.000000
4,mo.Track,data.isSungBy,data.MusicArtist,0.247525
...,...,...,...,...
91,0.1.Person,data.isArrangedBy,0.1.Person,0.500000
92,0.1.Person,data.isComposedBy,0.1.Person,0.500000
93,mo.MusicGroup,data.hasLeader,data.MusicArtist,0.750000
94,mo.MusicGroup,0.1.member,data.MusicArtist,0.250000


In [13]:
# Write both result tables one directory up, without the index column:
# the per-triple table (with counts and weight) and the unit-path table.
merged_df.to_csv(path_or_buf='../unit_path_for_mapping.csv', index=False)
final_triple_df.to_csv(path_or_buf='../unit_path.csv', index=False)