In [7]:
import sys
sys.path.append('../')
from setting import config_read

In [8]:
import rdflib
import pandas as pd
from owlready2 import get_ontology

In [9]:
# Read the project configuration and pull out the location of the OWL file.
config = config_read('../')
data_path = config['owl']['path']

# The same file is loaded twice: once through owlready2 and once into an
# rdflib graph that is queried with SPARQL below.
onto = get_ontology(data_path).load()

g = rdflib.Graph()
g.parse(data_path)

# Pattern with three unbound variables: matches every triple in the graph.
knows_query = """
SELECT DISTINCT ?x ?y ?z
WHERE {
    ?x ?y ?z.
}"""

# Each result row is flattened to a [subject, predicate, object] list of strings.
qres = g.query(knows_query)
triple_list = [[str(subj), str(pred), str(obj)] for subj, pred, obj in qres]
len(triple_list)

3825

In [10]:
# Cache of IRI string -> resolved value, rebuilt every time this cell runs.
# The same IRIs (predicates and classes in particular) recur across many
# triples, so each one is looked up in the ontology only once.
_entity_cache = {}


def _resolve_entity(iri):
    """Return the ontology entity whose IRI is ``iri``, or ``iri`` itself.

    ``onto.search_one(iri=...)`` is asked for the entity; when it returns
    ``None`` (nothing found) the original string is returned unchanged so
    the value is still kept in the triple.

    NOTE(review): owlready2's search treats ``*`` as a wildcard, so a literal
    value containing ``*`` would be pattern-matched rather than compared
    exactly -- confirm this cannot occur in the data.
    """
    if iri not in _entity_cache:
        found = onto.search_one(iri=iri)
        # Identity check: ``None`` is a singleton, and ``is`` cannot be
        # affected by a custom ``__eq__`` on the returned object.
        _entity_cache[iri] = iri if found is None else found
    return _entity_cache[iri]


# Same [S, P, O] layout as triple_list, with every resolvable IRI replaced
# by its owlready2 entity.
triple_text_list = [
    [_resolve_entity(s), _resolve_entity(p), _resolve_entity(o)]
    for s, p, o in triple_list
]

In [11]:
spo_cols = ['S', 'P', 'O']
triple_df = pd.DataFrame(triple_text_list, columns=spo_cols)

# Report how many rows repeat an earlier (S, P, O) combination.
dup_count = int(triple_df.duplicated(subset=spo_cols).sum())
print('겹치는 트리플 개수 : ', dup_count)

# Remove the duplicates and renumber the rows from zero.
triple_df = triple_df.drop_duplicates(subset=spo_cols).reset_index(drop=True)
print('중복 제거 후 트리플 개수 : ', len(triple_df))

겹치는 트리플 개수 :  157
중복 제거 후 트리플 개수 :  3668


In [12]:
# All object properties followed by all data properties of the ontology.
p_list = [*onto.object_properties(), *onto.data_properties()]

# Keep only the triples whose predicate is one of those properties.
is_op_or_dp = triple_df['P'].isin(p_list)
only_op_dp_df = triple_df.loc[is_op_or_dp].reset_index(drop=True)
only_op_dp_df

Unnamed: 0,S,P,O
0,SKMO_v2.4_0223.Rush,dc.title,러쉬
1,SKMO_v2.4_0223.20080410_Mnet_엠카운트다운,SKMO_v2.4_0223.rankBy,SKMO_v2.4_0223.엠카운트다운
2,SKMO_v2.4_0223.승리,foaf.birthday,1990-12-12
3,SKMO_v2.4_0223.This_Love_G_dragon_Solo,SKMO_v2.4_0223.isArrangedBy,SKMO_v2.4_0223.지드래곤
4,SKMO_v2.4_0223.나_변했나봐,dc.title,나 변했나봐
...,...,...,...
1000,SKMO_v2.4_0223.AOA,schema.affiliation,SKMO_v2.4_0223.FNC엔터테인먼트
1001,SKMO_v2.4_0223.스위티,schema.affiliation,SKMO_v2.4_0223.YG_엔터테인먼트
1002,SKMO_v2.4_0223.YG_엔터테인먼트,foaf.name,YG
1003,SKMO_v2.4_0223.SQUARE_ONE,SKMO_v2.4_0223.hasTrack,SKMO_v2.4_0223.휘파람


In [13]:
def literal(x):
    """Return the range label for a triple's object value.

    Parameters
    ----------
    x : str or object exposing ``is_a``
        The object of a triple.

    Returns
    -------
    The marker string ``'Literal_'`` when ``x`` is a string (including any
    ``str`` subclass), otherwise ``x.is_a`` unchanged.

    Raises
    ------
    AttributeError
        If ``x`` is not a string and has no ``is_a`` attribute.
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses are also treated as literals instead of failing on .is_a.
    if isinstance(x, str):
        return 'Literal_'
    return x.is_a

In [14]:
# Rebuild from the bare S/P/O triples so that re-running this cell does not
# explode rows that a previous run already exploded (which would multiply
# rows and distort the counts computed from class_map_df). The filtered
# triples were de-duplicated on S/P/O earlier, so on a first run
# drop_duplicates changes nothing.
base_df = only_op_dp_df[['S', 'P', 'O']].drop_duplicates()

# domain: the subject's `is_a` list; range: the object's `is_a` list, or the
# 'Literal_' marker for string objects. Exploding both columns yields one
# row per (domain entry, range entry) pair of each triple.
only_op_dp_df = (
    base_df
    .assign(
        domain=base_df['S'].apply(lambda x: x.is_a),
        range=base_df['O'].apply(literal),
    )
    .explode('domain')
    .explode('range')
)

# String view of every column; literal ranges are suffixed with the
# property name, e.g. 'Literal_dc.title'.
class_map_df = only_op_dp_df.astype(str)
cond = class_map_df['range'] == 'Literal_'
class_map_df.loc[cond, 'range'] += class_map_df.loc[cond, 'P']

class_map_df

Unnamed: 0,S,P,O,domain,range
0,SKMO_v2.4_0223.Rush,dc.title,러쉬,mo.Track,Literal_dc.title
1,SKMO_v2.4_0223.20080410_Mnet_엠카운트다운,SKMO_v2.4_0223.rankBy,SKMO_v2.4_0223.엠카운트다운,SKMO_v2.4_0223.Ranking,SKMO_v2.4_0223.Chart
2,SKMO_v2.4_0223.승리,foaf.birthday,1990-12-12,SKMO_v2.4_0223.MusicArtist,Literal_foaf.birthday
2,SKMO_v2.4_0223.승리,foaf.birthday,1990-12-12,foaf.Person,Literal_foaf.birthday
3,SKMO_v2.4_0223.This_Love_G_dragon_Solo,SKMO_v2.4_0223.isArrangedBy,SKMO_v2.4_0223.지드래곤,mo.Track,SKMO_v2.4_0223.MusicArtist
...,...,...,...,...,...
1001,SKMO_v2.4_0223.스위티,schema.affiliation,SKMO_v2.4_0223.YG_엔터테인먼트,mo.MusicGroup,foaf.Organization
1002,SKMO_v2.4_0223.YG_엔터테인먼트,foaf.name,YG,foaf.Organization,Literal_foaf.name
1003,SKMO_v2.4_0223.SQUARE_ONE,SKMO_v2.4_0223.hasTrack,SKMO_v2.4_0223.휘파람,schema.MusicAlbum,mo.Track
1004,SKMO_v2.4_0223.La_La_La,SKMO_v2.4_0223.isSungBy,SKMO_v2.4_0223.빅뱅,mo.Track,mo.MusicGroup


In [15]:
# Row count per (domain, P, range): how many rows use property P between
# that domain and that range.
dpr_df = class_map_df.groupby(['domain', 'P', 'range'], as_index=False)['S'].count()

# Row count per (domain, range), over all properties.
dr_df = class_map_df.groupby(['domain', 'range'], as_index=False)['S'].count()

# Attach both counts to every row (as S_dpr and S_dr), then compute the
# weight W as the share of a (domain, range) pair's rows that use P.
merged_df = (
    class_map_df
    .merge(dpr_df, on=['domain', 'P', 'range'], how='inner', suffixes=('', '_dpr'))
    .merge(dr_df, on=['domain', 'range'], how='inner', suffixes=('', '_dr'))
    .assign(W=lambda frame: frame['S_dpr'] / frame['S_dr'])
)

# One row per distinct (domain, P, range, W).
final_triple_df = (
    merged_df.loc[:, ['domain', 'P', 'range', 'W']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [16]:
# Display the distinct (domain, P, range, W) rows.
final_triple_df

Unnamed: 0,domain,P,range,W
0,mo.Track,dc.title,Literal_dc.title,1.000000
1,SKMO_v2.4_0223.Ranking,SKMO_v2.4_0223.rankBy,SKMO_v2.4_0223.Chart,1.000000
2,SKMO_v2.4_0223.MusicArtist,foaf.birthday,Literal_foaf.birthday,1.000000
3,foaf.Person,foaf.birthday,Literal_foaf.birthday,1.000000
4,mo.Track,SKMO_v2.4_0223.isArrangedBy,SKMO_v2.4_0223.MusicArtist,0.153226
...,...,...,...,...
107,foaf.Person,SKMO_v2.4_0223.isComposedBy,foaf.Person,0.500000
108,foaf.Person,SKMO_v2.4_0223.isArrangedBy,foaf.Person,0.500000
109,schema.MusicAlbum,SKMO_v2.4_0223.playTime,Literal_SKMO_v2.4_0223.playTime,1.000000
110,mo.MusicGroup,SKMO_v2.4_0223.debutTrack,mo.Track,1.000000


In [17]:
# Write both result tables to the parent directory, omitting the index column.
for frame, csv_path in (
    (merged_df, '../unit_path_for_mapping.csv'),
    (final_triple_df, '../unit_path.csv'),
):
    frame.to_csv(csv_path, index=False)