In [1]:
import sys
sys.path.append('../')
from setting import config_read

In [2]:
import rdflib
import pandas as pd
from owlready2 import get_ontology





In [3]:
# Read the project configuration and pick up the OWL file location from it.
config = config_read('../')
data_path = config['owl']['path']

# The same file is loaded twice: into owlready2 (`onto`) and into an rdflib
# graph (`g`) that can be queried with SPARQL.
onto = get_ontology(data_path).load()
g = rdflib.Graph()
g.parse(data_path)

# Select every distinct (subject, predicate, object) triple in the graph.
knows_query = """
SELECT DISTINCT ?x ?y ?z
WHERE {
    ?x ?y ?z.
}"""

# Stringify each term so the triples are stored as plain text.
qres = g.query(knows_query)
triple_list = [[str(x), str(y), str(z)] for x, y, z in qres]
len(triple_list)

3825

In [4]:
# Cache of term string -> resolved value, rebuilt every time this cell runs so
# it never outlives a reloaded ontology.
_entity_cache = {}


def _resolve_term(term):
    """Return the ontology entity whose IRI is `term`, or `term` itself.

    `onto.search_one(iri=term)` is asked for a matching entity; when it
    returns None (no match) the original string is kept unchanged.
    Results are memoised in `_entity_cache` because the same term appears
    in many triples.

    NOTE(review): owlready2's search may treat `*` inside `term` as a
    wildcard -- confirm that literal values containing `*` cannot match an
    entity by accident.
    """
    if term not in _entity_cache:
        entity = onto.search_one(iri=term)
        _entity_cache[term] = term if entity is None else entity
    return _entity_cache[term]


# Resolve subject, predicate and object of every triple with the same rule.
triple_text_list = [
    [_resolve_term(s), _resolve_term(p), _resolve_term(o)]
    for s, p, o in triple_list
]

In [5]:
# Tabulate the resolved triples as subject / predicate / object columns.
key_cols = ['S', 'P', 'O']
triple_df = pd.DataFrame(triple_text_list, columns=key_cols)

# Flag every row that repeats an earlier (S, P, O) combination.
is_dup = triple_df.duplicated(key_cols)
print('겹치는 트리플 개수 : ', is_dup.sum())

# Remove duplicates: keep only the first occurrence of each triple.
triple_df = triple_df.loc[~is_dup].reset_index(drop=True)
print('중복 제거 후 트리플 개수 : ', len(triple_df))

겹치는 트리플 개수 :  157
중복 제거 후 트리플 개수 :  3668


In [6]:
# Gather everything returned by onto.object_properties() and
# onto.data_properties() into a single list of predicates.
p_list = [*onto.object_properties(), *onto.data_properties()]

# Keep only the triples whose predicate is one of those properties.
is_op_or_dp = triple_df['P'].isin(p_list)
only_op_dp_df = triple_df.loc[is_op_or_dp].reset_index(drop=True)
only_op_dp_df

Unnamed: 0,S,P,O
0,SKMO_v2.4_0223.7989_(강타&태연),SKMO_v2.4_0223.hasGenre,SKMO_v2.4_0223.teen_pop
1,SKMO_v2.4_0223.7989_(강타&태연),SKMO_v2.4_0223.isComposedBy,SKMO_v2.4_0223.송재준
2,SKMO_v2.4_0223.여자친구,dc.title,여자친구
3,SKMO_v2.4_0223.SQUARE_TWO,SKMO_v2.4_0223.hasTrack,SKMO_v2.4_0223.불장난
4,SKMO_v2.4_0223.휘파람,SKMO_v2.4_0223.isSungBy,SKMO_v2.4_0223.블랙핑크
...,...,...,...
1000,SKMO_v2.4_0223.로제,SKMO_v2.4_0223.debutDate,2016-08-14
1001,SKMO_v2.4_0223.Ooh_La-La!,SKMO_v2.4_0223.playTime,234
1002,SKMO_v2.4_0223.선미,SKMO_v2.4_0223.realName,이선미
1003,SKMO_v2.4_0223.신혁,SKMO_v2.4_0223.hasMusicActivity,SKMO_v2.4_0223.composer


In [7]:
def literal(x):
    """Return the `is_a` value of an entity, or a marker for plain strings.

    Parameters
    ----------
    x : object
        Either a string (treated as a literal value) or an object that
        exposes an `is_a` attribute.

    Returns
    -------
    The string 'Literal_' when `x` is a str (str subclasses included),
    otherwise `x.is_a`.

    Raises
    ------
    AttributeError
        If `x` is not a string and has no `is_a` attribute.
    """
    # isinstance (rather than an exact type comparison) also covers str
    # subclasses, which have no `is_a` and would otherwise raise.
    if isinstance(x, str):
        return 'Literal_'
    return x.is_a

In [8]:
# Start from the unique S/P/O rows only. On a first run this is a no-op copy;
# on a re-run it undoes the previous explode, so rows are never exploded twice
# (which would inflate the counts computed from class_map_df).
unit_base_df = (
    only_op_dp_df[['S', 'P', 'O']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# domain: `is_a` of the subject. Every subject must expose `is_a`; a plain
# string here raises AttributeError.
unit_base_df['domain'] = unit_base_df['S'].apply(lambda x: x.is_a)
# range: `is_a` of the object, or the marker 'Literal_' for string objects.
unit_base_df['range'] = unit_base_df['O'].apply(literal)

# One row per (domain, range) combination; exploded rows keep the index
# label of the row they came from.
only_op_dp_df = unit_base_df.explode('domain').explode('range')

# Work on string representations from here on, and make each literal range
# specific to its property: 'Literal_' + P.
class_map_df = only_op_dp_df.astype(str)
cond = class_map_df['range'] == 'Literal_'
class_map_df.loc[cond, 'range'] += class_map_df.loc[cond, 'P']

class_map_df

Unnamed: 0,S,P,O,domain,range
0,SKMO_v2.4_0223.7989_(강타&태연),SKMO_v2.4_0223.hasGenre,SKMO_v2.4_0223.teen_pop,mo.Track,SKMO_v2.4_0223.Genre
1,SKMO_v2.4_0223.7989_(강타&태연),SKMO_v2.4_0223.isComposedBy,SKMO_v2.4_0223.송재준,mo.Track,SKMO_v2.4_0223.MusicArtist
1,SKMO_v2.4_0223.7989_(강타&태연),SKMO_v2.4_0223.isComposedBy,SKMO_v2.4_0223.송재준,mo.Track,foaf.Person
2,SKMO_v2.4_0223.여자친구,dc.title,여자친구,mo.Track,Literal_dc.title
3,SKMO_v2.4_0223.SQUARE_TWO,SKMO_v2.4_0223.hasTrack,SKMO_v2.4_0223.불장난,schema.MusicAlbum,mo.Track
...,...,...,...,...,...
1001,SKMO_v2.4_0223.Ooh_La-La!,SKMO_v2.4_0223.playTime,234,mo.Track,Literal_SKMO_v2.4_0223.playTime
1002,SKMO_v2.4_0223.선미,SKMO_v2.4_0223.realName,이선미,foaf.Person,Literal_SKMO_v2.4_0223.realName
1003,SKMO_v2.4_0223.신혁,SKMO_v2.4_0223.hasMusicActivity,SKMO_v2.4_0223.composer,SKMO_v2.4_0223.MusicArtist,SKMO_v2.4_0223.MusicActivity
1003,SKMO_v2.4_0223.신혁,SKMO_v2.4_0223.hasMusicActivity,SKMO_v2.4_0223.composer,foaf.Person,SKMO_v2.4_0223.MusicActivity


In [9]:
# the number of instance-level triples containing the property of the unit path
dpr_df = class_map_df.groupby(['domain', 'P', 'range'], as_index=False).S.count()

# the total number of triples from domain class to range class
dr_df = class_map_df.groupby(['domain', 'range'], as_index=False).S.count()

merged_df = pd.merge(class_map_df, dpr_df, on=['domain', 'P', 'range'], how='inner', suffixes=('','_dpr'))
merged_df = pd.merge(merged_df, dr_df, on=['domain', 'range'], how='inner', suffixes=('','_dr'))
merged_df['W'] = 1 - (merged_df['S_dpr'] / merged_df['S_dr'])

final_triple_df = merged_df[['domain', 'P', 'range', 'W']]
merged_df = merged_df[['S', 'P', 'O', 'domain', 'range']]
final_triple_df = final_triple_df.drop_duplicates().reset_index(drop=True)

In [10]:
# Display the de-duplicated unit paths with columns domain, P, range and W.
final_triple_df

Unnamed: 0,domain,P,range,W
0,mo.Track,SKMO_v2.4_0223.hasGenre,SKMO_v2.4_0223.Genre,0.000000
1,mo.Track,SKMO_v2.4_0223.isComposedBy,SKMO_v2.4_0223.MusicArtist,0.838710
2,mo.Track,SKMO_v2.4_0223.isSungBy,SKMO_v2.4_0223.MusicArtist,0.612903
3,mo.Track,SKMO_v2.4_0223.isArrangedBy,SKMO_v2.4_0223.MusicArtist,0.846774
4,mo.Track,SKMO_v2.4_0223.isWrittenBy,SKMO_v2.4_0223.MusicArtist,0.741935
...,...,...,...,...
107,SKMO_v2.4_0223.Instrumental,SKMO_v2.4_0223.isSungBy,SKMO_v2.4_0223.MusicArtist,0.000000
108,SKMO_v2.4_0223.MusicArtist,foaf.nick,Literal_foaf.nick,0.000000
109,mo.Track,SKMO_v2.4_0223.lyrics,Literal_SKMO_v2.4_0223.lyrics,0.000000
110,SKMO_v2.4_0223.MusicArtist,SKMO_v2.4_0223.hasPosition,SKMO_v2.4_0223.GroupPosition,0.000000


In [11]:
# Instance-level table (S, P, O, domain, range), written without the index.
merged_df.to_csv(path_or_buf='../unit_path_for_mapping.csv', index=False)

# Class-level unit paths with their weight W, written without the index.
final_triple_df.to_csv(path_or_buf='../unit_path.csv', index=False)