In [224]:
import requests 
import os
import pickle
import time
import pandas as pd
import scrapy
from scrapy.http import TextResponse

In [3]:
def pickle_out(name, obj, default_path='./sub_result/'):
    """Serialize ``obj`` with pickle into ``default_path + name + '.pickle'``.

    Parameters
    ----------
    name : str
        File stem; the ``.pickle`` suffix is appended automatically.
    obj : object
        Any picklable object.
    default_path : str
        Prefix that is concatenated in front of ``name`` (normally a
        directory path ending with a separator).

    Returns
    -------
    None
    """
    target = default_path + name + '.pickle'
    # Create the output directory on first use instead of failing with
    # FileNotFoundError on a fresh checkout.
    parent_dir = os.path.dirname(target)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    # The context manager closes the handle even when pickle.dump raises.
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle)
    return

def pickle_in(name, default_path='./sub_result/'):
    """Load and return the object stored in ``default_path + name + '.pickle'``.

    Parameters
    ----------
    name : str
        File stem; the ``.pickle`` suffix is appended automatically.
    default_path : str
        Prefix that is concatenated in front of ``name`` (normally a
        directory path ending with a separator).

    Returns
    -------
    object
        Whatever object was pickled into the file.

    Raises
    ------
    FileNotFoundError
        If the pickle file does not exist.
    """
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # were produced by this project.
    # The context manager closes the handle even when pickle.load raises.
    with open(default_path + name + '.pickle', 'rb') as handle:
        return pickle.load(handle)

In [313]:
# Reload the pickled intermediate results (pickle_in reads from its default
# './sub_result/' location). All four objects are looked up by the same key.
babelnet_dict = pickle_in(name='babelnet_dict')        # values expose .json()
merged_text_dict = pickle_in(name='merged_text_dict')  # values are sliced by character offset
desc_dict = pickle_in(name='desc_dict')
tags_dict = pickle_in(name='tags_dict')

In [314]:
# Build one record per annotation returned for each key of babelnet_dict:
#   ((key, relation, word), (DBpediaURL, BabelNetURL))
# `relation` is 'hasKeyword' when the annotated fragment ends at or before the
# last description character inside the merged text, and 'hasTag' otherwise.
# Keys without an entry in tags_dict only ever yield 'hasKeyword'.
# NOTE(review): this presumes the merged text is the description followed by
# the tags -- confirm against the cell that built merged_text_dict.
triples_list = []
for key, response in babelnet_dict.items():
    # Index (inclusive) of the last description character in the merged text;
    # None means "no tags for this key, everything is a keyword".
    desc_end_idx = None
    if key in tags_dict:
        desc_tail = desc_dict[key][-10:]
        tail_start = merged_text_dict[key].find(desc_tail)
        if tail_start == -1:
            # The description tail is not present in the merged text, so no
            # fragment can be attributed to the description.
            desc_end_idx = -1
        else:
            # Use the real tail length: descriptions shorter than 10
            # characters made the former hard-coded "+ 9" overshoot.
            desc_end_idx = tail_start + len(desc_tail) - 1

    for item in response.json():
        char_start = item['charFragment']['start']
        char_end = item['charFragment']['end']
        # 'end' is inclusive, hence the + 1 in the slice.
        word = merged_text_dict[key][char_start:char_end + 1]
        if desc_end_idx is None or char_end <= desc_end_idx:
            keyword_or_tag_relation = 'hasKeyword'
        else:
            keyword_or_tag_relation = 'hasTag'
        triples_list.append(((key, keyword_or_tag_relation, word),
                             (item['DBpediaURL'], item['BabelNetURL'])))

In [315]:
# Persist the collected records as 'triples_list.pickle'.
pickle_out(name='triples_list', obj=triples_list)

In [316]:
# Display how many records triples_list holds.
len(triples_list)

976295

In [317]:
# Distinct (word, (DBpediaURL, BabelNetURL)) pairs taken from triples_list.
# Despite its name, word_set ends up as a list of the unique pairs.
word_set = list({(record[0][2], record[1]) for record in triples_list})

In [321]:
# Persist the de-duplicated pairs as 'word_set.pickle'.
pickle_out(name='word_set', obj=word_set)

In [322]:
# Display how many distinct (word, URL pair) entries remain.
len(word_set)

84590

In [361]:
# Write one tab-separated triple "<word> relation <DBpedia entity>" per entry
# of word_set that carries a DBpedia URL. The entity is the last path segment
# of the URL.
p = 'relation'
# Explicit UTF-8: the file is read back with pandas.read_csv, which decodes
# as UTF-8 by default regardless of the platform locale.
with open('triple_files/dbpedia_relation.txt', 'w', encoding='utf-8') as f:
    for item in word_set:
        word = item[0]
        dbpedia_url = item[1][0]
        # Truthiness covers both '' and None (len(None) would raise).
        if not dbpedia_url:
            continue
        dbpedia_entity = dbpedia_url.rsplit('/', 1)[-1]
        # A URL ending in '/' has no entity name; skip it rather than emit a
        # triple with an empty object.
        if not dbpedia_entity:
            continue
        f.write(word + '\t' + p + '\t' + dbpedia_entity + '\n')

# Synset

In [3]:
from nltk.corpus import wordnet as wn

In [None]:
"""
Create a Lemma from a "<word>.<pos>.<number>.<lemma>" string where:
<word> is the morphological stem identifying the synset
<pos> is one of the module attributes ADJ, ADJ_SAT, ADV, NOUN or VERB
<number> is the sense number, counting from 0.
<lemma> is the morphological form of interest

Note that <word> and <lemma> can be different, e.g. the Synset
'salt.n.03' has the Lemmas 'salt.n.03.salt', 'salt.n.03.saltiness' and
'salt.n.03.salinity'.
"""

In [358]:
# Collect WordNet triples for every word in word_set, in three layers:
#   (word, 'synset_member', synset)           for each synset of the word
#   (synset, 'hypernym', hypernym)            for each direct hypernym
#   (hypernym, 'hypernym', hypernym_of_that)  one level further up
synset_relation = 'synset_member'
hyperonym_relation = 'hypernym'
wordnet_triples_list = []
for entry in word_set:
    surface_form = entry[0]
    for synset in wn.synsets(surface_form):
        synset_name = synset.name()
        # layer 1: word -> synset
        wordnet_triples_list.append((surface_form, synset_relation, synset_name))

        for parent in synset.hypernyms():
            parent_name = parent.name()
            # layer 2: synset -> direct hypernym
            wordnet_triples_list.append((synset_name, hyperonym_relation, parent_name))

            # layer 3: direct hypernym -> its own hypernyms
            wordnet_triples_list.extend(
                (parent_name, hyperonym_relation, grandparent.name())
                for grandparent in parent.hypernyms()
            )

In [422]:
# Dump the WordNet triples as tab-separated lines, one triple per line.
with open('./triple_files/wordnet_relation.txt', 'w') as out:
    out.writelines(
        '\t'.join((subj, pred, obj)) + '\n'
        for subj, pred, obj in wordnet_triples_list
    )

In [433]:
# Show the first WordNet triple as a quick sanity check.
wordnet_triples_list[0]

('aisle', 'synset_member', 'aisle.n.01')

# Integrating the triples

In [363]:
# Directory holding the KGRec dataset files. It can be overridden through the
# KGREC_DATASET_DIR environment variable so the notebook is not tied to one
# machine; the default keeps the original location.
lastfm_dataset = os.path.join(
    os.environ.get(
        'KGREC_DATASET_DIR',
        '/Users/ian/Documents/GitHub/dsci558/project/dataset/KGRec-dataset/',
    ),
    '',  # guarantees a trailing separator: the file name is appended by plain concatenation
)
user_interaction_file = 'implicit_lf_dataset.csv'

In [393]:
# Convert the tab-separated user/item interaction file into triples of the
# form "user_<col0> listened song_<col1>", one per input line.
with open(lastfm_dataset + user_interaction_file, 'r') as f:
    # Explicit UTF-8: the output is read back with pandas.read_csv, which
    # decodes as UTF-8 by default.
    with open('./triple_files/interaction.txt', 'w', encoding='utf-8') as wf:
        # Stream the input instead of materialising it with readlines().
        for line in f:
            # Strip the line terminator first; otherwise a two-column line
            # would carry '\n' into the song id and emit a blank line.
            items = line.rstrip('\r\n').split('\t')
            if len(items) < 2:
                # Blank or malformed line: nothing to emit.
                continue
            user = 'user_' + items[0]
            relation = 'listened'
            song = 'song_' + items[1]
            wf.write(user + '\t' + relation + '\t' + song + '\n')

In [380]:
# Write "song_<key> <relation> <word>" for every record in triples_list;
# the URL pair stored alongside each triple is not used here.
with open('./triple_files/tag_keyword_relation.txt', 'w') as out:
    for record in triples_list:
        key, relation, obj = record[0][0], record[0][1], record[0][2]
        out.write('\t'.join(('song_' + str(key), relation, obj)) + '\n')

In [412]:
# Read the JSON-lines file (one JSON object per line) into a DataFrame.
df_dbpedia = pd.read_json('dbpedia_sub_broader.jl', lines=True)

In [413]:
# Re-emit each row of df_dbpedia as a tab-separated line. Every row is
# unpacked into exactly three values, so the frame must have three columns.
with open('./triple_files/dbpedia_sub_broader.txt', 'w') as out:
    out.writelines(
        '\t'.join((subj, pred, obj)) + '\n'
        for subj, pred, obj in df_dbpedia.values
    )

In [423]:
def _read_triples(path):
    """Read a headerless, tab-separated triple file into an ``s``/``p``/``o`` DataFrame.

    The triple files are written as raw text, so the reader is configured to
    take every field literally:

    * ``dtype=str`` -- no numeric inference, values stay strings.
    * ``keep_default_na=False, na_values=['']`` -- only an empty field is
      treated as missing. With the pandas defaults, tokens such as ``NA``,
      ``null``, ``nan`` or ``None`` would be turned into NaN and later dropped.
    * ``quoting=csv.QUOTE_NONE`` -- the files are written without quoting, so a
      ``"`` inside a field is ordinary data.
    """
    import csv  # local import keeps this cell self-contained

    return pd.read_csv(
        path,
        names=['s', 'p', 'o'],
        sep='\t',
        dtype=str,
        keep_default_na=False,
        na_values=[''],
        quoting=csv.QUOTE_NONE,
    )


df_dbpedia_relation = _read_triples('triple_files/dbpedia_relation.txt')

df_dbpedia_sub_broader = _read_triples('triple_files/dbpedia_sub_broader.txt')

df_interaction = _read_triples('triple_files/interaction.txt')

df_tag_keyword_relation = _read_triples('triple_files/tag_keyword_relation.txt')

df_wordnet_relation = _read_triples('triple_files/wordnet_relation.txt')

In [424]:
# Stack the five triple tables into one frame with a fresh 0..n-1 index.
triple_frames = [
    df_dbpedia_relation,
    df_dbpedia_sub_broader,
    df_interaction,
    df_tag_keyword_relation,
    df_wordnet_relation,
]
df_merged = pd.concat(triple_frames, ignore_index=True)

In [485]:
# De-duplicate the merged triples by collecting each row as an (s, p, o) tuple in a set.
container_set = {(subj, pred, obj) for subj, pred, obj in df_merged.values}

In [None]:
# Turn the unique triples back into a three-column frame; the set is
# materialised as a list first, which is what the constructor needs.
df_kg = pd.DataFrame(data=list(container_set), columns=['s', 'p', 'o'])

In [None]:
# Drop the single non-relevant record whose predicate is 'Name', then every
# row that has a missing value in any of s / p / o.
df_kg = df_kg.loc[df_kg['p'] != 'Name'].dropna()

In [486]:
# Write the cleaned knowledge-graph triples, one tab-separated line each;
# every value is converted with str() before writing.
with open('./final_kg_files/kg_triples.txt', 'w') as out:
    for row in zip(df_kg['s'], df_kg['p'], df_kg['o']):
        out.write('\t'.join(map(str, row)) + '\n')