In [None]:
import pickle
import pandas as pd
from collections import Counter
import numpy as np
import os
import stellargraph
from stellargraph import StellarGraph 
import multiprocessing
from sklearn.preprocessing import normalize

import warnings
# Silence every warning for the rest of the session (no category or module filter is given).
# NOTE(review): this also hides deprecation notices raised by the libraries used below —
# consider narrowing the filter.
warnings.filterwarnings("ignore")

In [None]:
# Load the two pickled DBLP inputs. Each file is opened in a `with` block so the
# handle is closed as soon as the object has been read.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
# Later cells index every record of both objects by position ([0], [1], [2]).
with open('raw/DBLP/transductive_dataset.pkl', 'rb') as dataset_file:
    dataset = pickle.load(dataset_file)
with open('raw/DBLP/documentID_venue.pkl', 'rb') as venue_file:
    docID_venue = pickle.load(venue_file)

In [None]:
def _empty_table(*column_names):
    """Return a zero-row DataFrame whose columns are ``column_names``, in order."""
    return pd.DataFrame(columns=list(column_names))


# Entity tables: one row per author / paper / term / conference.
author_label = _empty_table('author_id', 'label', 'author_name')
papers = _empty_table('paper_id', 'paper_title')
terms = _empty_table('term_id', 'term')
confs = _empty_table('conf_id', 'conf')

# Link tables: one row per (paper, other entity) pair.
paper_author = _empty_table('paper_id', 'author_id')
paper_conf = _empty_table('paper_id', 'conf_id')
paper_term = _empty_table('paper_id', 'term_id')

In [None]:
# Short venue names to recognise. The order matters: when several names occur in
# one venue string, the name that appears LAST in this list is the one kept.
publication_list = [
    'sigmod', 'vldb', 'icde', 'icdt', 'edbt', 'pods', 'kdd',
    'www', 'sdm', 'pkdd', 'icdm', 'cikm', 'aaai', 'icml',
    'ecml', 'colt', 'uai', 'soda', 'focs', 'stoc', 'stacs',
]

# Overwrite position 1 of every record with the matching short name (substring
# test on the lower-cased venue). Records without any match are left untouched.
for i, record in enumerate(docID_venue):
    venue_lower = record[1].lower()
    matching_names = [name for name in publication_list if name in venue_lower]
    if matching_names:
        docID_venue[i][1] = matching_names[-1]

In [None]:
# Build the entity and link tables from the raw records.
#
# Rows are collected in plain Python lists and every DataFrame is constructed once
# at the end. The previous row-by-row DataFrame.append copied the whole frame on
# each call (quadratic time), is no longer available in pandas >= 2.0, and made the
# cell accumulate duplicate rows when it was run twice.
authors_counter = Counter()
terms_counter = Counter()
paper_author_rows = []
paper_term_rows = []
paper_rows = []

for record in dataset:
    paper_id = record[0]
    # Column positions of the non-zero entries of the densified record[1] / record[2].
    skillIdx = record[1].todense().nonzero()[1]
    terms_counter.update(skillIdx)
    authorIdx = record[2].todense().nonzero()[1]
    authors_counter.update(authorIdx)

    paper_author_rows.extend(
        {'paper_id': paper_id, 'author_id': authorId} for authorId in authorIdx
    )
    paper_term_rows.extend(
        {'paper_id': paper_id, 'term_id': skillId} for skillId in skillIdx
    )
    paper_rows.append({'paper_id': paper_id, 'paper_title': 'na'})

# `columns=` is passed explicitly so an empty dataset still yields correctly shaped frames.
paper_author = pd.DataFrame(paper_author_rows, columns=['paper_id', 'author_id'])
paper_term = pd.DataFrame(paper_term_rows, columns=['paper_id', 'term_id'])
papers = pd.DataFrame(paper_rows, columns=['paper_id', 'paper_title'])

# One row per distinct author / term, in order of first appearance.
# Labels and names are placeholders (-1 / 'na').
unique_authors_idx = list(authors_counter.keys())
author_label = pd.DataFrame(
    [{'author_id': author_id, 'label': -1, 'author_name': 'na'} for author_id in unique_authors_idx],
    columns=['author_id', 'label', 'author_name'],
)

unique_terms_idx = list(terms_counter.keys())
terms = pd.DataFrame(
    [{'term_id': term_id, 'term': 'na'} for term_id in unique_terms_idx],
    columns=['term_id', 'term'],
)

# Conferences: each distinct venue value gets an integer id in order of first appearance.
conf_counter = Counter(venue_record[1] for venue_record in docID_venue)
venues = list(conf_counter.keys())
conf_confID = {venue: conf_id for conf_id, venue in enumerate(venues)}
confs = pd.DataFrame(
    [{'conf_id': conf_id, 'conf': venue} for venue, conf_id in conf_confID.items()],
    columns=['conf_id', 'conf'],
)

# Paper -> conference links, one per venue record.
paper_conf = pd.DataFrame(
    [
        {'paper_id': venue_record[0], 'conf_id': conf_confID[venue_record[1]]}
        for venue_record in docID_venue
    ],
    columns=['paper_id', 'conf_id'],
)

In [None]:
# Second pass over the dataset: rebuild the author / term occurrence counters and
# additionally tally how often each width (column count) of the densified
# record[2] occurs.
authors_counter, terms_counter, author_size_counter = Counter(), Counter(), Counter()

for record in dataset:
    paper_id = record[0]

    skillIdx = record[1].todense().nonzero()[1]
    terms_counter.update(skillIdx)

    # Densify once and reuse the result for both the indices and the width.
    dense_author_row = record[2].todense()
    authorIdx = dense_author_row.nonzero()[1]
    authors_counter.update(authorIdx)
    author_size_counter[dense_author_row.shape[1]] += 1

In [None]:
def _keep_rows(frame, column, allowed):
    """Return the rows of ``frame`` whose ``column`` value is in ``allowed``, re-indexed from 0."""
    return frame[frame[column].isin(allowed)].reset_index(drop=True)


def _sorted_by(frame, column):
    """Return ``frame`` ordered by ``column`` with a fresh 0..n-1 index."""
    return frame.sort_values(column).reset_index(drop=True)


# Authorship links are limited to authors listed in author_label; the papers that
# still have a link afterwards define the set of valid papers.
authors = author_label['author_id'].to_list()
paper_author = _keep_rows(paper_author, 'author_id', authors)
valid_papers = paper_author['paper_id'].unique()
print('Number of papers :', len(valid_papers))

papers = _keep_rows(papers, 'paper_id', valid_papers)
paper_conf = _keep_rows(paper_conf, 'paper_id', valid_papers)
# Same label as above, but the value is the number of paper-conference rows.
print('Number of papers :', len(paper_conf))

# Terms are kept only if a valid paper still refers to them.
paper_term = _keep_rows(paper_term, 'paper_id', valid_papers)
valid_terms = paper_term['term_id'].unique()
terms = _keep_rows(terms, 'term_id', valid_terms)

# Deterministic row order for every entity table.
author_label = _sorted_by(author_label, 'author_id')
papers = _sorted_by(papers, 'paper_id')
terms = _sorted_by(terms, 'term_id')
confs = _sorted_by(confs, 'conf_id')

for caption, table in (
    ('Number of conferences ', confs),
    ('Number of authors ', author_label),
    ('Number of terms ', terms),
    ('Number of papers ', papers),
):
    print(caption, len(table))

authors_list = list(author_label['author_id'])
papers_list = list(papers['paper_id'])
term_list = list(terms['term_id'])
conf_list = list(confs['conf_id'])
# Total number of graph nodes across the four entity types.
dim = sum(len(ids) for ids in (authors_list, papers_list, term_list, conf_list))
print(' Total entities :: ', dim)


# Give every entity one global serial id. The id ranges are laid out back to back:
# authors first, then papers, then terms, then conferences.
# Ids are derived from row POSITION (enumerate) rather than from index labels via
# iterrows, so they stay contiguous regardless of the frames' index.
paper_offset = len(author_label)
term_offset = paper_offset + len(papers)
conf_offset = term_offset + len(terms)

author_id_mapping = {
    author_id: pos for pos, author_id in enumerate(author_label['author_id'])
}
paper_id_mapping = {
    paper_id: pos + paper_offset for pos, paper_id in enumerate(papers['paper_id'])
}
term_id_mapping = {
    term_id: pos + term_offset for pos, term_id in enumerate(terms['term_id'])
}
conf_id_mapping = {
    conf_id: pos + conf_offset for pos, conf_id in enumerate(confs['conf_id'])
}

# Long-format lookup table: (domain, original entity id, serial id).
type_dict = {
    'author': author_id_mapping,
    'paper': paper_id_mapping,
    'term': term_id_mapping,
    'conf': conf_id_mapping,
}
domain_frames = []
for _type, _dict in type_dict.items():
    _df = pd.DataFrame(data={'entity_id': list(_dict.keys()), 'serial_id': list(_dict.values())})
    _df['domain'] = _type
    domain_frames.append(_df)

# pd.concat replaces the repeated DataFrame.append calls (removed in pandas 2.0);
# the column selection restores the original column order.
entity_id_map = pd.concat(domain_frames, ignore_index=True)[['domain', 'entity_id', 'serial_id']]

    
# ======================================================
# Save data
# ======================================================
data_save_path = 'processed_data_metapath2vec/'
# A single idempotent call replaces the two duplicated exists()/mkdir() checks on
# the same directory and cannot fail if the directory appears in between.
os.makedirs(data_save_path, exist_ok=True)
# to_csv is called with its defaults, so the DataFrame index is written as the
# first, unnamed CSV column.
entity_id_map.to_csv(os.path.join(data_save_path, 'entity_id_mapping.csv'))

# Create graph data: one single-column table of serial ids per node type.
nodes_author_df = pd.DataFrame(data={'author': list(author_id_mapping.values())})
nodes_paper_df = pd.DataFrame(data={'paper': list(paper_id_mapping.values())})
nodes_term_df = pd.DataFrame(data={'term': list(term_id_mapping.values())})
nodes_conf_df = pd.DataFrame(data={'conf': list(conf_id_mapping.values())})

for node_file, node_frame in (
    ('nodes_author.csv', nodes_author_df),
    ('nodes_paper.csv', nodes_paper_df),
    ('nodes_term.csv', nodes_term_df),
    ('nodes_conf.csv', nodes_conf_df),
):
    node_frame.to_csv(os.path.join(data_save_path, node_file), index=False)

def _export_edges(links, left_col, left_map, right_col, right_map, file_name):
    """Translate a link table to serial-id edges, write them to CSV and return them.

    Parameters
    ----------
    links : DataFrame holding one link per row.
    left_col, right_col : names of the columns with the two original entity ids.
    left_map, right_map : dicts from original entity id to serial id.
    file_name : name of the CSV file created inside ``data_save_path``.

    Returns
    -------
    list of (source_serial_id, target_serial_id) tuples, in row order.

    Raises
    ------
    KeyError if an id in ``links`` is missing from the corresponding mapping.
    """
    edges = [
        (left_map[left_id], right_map[right_id])
        for left_id, right_id in zip(links[left_col], links[right_col])
    ]
    # Building the frame from the list itself (not from np.array) keeps the two
    # columns even when there are no edges; np.array([]) is 1-D and made the
    # two-column DataFrame construction fail.
    edge_frame = pd.DataFrame(edges, columns=['source', 'target'])
    edge_frame.to_csv(os.path.join(data_save_path, file_name), index=False)
    return edges


# Paper -> author, paper -> term and paper -> conference edges (paper is always the source).
PA_edge_list = _export_edges(
    paper_author, 'paper_id', paper_id_mapping, 'author_id', author_id_mapping, 'PA_edges.csv'
)
PT_edge_list = _export_edges(
    paper_term, 'paper_id', paper_id_mapping, 'term_id', term_id_mapping, 'PT_edges.csv'
)
PC_edge_list = _export_edges(
    paper_conf, 'paper_id', paper_id_mapping, 'conf_id', conf_id_mapping, 'PC_edges.csv'
)

In [None]:
# Average number of edges per distinct endpoint, printed for each edge type.

# PA edges grouped by their first element (the paper serial id).
ap_counter = Counter(paper for paper, _ in PA_edge_list)
print(np.mean(list(ap_counter.values())))

# PT edges grouped by their first element (the paper serial id).
tp_counter = Counter(paper for paper, _ in PT_edge_list)
print(np.mean(list(tp_counter.values())))

# PC edges grouped by their second element (the conference serial id).
pc_counter = Counter(conf for _, conf in PC_edge_list)
print(np.mean(list(pc_counter.values())))

In [None]:
# ==============================
# Create data for HIN2Vec
# ==============================

# One row per edge: (node1, node2, rel) with rel = 0 for PA, 1 for PT, 2 for PC edges.
# All rows are gathered first and the frame is built once; appending one row at a
# time copied the frame on every call and relies on DataFrame.append, which is
# gone in pandas >= 2.0.
hin2vec_rows = [
    (edge[0], edge[1], rel)
    for rel, edge_list in enumerate((PA_edge_list, PT_edge_list, PC_edge_list))
    for edge in edge_list
]
df = pd.DataFrame(hin2vec_rows, columns=['node1', 'node2', 'rel'])

In [None]:
# Force integer dtype on all three columns before export.
for column_name in ('node1', 'node2', 'rel'):
    df[column_name] = df[column_name].astype(int)

# Comma-separated export without the index column.
fpath = os.path.join(data_save_path, 'hin2vec_dblp_input.txt')
df.to_csv(fpath, sep=',', index=None)

In [None]:
src_dir = './processed_data_metapath2vec/'


def _read_nodes(file_name):
    """Read a node CSV from ``src_dir``, using its first column as the index."""
    return pd.read_csv(os.path.join(src_dir, file_name), index_col=0)


# The node files hold a single column of serial ids, so after index_col=0 each
# frame carries the ids in its index and has no remaining data columns.
nodes_author_df = _read_nodes('nodes_author.csv')
nodes_paper_df = _read_nodes('nodes_paper.csv')
nodes_term_df = _read_nodes('nodes_term.csv')
nodes_conf_df = _read_nodes('nodes_conf.csv')

# Stack all edge files into one frame with a fresh 0..n-1 index.
# pd.concat replaces the DataFrame.append accumulation (removed in pandas 2.0).
fpath_list = ['PT_edges.csv', 'PC_edges.csv', 'PA_edges.csv']
df_edges = pd.concat(
    [pd.read_csv(os.path.join(src_dir, edge_file), index_col=None) for edge_file in fpath_list],
    ignore_index=True,
)

In [None]:
# Node tables keyed by node type; the key is the type name used in the metapaths.
node_frames_by_type = {
    "author": nodes_author_df,
    "paper": nodes_paper_df,
    "term": nodes_term_df,
    "conf": nodes_conf_df,
}

# Heterogeneous graph: typed nodes plus the combined edge table.
graph_obj = StellarGraph(node_frames_by_type, df_edges)

In [None]:
# Print whatever graph_obj.info() returns (presumably a textual summary of the
# graph's node and edge types — confirm against the StellarGraph version in use).
print(graph_obj.info())

In [None]:
# Directory model_use_data/DBLP. makedirs creates the parent and the leaf in one
# call and, with exist_ok=True, succeeds when they already exist — no separate
# exists() checks that could race with another process.
model_use_data_DIR = os.path.join('model_use_data', 'DBLP')
os.makedirs(model_use_data_DIR, exist_ok=True)

In [None]:
# Upper bound on the length of every random walk generated in this notebook.
walk_length = 64

# Metapath schemas, each given as a sequence of node types.
# Every schema starts and ends at an author node.
metapaths = [
    ['author', 'paper', 'author'],
    ['author', 'paper', 'term', 'paper', 'author'],
    ['author', 'paper', 'conf', 'paper', 'author'],
]

In [None]:
import time

from stellargraph.data import UniformRandomMetaPathWalk

# Walker that samples uniform random walks constrained to the metapath schemas.
rw = UniformRandomMetaPathWalk(graph_obj)

# Time the whole walk generation, including the listing of the root nodes.
start = time.time()
root_nodes = list(graph_obj.nodes())  # every node of the graph is used as a root
walks = rw.run(
    nodes=root_nodes,
    length=walk_length,   # maximum length of a random walk
    n=40,                 # number of random walks per root node
    metapaths=metapaths,
)
end = time.time()

elapsed_seconds = end - start
print("Time taken: ", elapsed_seconds)
print("Number of random walks: {}".format(len(walks)))

In [None]:
# Convert every node id to its string form, in place; the trained model is later
# queried with string keys (model.wv[str(i)]).
for walk in walks:
    for position, node in enumerate(walk):
        walk[position] = str(node)

In [None]:
import keras
import time
class TimeHistory(keras.callbacks.Callback):
    """Callback that records the duration of every epoch.

    After ``on_train_begin`` has run, ``self.times`` is a list to which one
    duration in seconds is appended at the end of each epoch.

    The ``logs`` parameters default to ``None`` instead of a shared mutable ``{}``;
    they are accepted for signature compatibility and never read. Durations are
    measured with ``time.perf_counter()``, a monotonic clock, so they cannot be
    distorted by system clock adjustments the way ``time.time()`` differences can.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh list of per-epoch durations."""
        self.times = []

    def on_epoch_begin(self, batch, logs=None):
        """Remember when the current epoch started; ``batch`` is unused."""
        self.epoch_time_start = time.perf_counter()

    def on_epoch_end(self, batch, logs=None):
        """Append the time elapsed since ``on_epoch_begin``; ``batch`` is unused."""
        self.times.append(time.perf_counter() - self.epoch_time_start)
        
from gensim.models.callbacks import CallbackAny2Vec
class callback(CallbackAny2Vec):
    """Training callback that prints the model's latest training loss after every epoch.

    NOTE(review): the printed value is whatever ``get_latest_training_loss()``
    returns; depending on the gensim version this may be a running total rather
    than a per-epoch loss — confirm before interpreting the numbers.
    """

    def __init__(self):
        # Number of epochs reported so far; also the label of the next report.
        self.epoch = 0

    def on_epoch_end(self, model):
        """Print the loss reported by ``model`` and advance the epoch counter."""
        report = 'Loss after epoch {}: {}'.format(self.epoch, model.get_latest_training_loss())
        print(report)
        self.epoch += 1

In [None]:
from gensim.models import Word2Vec
# Callback instance kept in a variable so its recorded epoch times can be read after training.
time_callback = TimeHistory()

# Word2Vec settings, spelled out one per line.
word2vec_settings = dict(
    size=100,
    window=5,
    min_count=0,
    sg=1,
    workers=multiprocessing.cpu_count(),  # one worker per available CPU
    negative=1,
    compute_loss=True,                    # needed for the loss printed by callback()
    iter=100,
    callbacks=[callback(), time_callback],
)
model = Word2Vec(walks, **word2vec_settings)

In [None]:
# Per-epoch durations collected by the TimeHistory callback during training.
# This is a reference to the callback's own list, not a copy.
times = time_callback.times

In [None]:
# Work on a copy: a plain assignment would alias the callback's list, so the
# in-place accumulation performed later would overwrite the raw per-epoch times.
cumulative_times = list(times)

In [None]:
# Running total of the per-epoch durations. A new list is built from `times`
# instead of updating cumulative_times in place, so re-running this cell gives the
# same result rather than accumulating the already accumulated values again.
elapsed_so_far = 0.0
cumulative_times = []
for epoch_seconds in times:
    elapsed_so_far += epoch_seconds
    cumulative_times.append(elapsed_so_far)

In [None]:
# (epoch number starting at 1, cumulative time) pairs. The length follows
# cumulative_times instead of a hard-coded 100, so a different epoch count neither
# raises IndexError nor silently drops entries.
time_per_epoch = list(enumerate(cumulative_times, start=1))

In [None]:
# Two unnamed columns (0 and 1): the epoch number and the matching entry of cumulative_times.
df_time_per_epoch = pd.DataFrame(time_per_epoch)

In [None]:
# Written relative to the current working directory; to_csv is called with its
# defaults, so the DataFrame index is included as the first column.
df_time_per_epoch.to_csv("metapath2vec_time_per_epoch.csv")

In [None]:
# Output directory model_save_dir/DBLP, created together with its parent if missing.
model_save_path = os.path.join('model_save_dir', 'DBLP')
os.makedirs(model_save_path, exist_ok=True)

# NOTE(review): the file name encodes 128, while the Word2Vec model in this
# notebook is built with size=100 — confirm which value is intended. The name is
# left unchanged because other code may load this exact file.
emb_fpath = os.path.join(model_save_path, 'mp2v_{}_{}_{}.npy'.format(128, 40, 100))

# ======== Save node weights ============ #
# Row i of the matrix is the vector the model stores under the key str(i), for
# i = 0 .. number_of_nodes - 1. The array is written once, after it is complete;
# the earlier save of a still-empty list was immediately overwritten and is dropped.
node_count = len(graph_obj.nodes())
node_embeddings = np.array([model.wv[str(i)] for i in range(node_count)])
np.save(emb_fpath, node_embeddings)

In [None]:
# Reload the saved id mapping. os.path.join is used, as for every other file read
# from src_dir, so the path stays valid even if src_dir has no trailing separator.
entity_id_mapping = pd.read_csv(os.path.join(src_dir, 'entity_id_mapping.csv'))

In [None]:
# Scale each row (axis=1) of the embedding matrix to unit L2 norm.
node_embeddings_normalized = normalize(node_embeddings, axis=1, norm='l2')

In [None]:
# Export the normalized vectors of authors and terms, keyed by original entity id.
# Graph domains are renamed on the way out: 'author' -> 'user', 'term' -> 'skill';
# rows of any other domain are skipped.
domain_to_key = {'author': 'user', 'term': 'skill'}
embedding_dict = {'user': {}, 'skill': {}}

for domain, entity_id, serial_id in zip(
    entity_id_mapping['domain'],
    entity_id_mapping['entity_id'],
    entity_id_mapping['serial_id'],
):
    target_key = domain_to_key.get(domain)
    if target_key is not None:
        # serial_id is the row of the entity in the embedding matrix.
        embedding_dict[target_key][entity_id] = node_embeddings_normalized[serial_id]

# The `with` block guarantees the file is flushed and closed once the dump finishes.
with open(os.path.join(model_save_path, 'embedding_dict.pkl'), 'wb') as embedding_file:
    pickle.dump(embedding_dict, embedding_file)